Add testc_tools (misspell) #977

Merged: 5 commits, Nov 3, 2024
3 changes: 3 additions & 0 deletions .github/workflows/pypi-publish.yml
@@ -1,3 +1,6 @@
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: CC0-1.0

name: Upload package to PyPI

on:
3 changes: 3 additions & 0 deletions .github/workflows/pypi-test.yml
@@ -1,3 +1,6 @@
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: CC0-1.0

name: PyPI Unit test

on:
7 changes: 5 additions & 2 deletions .github/workflows/unittest.yml
@@ -1,3 +1,6 @@
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: CC0-1.0

name: Unit test

on:
@@ -100,9 +103,9 @@ jobs:
# Test cases loaded is defined in __init__.py in the tests directory.
# See also tests/README.md
- name: Coverage report
if: matrix.python-version == env.PYTHON_VERSION_LATEST
if: matrix.os == 'ubuntu-latest' && matrix.python-version == env.PYTHON_VERSION_LATEST
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COVERALLS_SERVICE_NAME: github
run: coveralls
# Only submit a report from the latest Python version
# Only submit a report from the latest Python version on ubuntu-latest.
9 changes: 5 additions & 4 deletions pythainlp/util/strftime.py
@@ -36,17 +36,18 @@ def _std_strftime(dt_obj: datetime, fmt_char: str) -> str:
try:
str_ = dt_obj.strftime(f"%{fmt_char}")
if not str_ or str_ == "%{}".format(fmt_char):
# normalize outputs for unsupported directives
# in different platforms
# "%Q" may result "%Q", "Q", or "", make it "Q"
# Normalize outputs for unsupported directives
# in different platforms:
# "%Q" may result "", "%Q", or "Q", make it all "Q"
str_ = fmt_char
except ValueError as err:
# Unsupported directives may raise ValueError on Windows,
# in that case just use the fmt_char
warnings.warn(
(
f"String format directive unknown/not support: %{fmt_char}\n"
f"The system raises this ValueError: {err}"
f"The system raises this ValueError: {err}\n"
f"Continue working without the directive."
),
UserWarning,
)
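
The comment rewrite above documents why _std_strftime normalizes per-directive output. A minimal standalone sketch of the same cross-platform fallback; the helper name _demo_strftime_char is hypothetical, and the real code lives in pythainlp/util/strftime.py:

import warnings
from datetime import datetime

def _demo_strftime_char(dt_obj: datetime, fmt_char: str) -> str:
    # Sketch: format a single directive, normalizing unsupported ones.
    try:
        str_ = dt_obj.strftime(f"%{fmt_char}")
        if not str_ or str_ == f"%{fmt_char}":
            # Platforms disagree: an unsupported "%Q" may come back
            # as "", "%Q", or "Q". Normalize all three to "Q".
            str_ = fmt_char
    except ValueError as err:
        # Windows may raise ValueError instead; warn and fall back
        # to the bare directive character, as the diff describes.
        warnings.warn(
            f"Unknown format directive: %{fmt_char} ({err})",
            UserWarning,
        )
        str_ = fmt_char
    return str_

print(_demo_strftime_char(datetime(2024, 11, 3), "Q"))  # "Q" on any platform
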
3 changes: 2 additions & 1 deletion tests/compact/__init__.py
@@ -11,8 +11,9 @@

# Names of module to be tested
test_packages: list[str] = [
"tests.compact.testc_tag",
"tests.compact.testc_parse",
"tests.compact.testc_tokenize",
"tests.compact.testc_tools",
"tests.compact.testc_util",
]

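
test_packages is a plain list of module names kept in sorted order; this PR slots tests.compact.testc_tools in and reflects the testc_tag to testc_parse rename. A sketch of how such a list can drive test loading, assuming the suite uses unittest's load_tests protocol (the actual loader code in tests/compact/__init__.py is not shown here):

import unittest

test_packages: list[str] = [
    "tests.compact.testc_parse",
    "tests.compact.testc_tokenize",
    "tests.compact.testc_tools",
    "tests.compact.testc_util",
]

def load_tests(loader, standard_tests, pattern):
    # Collect every module named in test_packages into one suite.
    suite = unittest.TestSuite()
    for name in test_packages:
        suite.addTests(loader.loadTestsFromName(name))
    return suite
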
tests/compact/{testc_tag.py → testc_parse.py}
@@ -7,7 +7,7 @@
from pythainlp.tag import chunk_parse, pos_tag


class TagTestCase(unittest.TestCase):
class ChunkParseTestCase(unittest.TestCase):
def test_chunk_parse(self):
tokens = ["ผม", "รัก", "คุณ"]

15 changes: 10 additions & 5 deletions tests/compact/testc_tokenize.py
@@ -2,6 +2,8 @@
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

# Tests for tokenize functions that need "compact" dependencies

import unittest

from pythainlp.tokenize import (
@@ -23,8 +25,7 @@
)


# Tests for functions that need "compact" dependencies
class TokenizeTestCaseCompact(unittest.TestCase):
class WordTokenizeICUTestCase(unittest.TestCase):
def test_icu(self):
self.assertEqual(pyicu.segment(None), [])
self.assertEqual(pyicu.segment(""), [])
@@ -33,6 +34,11 @@ def test_icu(self):
["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
)

def test_word_tokenize_icu(self):
self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))


class SentTokenizeCRFCutTestCase(unittest.TestCase):
def test_sent_tokenize(self):
# Use default engine (crfcut)
self.assertEqual(sent_tokenize(None), [])
@@ -67,6 +73,8 @@ def test_sent_tokenize(self):
[["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]],
)


class SubwordTokenizeHanSoloTestCase(unittest.TestCase):
def test_subword_tokenize(self):
self.assertEqual(subword_tokenize(None, engine="han_solo"), [])
self.assertEqual(
Expand All @@ -80,6 +88,3 @@ def test_subword_tokenize(self):
self.assertNotIn(
"า", subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
)

def test_word_tokenize_icu(self):
self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
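
Splitting TokenizeTestCaseCompact into per-engine classes (WordTokenizeICUTestCase, SentTokenizeCRFCutTestCase, SubwordTokenizeHanSoloTestCase) also lets one engine's tests run in isolation. A usage sketch, not part of the PR:

import unittest

# Run only the ICU word-tokenizer tests from the reorganized module.
suite = unittest.defaultTestLoader.loadTestsFromName(
    "tests.compact.testc_tokenize.WordTokenizeICUTestCase"
)
unittest.TextTestRunner(verbosity=2).run(suite)
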
tests/compact/testc_tools.py
@@ -9,7 +9,7 @@
from pythainlp.tools.misspell import misspell


def _count_difference(st1, st2):
def _count_difference(st1: str, st2: str) -> int:
# this assumes len(st1) == len(st2)

count = 0
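
The hunk above only adds type hints to _count_difference; its body is truncated here after count = 0. A plausible completion, offered as an assumption since the rest of the function is not shown: the name and the equal-length note suggest a Hamming-style count of differing positions.

def _count_difference(st1: str, st2: str) -> int:
    # Assumed completion: count positions where the two strings
    # disagree, taking len(st1) == len(st2) for granted as the
    # original comment does.
    count = 0
    for ch1, ch2 in zip(st1, st2):
        if ch1 != ch2:
            count += 1
    return count

assert _count_difference("misspell", "mispsell") == 2  # "sp" swapped
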
2 changes: 1 addition & 1 deletion tests/compact/testc_util.py
@@ -11,7 +11,7 @@
from pythainlp.util.spell_words import spell_word


class UtilTestCaseX(unittest.TestCase):
class SpellWordTestCase(unittest.TestCase):
def test_spell_word(self):
self.assertEqual(spell_word("เสือ"), ["สอ", "เอือ", "เสือ"])
self.assertEqual(spell_word("เสื้อ"), ["สอ", "เอือ", "ไม้โท", "เสื้อ"])
79 changes: 48 additions & 31 deletions tests/core/test_tag.py
@@ -14,13 +14,13 @@
unigram,
)

TEST_TOKENS = ["ผม", "รัก", "คุณ"]


class TagTestCase(unittest.TestCase):
# ### pythainlp.tag.pos_tag
"""Test pythainlp.tag.pos_tag"""

def test_pos_tag(self):
tokens = ["ผม", "รัก", "คุณ"]

self.assertEqual(pos_tag(None), [])
self.assertEqual(pos_tag([]), [])
self.assertEqual(
@@ -40,25 +40,31 @@ def test_pos_tag(self):
self.assertEqual(unigram.tag(None, corpus="tud"), [])
self.assertEqual(unigram.tag([], corpus="tud"), [])
self.assertIsNotNone(
pos_tag(tokens, engine="unigram", corpus="orchid")
pos_tag(TEST_TOKENS, engine="unigram", corpus="orchid")
)
self.assertIsNotNone(
pos_tag(tokens, engine="unigram", corpus="orchid_ud")
pos_tag(TEST_TOKENS, engine="unigram", corpus="orchid_ud")
)
self.assertIsNotNone(
pos_tag(TEST_TOKENS, engine="unigram", corpus="pud")
)
self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud"))
self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
self.assertIsNotNone(
pos_tag(tokens, engine="unigram", corpus="blackboard")
pos_tag(TEST_TOKENS, engine="unigram", corpus="blackboard")
)
self.assertIsNotNone(
pos_tag([""], engine="unigram", corpus="blackboard")
)
self.assertIsNotNone(
pos_tag([""], engine="unigram", corpus="blackboard_ud")
)
self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tdtb"))
self.assertIsNotNone(
pos_tag(TEST_TOKENS, engine="unigram", corpus="tdtb")
)
self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tdtb"))
self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tud"))
self.assertIsNotNone(
pos_tag(TEST_TOKENS, engine="unigram", corpus="tud")
)
self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tud"))
self.assertEqual(
pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
@@ -72,6 +78,25 @@ def test_pos_tag(self):
pos_tag(["ความ", "พอเพียง"], corpus="orchid_ud")[0][1], "NOUN"
)

self.assertEqual(pos_tag_sents(None), [])
self.assertEqual(pos_tag_sents([]), [])
self.assertEqual(
pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
[
[("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
[("แมว", "NCMN"), ("วิ่ง", "VACT")],
],
)


class PerceptronTaggerTestCase(unittest.TestCase):
"""Test pythainlp.tag.PerceptronTagger

:param unittest: _description_
:type unittest: _type_
"""

def test_perceptron_tagger(self):
self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
self.assertEqual(perceptron.tag([], corpus="orchid"), [])
self.assertEqual(perceptron.tag(None, corpus="orchid_ud"), [])
@@ -82,44 +107,34 @@ def test_pos_tag(self):
self.assertEqual(perceptron.tag([], corpus="blackboard"), [])
self.assertEqual(perceptron.tag(None, corpus="tud"), [])
self.assertEqual(perceptron.tag([], corpus="tud"), [])

self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="orchid")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="orchid")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="orchid_ud")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="pud")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="pud")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="blackboard")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="blackboard")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="blackboard_ud")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="blackboard_ud")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="tdtb")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="tdtb")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="tdtb")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="tdtb")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="tud")
)

self.assertEqual(pos_tag_sents(None), [])
self.assertEqual(pos_tag_sents([]), [])
self.assertEqual(
pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
[
[("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
[("แมว", "NCMN"), ("วิ่ง", "VACT")],
],
pos_tag(TEST_TOKENS, engine="perceptron", corpus="tud")
)

# ### pythainlp.tag.PerceptronTagger

def test_perceptron_tagger(self):
def test_perceptron_tagger_custom(self):
"""Test pythainlp.tag.PerceptronTagger"""
tagger = PerceptronTagger()
# train data, with "กิน" > 20 instances to trigger conditions
# in _make_tagdict()
@@ -182,7 +197,9 @@ def test_perceptron_tagger(self):
with self.assertRaises(IOError):
tagger.load("ptagger_notexistX4AcOcX.pkl") # file does not exist

# ### pythainlp.tag.locations

class TagLocationsTestCase(unittest.TestCase):
"""Test pythainlp.tag.locations"""

def test_ner_locations(self):
self.assertEqual(
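
The custom-tagger test now lives in its own test_perceptron_tagger_custom method. Based only on the calls visible in this diff (the no-argument constructor, and load(), which raises IOError for a missing file), a minimal usage sketch:

from pythainlp.tag import PerceptronTagger

tagger = PerceptronTagger()
try:
    tagger.load("ptagger_notexistX4AcOcX.pkl")  # file does not exist
except IOError:
    print("load() raises IOError when the model file is missing")
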
4 changes: 4 additions & 0 deletions tests/core/test_tokenize.py
@@ -551,6 +551,10 @@ def test_tcc_p(self):
self.assertEqual(list(tcc_p.tcc("")), [])
self.assertEqual(tcc_p.tcc_pos(""), set())


class DetokenizeTestCase(unittest.TestCase):
"""Detokenize and regrouping test cases"""

def test_word_detokenize(self):
self.assertIsInstance(word_detokenize(["ผม", "5"]), str)
self.assertEqual(
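
The new DetokenizeTestCase groups detokenization and regrouping checks. From the assertions above, word_detokenize joins a token list back into one string; a usage sketch, assuming the function is importable from pythainlp.tokenize as the tests imply:

from pythainlp.tokenize import word_detokenize

result = word_detokenize(["ผม", "5"])
print(type(result))  # <class 'str'>, matching the assertIsInstance check
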
2 changes: 1 addition & 1 deletion tests/extra/testx_spell.py
@@ -12,7 +12,7 @@
symspellpy,
)

from .test_spell import SENT_TOKS
from ..core.test_spell import SENT_TOKS


class SpellTestCaseX(unittest.TestCase):
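
The one-line fix retargets the import: SENT_TOKS comes from tests/core/test_spell.py, not from a sibling module in tests/extra. A sketch of the layout the new relative import implies, with directory names taken from the paths in this PR:

# tests/
#   core/
#     test_spell.py    # defines SENT_TOKS
#   extra/
#     testx_spell.py   # needs SENT_TOKS
#
# Inside tests/extra/testx_spell.py, two leading dots climb from
# tests.extra up to tests, then descend into core:
from ..core.test_spell import SENT_TOKS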