Skip to content

Commit 524759a

Browse files
committed
pythainlp.util.is_native_thai moved to pythainlp.morpheme.is_native_thai
1 parent 3d324e3 commit 524759a

File tree

6 files changed

+173
-131
lines changed

6 files changed

+173
-131
lines changed

docs/api/morpheme.rst

+6-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,9 @@ pythainlp.morpheme
55

66
The `pythainlp.morpheme` module collects functions for morpheme analysis, word formation, and more for the Thai language.
77

8-
.. autofunction:: nighit
8+
.. autofunction:: nighit
9+
10+
.. autofunction:: is_native_thai
11+
:noindex:
12+
13+
The `is_native_thai` function checks whether a single word is likely to be a "native Thai word" (คำไทยแท้) rather than a loanword. It is based on a simple heuristic and is not entirely reliable.

docs/api/util.rst

-5
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,6 @@ Modules
7777

7878
The `ipa_to_rtgs` function focuses on converting International Phonetic Alphabet (IPA) transcriptions into Royal Thai General System of Transcription (RTGS) format. This is valuable for phonetic analysis and pronunciation guides.
7979

80-
.. autofunction:: is_native_thai
81-
:noindex:
82-
83-
The `is_native_thai` function is a language detection tool that identifies whether text is predominantly in the Thai language or not. It aids in language identification and text categorization tasks.
84-
8580
.. autofunction:: isthai
8681
:noindex:
8782

pythainlp/morpheme/__init__.py

+5
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,9 @@
55
"""
66
PyThaiNLP morpheme
77
"""
8+
__all__ = [
9+
"nighit",
10+
"is_native_thai"
11+
]
812
from pythainlp.morpheme.word_formation import nighit
13+
from pythainlp.morpheme.thaiwordcheck import is_native_thai

pythainlp/morpheme/thaiwordcheck.py

+129
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Check if a word is a "native Thai word"

Adapted from
https://github.com/wannaphong/open-thai-nlp-document/blob/master/check_thai_word.md

References
- True Plookpanya team (ทีมงานทรูปลูกปัญญา) 2015. Characteristics of native Thai words (ลักษณะของคำไทยแท้) http://www.trueplookpanya.com/learning/detail/30589-043067
- Warunee Bamrungros (วารุณี บำรุงรส) 2010. Native Thai words (คำไทยแท้) https://www.gotoknow.org/posts/377619
"""
import re

_THANTHAKHAT_CHAR = "\u0e4c"  # Thanthakhat (cancellation of sound)

# Characters whose presence marks a word as non-native
# (unless the word is listed in _TH_NATIVE_WORDS below)
_TH_NON_NATIVE_CHARS = {
    "ฆ",
    "ณ",
    "ฌ",
    "ฎ",
    "ฏ",
    "ฐ",
    "ฑ",
    "ฒ",
    "ธ",
    "ศ",
    "ษ",
    "ฬ",
    _THANTHAKHAT_CHAR,
}

# Native Thai final consonants
_TH_NATIVE_FINALS = {"ก", "ด", "บ", "น", "ง", "ม", "ย", "ว"}

# Known native Thai words (exceptions that bypass every other rule)
_TH_NATIVE_WORDS = {
    "ฆ่า",
    "เฆี่ยน",
    "ศึก",
    "ศอก",
    "เศิก",
    "เศร้า",
    "ธ",
    "ณ",
    "ฯพณฯ",
    "ใหญ่",
    "หญ้า",
    "ควาย",
    "ความ",
    "กริ่งเกรง",
    "ผลิ",
}

# Diphthong prefixes (can start native Thai word)
_TH_PREFIX_DIPHTHONG = {"กะ", "กระ", "ปะ", "ประ"}

# Thai consonant filter
# O ANG (U+0E2D) is omitted, as it can be considered as vowel
_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]")


def is_native_thai(word: str) -> bool:
    """
    Check if a word is a "native Thai word" (Thai: "คำไทยแท้")
    This function is based on a simple heuristic algorithm
    and cannot be entirely reliable.

    The rules are applied in this order; the first one that matches
    decides the result:

    1. not a string, or empty/whitespace only -> False
    2. listed in the known native words -> True
    3. contains any non-native character -> False
    4. contains no Thai consonant -> False
    5. contains exactly one Thai consonant -> True
    6. last character is a native final consonant -> True
    7. the whole word equals one of the known prefixes -> True
    8. otherwise -> False

    :param str word: word
    :return: True or False
    :rtype: bool

    :Example:

    English word::

        from pythainlp.morpheme import is_native_thai

        is_native_thai("Avocado")
        # output: False

    Native Thai word::

        is_native_thai("มะม่วง")
        # output: True
        is_native_thai("ตะวัน")
        # output: True

    Non-native Thai word::

        is_native_thai("สามารถ")
        # output: False
        is_native_thai("อิสริยาภรณ์")
        # output: False
    """
    if not isinstance(word, str):
        return False

    word = word.strip()
    if not word:
        return False

    # Known native Thai words (exceptions)
    if word in _TH_NATIVE_WORDS:
        return True

    # If a word contains any non-native Thai character,
    # it is not a native Thai word
    if not _TH_NON_NATIVE_CHARS.isdisjoint(word):
        return False

    consonants = _TH_CONSONANTS_PATTERN.findall(word)

    # If it does not contain any Thai consonants -> it cannot be Thai
    if not consonants:
        return False

    # If there's only one Thai consonant -> it can be a native Thai
    if len(consonants) == 1:
        return True

    # If a word ends with native final, it can be a native Thai
    if word[-1] in _TH_NATIVE_FINALS:
        return True

    # Note: This only matches when the whole word IS one of the prefixes;
    # it does not detect words that merely start with one.
    # Prefix-sensitive tokenization is required in order to be able to
    # check that.
    return word in _TH_PREFIX_DIPHTHONG

pythainlp/util/thaiwordcheck.py

+11-124
Original file line numberDiff line numberDiff line change
@@ -1,129 +1,16 @@
11
# -*- coding: utf-8 -*-
22
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
33
# SPDX-License-Identifier: Apache-2.0
4-
"""
5-
Check if a word is a "native Thai word"
6-
7-
Adapted from
8-
https://github.com/wannaphong/open-thai-nlp-document/blob/master/check_thai_word.md
9-
10-
References
11-
- ทีมงานทรูปลูกปัญญา 2015. ลักษณะของคำไทยแท้ http://www.trueplookpanya.com/learning/detail/30589-043067
12-
- วารุณี บำรุงรส 2010. คำไทยแท้ https://www.gotoknow.org/posts/377619
13-
"""
14-
import re
15-
16-
_THANTHAKHAT_CHAR = "\u0e4c" # Thanthakhat (cancellation of sound)
17-
18-
# Non-native Thai characters
19-
_TH_NON_NATIVE_CHARS = {
20-
"ฆ",
21-
"ณ",
22-
"ฌ",
23-
"ฎ",
24-
"ฏ",
25-
"ฐ",
26-
"ฑ",
27-
"ฒ",
28-
"ธ",
29-
"ศ",
30-
"ษ",
31-
"ฬ",
32-
_THANTHAKHAT_CHAR,
33-
}
34-
35-
# Native Thai final consonants
36-
_TH_NATIVE_FINALS = {"ก", "ด", "บ", "น", "ง", "ม", "ย", "ว"}
37-
38-
# Known native Thai words (exceptions)
39-
_TH_NATIVE_WORDS = {
40-
"ฆ่า",
41-
"เฆี่ยน",
42-
"ศึก",
43-
"ศอก",
44-
"เศิก",
45-
"เศร้า",
46-
"ธ",
47-
"ณ",
48-
"ฯพณฯ",
49-
"ใหญ่",
50-
"หญ้า",
51-
"ควาย",
52-
"ความ",
53-
"กริ่งเกรง",
54-
"ผลิ",
55-
}
56-
57-
# Diphthong prefixes (can start native Thai word)
58-
_TH_PREFIX_DIPHTHONG = {"กะ", "กระ", "ปะ", "ประ"}
59-
60-
# Thai consonant filter
61-
# O ANG (U+0E2D) is omitted, as it can be considered as vowel
62-
_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]", re.U)
63-
4+
import warnings
645

656
def is_native_thai(word: str) -> bool:
    """
    Deprecated alias of :func:`pythainlp.morpheme.is_native_thai`.

    Emits a :class:`DeprecationWarning` and then delegates to the
    function in ``pythainlp.morpheme``.

    :param str word: word
    :return: whatever ``pythainlp.morpheme.is_native_thai(word)`` returns
    :rtype: bool
    """
    # A single-line message: the former triple-quoted literal embedded
    # newlines and indentation in the warning text.
    # stacklevel=2 attributes the warning to the caller of this shim,
    # which is the code that actually needs to be updated.
    warnings.warn(
        "pythainlp.util.is_native_thai has been renamed to "
        "pythainlp.morpheme.is_native_thai. "
        "This function will be removed in PyThaiNLP 5.1.",
        DeprecationWarning,
        stacklevel=2,
    )

    # NOTE(review): imported at call time rather than at module top,
    # presumably to avoid an import cycle -- confirm before hoisting.
    from pythainlp.morpheme import is_native_thai as check

    return check(word)

tests/test_morpheme.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
import unittest
6-
from pythainlp.morpheme import nighit
6+
from pythainlp.morpheme import nighit, is_native_thai
77

88

99
class TestMorphemePackage(unittest.TestCase):
@@ -14,3 +14,24 @@ def test_nighit(self):
1414
self.assertEqual(nighit("สํ", "นิษฐาน"), "สันนิษฐาน")
1515
self.assertEqual(nighit("สํ", "ปทา"), "สัมปทา")
1616
self.assertEqual(nighit("สํ", "โยค"), "สังโยค")
17+
18+
def test_is_native_thai(self):
    """Exercise each rule of the is_native_thai heuristic once."""
    # Non-string, empty, and non-Thai input
    self.assertEqual(is_native_thai(None), False)
    self.assertEqual(is_native_thai(""), False)
    self.assertEqual(is_native_thai("116"), False)
    self.assertEqual(is_native_thai("abc"), False)

    # Words with a single Thai consonant
    self.assertEqual(is_native_thai("ตา"), True)
    self.assertEqual(is_native_thai("ยา"), True)
    self.assertEqual(is_native_thai("มอ"), True)

    # Listed exception that contains a non-native character
    self.assertEqual(is_native_thai("ฆ่า"), True)

    # Words ending with a native final consonant
    self.assertEqual(is_native_thai("คน"), True)
    self.assertEqual(is_native_thai("ประท้วง"), True)

    # Words that are exactly one of the known prefixes
    # (the duplicated "กะ" assertion has been removed)
    self.assertEqual(is_native_thai("กะ"), True)
    self.assertEqual(is_native_thai("กระ"), True)

    # Words with non-native characters or a non-native final
    self.assertEqual(is_native_thai("ศา"), False)
    self.assertEqual(is_native_thai("ลักษ์"), False)
    self.assertEqual(is_native_thai("มาร์ค"), False)
    self.assertEqual(is_native_thai("เลข"), False)
    self.assertEqual(is_native_thai("เทเวศน์"), False)
    self.assertEqual(is_native_thai("เทเวศร์"), False)

0 commit comments

Comments
 (0)