|
1 | 1 | # -*- coding: utf-8 -*-
|
2 | 2 | # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
3 | 3 | # SPDX-License-Identifier: Apache-2.0
|
4 |
| -""" |
5 |
| -Check if a word is a "native Thai word" |
6 |
| -
|
7 |
| -Adapted from |
8 |
| -https://github.com/wannaphong/open-thai-nlp-document/blob/master/check_thai_word.md |
9 |
| -
|
10 |
| -References |
11 |
| -- ทีมงานทรูปลูกปัญญา 2015. ลักษณะของคำไทยแท้ http://www.trueplookpanya.com/learning/detail/30589-043067 |
12 |
| -- วารุณี บำรุงรส 2010. คำไทยแท้ https://www.gotoknow.org/posts/377619 |
13 |
| -""" |
14 |
| -import re |
15 |
| - |
16 |
| -_THANTHAKHAT_CHAR = "\u0e4c" # Thanthakhat (cancellation of sound) |
17 |
| - |
18 |
| -# Non-native Thai characters |
19 |
| -_TH_NON_NATIVE_CHARS = { |
20 |
| - "ฆ", |
21 |
| - "ณ", |
22 |
| - "ฌ", |
23 |
| - "ฎ", |
24 |
| - "ฏ", |
25 |
| - "ฐ", |
26 |
| - "ฑ", |
27 |
| - "ฒ", |
28 |
| - "ธ", |
29 |
| - "ศ", |
30 |
| - "ษ", |
31 |
| - "ฬ", |
32 |
| - _THANTHAKHAT_CHAR, |
33 |
| -} |
34 |
| - |
35 |
| -# Native Thai final consonants |
36 |
| -_TH_NATIVE_FINALS = {"ก", "ด", "บ", "น", "ง", "ม", "ย", "ว"} |
37 |
| - |
38 |
| -# Known native Thai words (exceptions) |
39 |
| -_TH_NATIVE_WORDS = { |
40 |
| - "ฆ่า", |
41 |
| - "เฆี่ยน", |
42 |
| - "ศึก", |
43 |
| - "ศอก", |
44 |
| - "เศิก", |
45 |
| - "เศร้า", |
46 |
| - "ธ", |
47 |
| - "ณ", |
48 |
| - "ฯพณฯ", |
49 |
| - "ใหญ่", |
50 |
| - "หญ้า", |
51 |
| - "ควาย", |
52 |
| - "ความ", |
53 |
| - "กริ่งเกรง", |
54 |
| - "ผลิ", |
55 |
| -} |
56 |
| - |
57 |
| -# Diphthong prefixes (can start native Thai word) |
58 |
| -_TH_PREFIX_DIPHTHONG = {"กะ", "กระ", "ปะ", "ประ"} |
59 |
| - |
60 |
| -# Thai consonant filter |
61 |
| -# O ANG (U+0E2D) is omitted, as it can be considered as vowel |
62 |
| -_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]", re.U) |
63 |
| - |
| 4 | +import warnings |
64 | 5 |
|
def is_native_thai(word: str) -> bool:
    """
    Check if a word is a "native Thai word" (Thai: "คำไทยแท้").

    Deprecated alias kept for backward compatibility: the function has been
    renamed to ``pythainlp.morpheme.is_native_thai``. Every call emits a
    :class:`DeprecationWarning` and then delegates to the new function.
    This alias is scheduled for removal in PyThaiNLP 5.1.

    :param str word: word to check
    :return: the result of ``pythainlp.morpheme.is_native_thai(word)``
    :rtype: bool

    :Example:
    ::

        from pythainlp.morpheme import is_native_thai  # preferred import

        is_native_thai("มะม่วง")
    """
    # A plain single-line message: a triple-quoted literal would leak the
    # source indentation and newlines into the text shown to the user.
    # stacklevel=2 attributes the warning to the caller of this alias, so
    # the reported file/line is the code that needs to be migrated.
    warnings.warn(
        "pythainlp.util.is_native_thai is renamed to "
        "pythainlp.morpheme.is_native_thai. "
        "This function will be removed in PyThaiNLP 5.1.",
        DeprecationWarning,
        stacklevel=2,
    )

    # Imported at call time, as in the original shim.
    # NOTE(review): presumably to avoid a circular import between
    # pythainlp.util and pythainlp.morpheme -- confirm before hoisting.
    from pythainlp.morpheme import is_native_thai as check

    return check(word)
0 commit comments