Skip to content

Commit c981042

Browse files
authored
Merge pull request #862 from konbraphat51/dev
Add: remove_trailing_repeat_consonants()
2 parents edb52b3 + 3315cb0 commit c981042

File tree

3 files changed

+306
-17
lines changed

3 files changed

+306
-17
lines changed

pythainlp/util/__init__.py

+15-11
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,21 @@
2121
"abbreviation_to_full_text",
2222
"arabic_digit_to_thai_digit",
2323
"bahttext",
24-
"convert_years",
2524
"collate",
26-
"countthai",
25+
"convert_years",
2726
"count_thai_chars",
27+
"countthai",
2828
"dict_trie",
2929
"digit_to_text",
3030
"display_thai_char",
3131
"emoji_to_thai",
3232
"eng_to_thai",
3333
"find_keyword",
34+
"ipa_to_rtgs",
3435
"is_native_thai",
3536
"isthai",
3637
"isthaichar",
38+
"nectec_to_ipa",
3739
"normalize",
3840
"now_reign_year",
3941
"num_to_thaiword",
@@ -42,11 +44,18 @@
4244
"remove_dangling",
4345
"remove_dup_spaces",
4446
"remove_repeat_vowels",
47+
"remove_tone_ipa",
4548
"remove_tonemark",
49+
"remove_trailing_repeat_consonants",
4650
"remove_zw",
4751
"reorder_vowels",
4852
"rhyme",
53+
"sound_syllable",
54+
"spell_words",
55+
"syllable_length",
56+
"syllable_open_close_detector",
4957
"text_to_arabic_digit",
58+
"text_to_num",
5059
"text_to_thai_digit",
5160
"thai_digit_to_arabic_digit",
5261
"thai_keyboard_dist",
@@ -58,17 +67,9 @@
5867
"thaiword_to_num",
5968
"thaiword_to_time",
6069
"time_to_thaiword",
61-
"text_to_num",
70+
"tis620_to_utf8",
6271
"tone_detector",
6372
"words_to_num",
64-
"sound_syllable",
65-
"syllable_length",
66-
"syllable_open_close_detector",
67-
"nectec_to_ipa",
68-
"ipa_to_rtgs",
69-
"remove_tone_ipa",
70-
"tis620_to_utf8",
71-
"spell_words",
7273
]
7374

7475
from pythainlp.util.collate import collate
@@ -103,6 +104,9 @@
103104
remove_zw,
104105
reorder_vowels,
105106
)
107+
from pythainlp.util.remove_trailing_repeat_consonants import (
108+
remove_trailing_repeat_consonants,
109+
)
106110
from pythainlp.util.numtoword import bahttext, num_to_thaiword
107111
from pythainlp.util.strftime import thai_strftime
108112
from pythainlp.util.thai import (
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (C) 2016-2023 PyThaiNLP Project
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""
16+
Removal of repeated consonants at the end of words
17+
"""
18+
from pythainlp.corpus import thai_words
19+
from pythainlp.util.trie import Trie
20+
from pythainlp import thai_consonants as consonants
21+
from typing import Tuple, List
22+
23+
# Cache used by remove_trailing_repeat_consonants().
# For each consonant, it holds every dictionary word that ends with
# that consonant repeated.
# It is filled by _update_consonant_repeaters(); when the dictionary
# is updated, this cache should be updated too.
# key: consonant
# value: list of words that have repeating consonants at the end
last_consonants_repeaters = {}
30+
31+
32+
def remove_trailing_repeat_consonants(
    text: str, dictionary: Trie = None, has_dictionary_updated: bool = True
) -> str:
    """
    Remove repeating consonants at the end of each segment of the text.

    The text is split on new lines and on single spaces. For every
    resulting segment, the repeating consonants at its end are trimmed
    so that the end of the segment matches a word in the given dictionary.
    If there is no match, the repeating consonants are reduced to one.
    If there are several matches, the longest word is used.
    Since this function uses a dictionary, the result may differ
    depending on the dictionary used.
    It is also recommended to run normalize() first for a better result.

    :param str text: input text
    :param Trie dictionary: Trie dictionary to check the last word.
        If None, pythainlp.corpus.thai_words() will be used
    :param bool has_dictionary_updated: Set this to True if the dictionary
        has changed since the previous call. Set it to False to reuse
        the cache built by the previous call and save time.
        If the cache has never been built, it is built regardless of
        this flag.
    :return: text without repeating Thai consonants
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_trailing_repeat_consonants
        from pythainlp.util import dict_trie

        # use default dictionary (pythainlp.corpus.thai_words())
        remove_trailing_repeat_consonants('เริ่ดดดดดดดด')
        # output: เริ่ด

        remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม')
        # output: อืมมม
        # "อืมมม" is in the default dictionary

        # use custom dictionary
        custom_dictionary = dict_trie(["อืมมมมม"])
        remove_trailing_repeat_consonants(
            'อืมมมมมมมมมมมมมมม', custom_dictionary
        )
        # output: อืมมมมม

        # long text
        remove_trailing_repeat_consonants(
            'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '
            'ฉันจะให้เกรดดีกับคุณณณ\\nนี่เป็นความลับบบบบ'
        )
        # output: อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ
        # นี่เป็นความลับ
    """
    # use default dictionary if not given
    if dictionary is None:
        dictionary = thai_words()

    # Rebuild the cache when the caller asks for it, and also when it has
    # never been built: with an empty cache the per-consonant lookup in
    # the segment helper would raise KeyError on a first call made with
    # has_dictionary_updated=False.
    if has_dictionary_updated or not last_consonants_repeaters:
        _update_consonant_repeaters(dictionary)

    # Process each space-separated segment of each line, then put the
    # spaces and the new lines back exactly where they were.
    modified_lines = []
    for line in text.split("\n"):
        segments = [
            _remove_repeat_trailing_consonants_from_segment(segment)
            for segment in line.split(" ")
        ]
        modified_lines.append(" ".join(segments))

    return "\n".join(modified_lines)
108+
109+
110+
def _remove_repeat_trailing_consonants_from_segment(segment: str) -> str:
    """
    Trim the run of repeating consonants at the end of one segment.

    Only the end of the given segment is processed.
    The matching rules are the same as in
    remove_trailing_repeat_consonants().

    :param str segment: segment of text
    :return: segment without repeating Thai consonants at its end
    :rtype: str
    """
    # Nothing to do unless the segment ends with the same Thai consonant
    # written at least twice.
    if len(segment) <= 1:
        return segment
    tail_char = segment[-1]
    if tail_char not in consonants or tail_char != segment[-2]:
        return segment

    # dictionary words that end with this consonant repeated
    candidates = last_consonants_repeaters[tail_char]

    # the segment with the whole trailing run stripped off
    stem = _remove_all_last_consonants(segment, tail_char)

    matched_word, keep = _find_longest_consonant_repeaters_match(
        stem, candidates
    )

    # Without a dictionary match, the likely correct spelling has the
    # consonant once (or the word is simply not in the dictionary).
    if not matched_word:
        keep = 1

    return stem + tail_char * keep
160+
161+
162+
def _remove_all_last_consonants(text: str, dup: str) -> str:
163+
"""
164+
Reduce repeating characters at the end of the text.
165+
166+
This function will remove the repeating characters at the last.
167+
The text just before the repeating characters will be returned.
168+
169+
:param str text: input text
170+
:param str dup: repeating character to be removed
171+
:return: text without repeating characters at the end
172+
:rtype: str
173+
"""
174+
removed = text
175+
while (len(removed) > 0) and (removed[-1] == dup):
176+
removed = removed[:-1]
177+
178+
return removed
179+
180+
181+
def _update_consonant_repeaters(dictionary: Trie) -> None:
    """
    Rebuild the module-level cache of words ending in a repeated consonant.

    Every consonant gets a fresh, empty list; then each word of the
    dictionary that ends with the same consonant written two or more
    times is appended to the list of that consonant.

    :param Trie dictionary: Trie dictionary to search
    :rtype: None
    """
    # reset: one empty bucket per consonant
    last_consonants_repeaters.update({char: [] for char in consonants})

    # register every qualifying word under its final consonant
    for entry in dictionary:
        if _is_last_consonant_repeater(entry):
            last_consonants_repeaters[entry[-1]].append(entry)
203+
204+
205+
def _is_last_consonant_repeater(word: str) -> bool:
    """
    Tell whether the word ends with a repeated consonant.

    The word qualifies when its last two characters are the same
    Thai consonant.

    :param str word: word to be checked
    :return: True if the word has repeating consonants at the end.
    :rtype: bool
    """
    # a repetition needs at least two characters
    if len(word) < 2:
        return False

    final = word[-1]
    return final == word[-2] and final in consonants
219+
220+
221+
def _find_longest_consonant_repeaters_match(
    segment_head: str, repeaters: List[str]
) -> Tuple[str, int]:
    """
    Pick the longest dictionary word that fits the end of the segment.

    Each word of ``repeaters`` is stripped of its trailing repeated
    character; the word fits when what remains is found at the end of
    ``segment_head``. Among the fitting words, the longest one wins
    (the first one in list order when lengths are equal).

    :param str segment_head: segment of text, without its trailing
        repeated consonants
    :param List[str] repeaters: list of words
        that have repeating consonants at the end
    :return: tuple of "the matched word" and
        "how many times its last character is repeated".
        If none, ("", 0) will be returned.
    :rtype: Tuple[str, int]
    """
    best_word = ""  # longest fitting word seen so far
    best_count = 0  # length of the trailing run in best_word
    head_length = len(segment_head)

    for candidate in repeaters:
        # the candidate without its trailing repeated character
        stem = _remove_all_last_consonants(candidate, candidate[-1])

        # the stem cannot be a suffix of something shorter
        if len(stem) > head_length:
            continue
        # NOTE(review): when stem is empty, [-0:] is the whole string,
        # so such a candidate fits only an empty segment_head --
        # confirm this is intended.
        if segment_head[-len(stem) :] != stem:
            continue
        # it fits; keep it only if it is strictly longer
        if len(candidate) <= len(best_word):
            continue

        best_word = candidate
        best_count = len(candidate) - len(stem)

    return best_word, best_count

tests/test_util.py

+35-6
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
ipa_to_rtgs,
6161
remove_tone_ipa,
6262
tis620_to_utf8,
63+
remove_trailing_repeat_consonants
6364
)
6465
from pythainlp.util.spell_words import spell_word
6566

@@ -832,7 +833,8 @@ def test_convert_years(self):
832833
self.assertEqual(convert_years("242", src="re", target="ad"), "2023")
833834
self.assertEqual(convert_years("242", src="re", target="ah"), "1444")
834835
with self.assertRaises(NotImplementedError):
835-
self.assertIsNotNone(convert_years("2023", src="cat", target="dog"))
836+
self.assertIsNotNone(convert_years(
837+
"2023", src="cat", target="dog"))
836838

837839
def test_nectec_to_ipa(self):
838840
self.assertEqual(nectec_to_ipa("kl-uua-j^-2"), 'kl uua j ˥˩')
@@ -846,17 +848,44 @@ def test_remove_tone_ipa(self):
846848
self.assertEqual(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"), "laː.sa.maj")
847849

848850
def test_tis620_to_utf8(self):
849-
self.assertEqual(tis620_to_utf8("¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม")
851+
self.assertEqual(tis620_to_utf8(
852+
"¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม")
850853

851854
def test_spell_word(self):
852-
self.assertEqual(spell_word("เสือ"),['สอ', 'เอือ', 'เสือ'])
853-
self.assertEqual(spell_word("เสื้อ"),['สอ', 'เอือ', 'ไม้โท', 'เสื้อ'])
854-
self.assertEqual(spell_word("คน"),['คอ', 'นอ', 'คน'])
855-
self.assertEqual(spell_word("คนดี"),['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี'])
855+
self.assertEqual(spell_word("เสือ"), ['สอ', 'เอือ', 'เสือ'])
856+
self.assertEqual(spell_word("เสื้อ"), ['สอ', 'เอือ', 'ไม้โท', 'เสื้อ'])
857+
self.assertEqual(spell_word("คน"), ['คอ', 'นอ', 'คน'])
858+
self.assertEqual(spell_word("คนดี"), [
859+
'คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี'])
856860

857861
def test_rhyme(self):
858862
self.assertIsInstance(rhyme("แมว"), list)
859863
self.assertTrue(len(rhyme("แมว")) > 2)
860864

865+
def test_remove_repeat_consonants(self):
    # An update of pythainlp.corpus.thai_words() is able to break this:
    # the expectations without a custom dictionary rely on the default one.
    custom_dictionary = dict_trie(["อืมมมมม"])

    # (positional arguments, expected output)
    cases = [
        (('เริ่ดดดดดดดด',), 'เริ่ด'),
        (('อืมมมมมมมมมมมมมมม',), 'อืมมม'),
        (('อืมมมมมมมมมมมมมมม', custom_dictionary), 'อืมมมมม'),
        (
            (
                'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '
                'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ',
            ),
            'อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ',
        ),
    ]

    for args, expected in cases:
        self.assertEqual(
            remove_trailing_repeat_consonants(*args),
            expected
        )
889+
861890
# def test_abbreviation_to_full_text(self):
862891
# self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list))

0 commit comments

Comments
 (0)