diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 2b2ff40e4..55302507b 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -21,19 +21,21 @@ "abbreviation_to_full_text", "arabic_digit_to_thai_digit", "bahttext", - "convert_years", "collate", - "countthai", + "convert_years", "count_thai_chars", + "countthai", "dict_trie", "digit_to_text", "display_thai_char", "emoji_to_thai", "eng_to_thai", "find_keyword", + "ipa_to_rtgs", "is_native_thai", "isthai", "isthaichar", + "nectec_to_ipa", "normalize", "now_reign_year", "num_to_thaiword", @@ -42,11 +44,18 @@ "remove_dangling", "remove_dup_spaces", "remove_repeat_vowels", + "remove_tone_ipa", "remove_tonemark", + "remove_trailing_repeat_consonants", "remove_zw", "reorder_vowels", "rhyme", + "sound_syllable", + "spell_words", + "syllable_length", + "syllable_open_close_detector", "text_to_arabic_digit", + "text_to_num", "text_to_thai_digit", "thai_digit_to_arabic_digit", "thai_keyboard_dist", @@ -58,17 +67,9 @@ "thaiword_to_num", "thaiword_to_time", "time_to_thaiword", - "text_to_num", + "tis620_to_utf8", "tone_detector", "words_to_num", - "sound_syllable", - "syllable_length", - "syllable_open_close_detector", - "nectec_to_ipa", - "ipa_to_rtgs", - "remove_tone_ipa", - "tis620_to_utf8", - "spell_words", ] from pythainlp.util.collate import collate @@ -103,6 +104,9 @@ remove_zw, reorder_vowels, ) +from pythainlp.util.remove_trailing_repeat_consonants import ( + remove_trailing_repeat_consonants, +) from pythainlp.util.numtoword import bahttext, num_to_thaiword from pythainlp.util.strftime import thai_strftime from pythainlp.util.thai import ( diff --git a/pythainlp/util/remove_trailing_repeat_consonants.py b/pythainlp/util/remove_trailing_repeat_consonants.py new file mode 100644 index 000000000..7aae7e519 --- /dev/null +++ b/pythainlp/util/remove_trailing_repeat_consonants.py @@ -0,0 +1,256 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Removement of repeated consonants at the end of words +""" +from pythainlp.corpus import thai_words +from pythainlp.util.trie import Trie +from pythainlp import thai_consonants as consonants +from typing import Tuple, List + +# used by remove_trailing_repeat_consonants() +# contains all words that has repeating consonants at the end +# for each consonant +# when dictionary updated, this should be updated too +# key: consonant +# value: list of words that has repeating consonants at the end +last_consonants_repeaters = {} + + +def remove_trailing_repeat_consonants( + text: str, dictionary: Trie = None, has_dictionary_updated: bool = True +) -> str: + """ + Remove repeating consonants at the last of the sentence. + + This function will remove the repeating consonants + before a whitespace, new line or at the last + so that the last word matches a word in the given dictionary. + If there is no match, the repeating consonants will be + reduced to one. + If there are several match, the longest word will be used. + Since this function uses a dictionary, the result may differs + depending on the dictionary used. + Plus, it is recommended to use normalize() to have a better result. + + :param str text: input text + :param Trie dictionary: Trie dictionary to check the last word. + If None, pythainlp.corpus.thai_words() will be used + :param bool has_dictionary_updated: If the dictionary is updated + or the first time using in the kernel, set this true. + If not, set this false to save time. + :return: text without repeating Thai consonants + :rtype: str + + :Example: + :: + + from pythainlp.util import remove_trailing_repeat_consonants + from pythainlp.util import dict_trie + + # use default dictionary (pythainlp.corpus.thai_words()) + remove_trailing_repeat_consonants('เริ่ดดดดดดดด') + # output: เริ่ด + + remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม') + # output: อืมมม + # "อืมมม" is in the default dictionary + + # use custom dictionary + custom_dictionary = dict_trie(["อืมมมมม"]) + remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary) + # output: อืมมมมม + + # long text + remove_trailing_repeat_consonants('อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '\ + 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ') + # output: อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ + # นี่เป็นความลับ + """ + # use default dictionary if not given + if dictionary is None: + dictionary = thai_words() + + # update repeaters dictionary if not updated + if has_dictionary_updated: + _update_consonant_repeaters(dictionary) + + # seperate by newline + modified_lines = [] + for line in text.split("\n"): + segments = line.split(" ") + + for cnt, segment in enumerate(segments): + segments[cnt] = _remove_repeat_trailing_consonants_from_segment( + segment + ) + + # revert spaces + modified_line = " ".join(segments) + modified_lines.append(modified_line) + + # revert newlines + modified_text = "\n".join(modified_lines) + + return modified_text + + +def _remove_repeat_trailing_consonants_from_segment(segment: str) -> str: + """ + Remove repeating consonants at the last of the segment. + + This function process only at the last of the given text. + Details is same as remove_repeat_consonants(). + + :param str segment: segment of text + :return: segment without repeating Thai consonants + :rtype: str + """ + # skip if the segment is not the target + if not ( + # the segment is long enough + (len(segment) > 1) + # last is Thai consonant + and (segment[-1] in consonants) + # has repiitition + and (segment[-1] == segment[-2]) + ): + # no need to process + return segment + + # duplicating character + dup = segment[-1] + + # find the words that has 2 or more duplication of + # this character at the end. + repeaters = last_consonants_repeaters[dup] + + # remove all of the last repeating character + segment_head = _remove_all_last_consonants(segment, dup) + + # find the longest word that matches the segment + longest_word, repetition = _find_longest_consonant_repeaters_match( + segment_head, repeaters + ) + + if len(longest_word) > 0: + # if there is a match, use it + segment = segment_head + (dup * repetition) + else: + # if none found, + # the chance is that the correct is one character, + # or it's not in the dictionary. + + # make the repition to once + segment = segment_head + (dup * 1) + + return segment + + +def _remove_all_last_consonants(text: str, dup: str) -> str: + """ + Reduce repeating characters at the end of the text. + + This function will remove the repeating characters at the last. + The text just before the repeating characters will be returned. + + :param str text: input text + :param str dup: repeating character to be removed + :return: text without repeating characters at the end + :rtype: str + """ + removed = text + while (len(removed) > 0) and (removed[-1] == dup): + removed = removed[:-1] + + return removed + + +def _update_consonant_repeaters(dictionary: Trie) -> None: + """ + Update dictionary of all words that has + repeating consonants at the end from the dictionary. + + Search all words in the dictionary that has more than 1 consonants + repeating at the end and store them in the global dictionary. + + :param str consonant: consonant to be searched + :param Trie dictionary: Trie dictionary to search + :rtype: None + """ + # initialize dictionary + for consonant in list(consonants): + last_consonants_repeaters[consonant] = [] + + # register + for word in dictionary: + if _is_last_consonant_repeater(word): + last_consonants_repeaters[word[-1]].append(word) + + return + + +def _is_last_consonant_repeater(word: str) -> bool: + """ + Check if the word has repeating consonants at the end. + + This function checks if the word has + more than 1 repeating consonants at the end. + + :param str word: word to be checked + :return: True if the word has repeating consonants at the end. + :rtype: bool + """ + return ( + (len(word) > 1) and (word[-1] == word[-2]) and (word[-1] in consonants) + ) + + +def _find_longest_consonant_repeaters_match( + segment_head: str, repeaters: List[str] +) -> Tuple[str, int]: + """ + Find the longest word that matches the segment. + + Find the longest word that matches the last + of the segment from the given repeaters list. + This returns the word and + how much the last character is repeated correctly. + + :param str segment: segment of text + :param List[str] repeaters: list of words + that has repeating consonants at the end + :return: "tuple of the word" and + "how much the last character is repeated correctly" + If none, ("", 0) will be returned. + :rtype: Tuple[str, int] + """ + longest_word = "" # the longest word that matches the segment + repetition = 0 # how much the last character is repeated correctly + for repeater in repeaters: + # remove all of the last repeating character + repeater_head = _remove_all_last_consonants(repeater, repeater[-1]) + + # check match + if ( + (len(segment_head) >= len(repeater_head)) + and (segment_head[-len(repeater_head) :] == repeater_head) + # matched confirmed, check it's longer + and (len(repeater) > len(longest_word)) + ): + longest_word = repeater + repetition = len(repeater) - len(repeater_head) + + return longest_word, repetition diff --git a/tests/test_util.py b/tests/test_util.py index 1840e2dc0..e45319c99 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -60,6 +60,7 @@ ipa_to_rtgs, remove_tone_ipa, tis620_to_utf8, + remove_trailing_repeat_consonants ) from pythainlp.util.spell_words import spell_word @@ -832,7 +833,8 @@ def test_convert_years(self): self.assertEqual(convert_years("242", src="re", target="ad"), "2023") self.assertEqual(convert_years("242", src="re", target="ah"), "1444") with self.assertRaises(NotImplementedError): - self.assertIsNotNone(convert_years("2023", src="cat", target="dog")) + self.assertIsNotNone(convert_years( + "2023", src="cat", target="dog")) def test_nectec_to_ipa(self): self.assertEqual(nectec_to_ipa("kl-uua-j^-2"), 'kl uua j ˥˩') @@ -846,17 +848,44 @@ def test_remove_tone_ipa(self): self.assertEqual(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"), "laː.sa.maj") def test_tis620_to_utf8(self): - self.assertEqual(tis620_to_utf8("¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม") + self.assertEqual(tis620_to_utf8( + "¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม") def test_spell_word(self): - self.assertEqual(spell_word("เสือ"),['สอ', 'เอือ', 'เสือ']) - self.assertEqual(spell_word("เสื้อ"),['สอ', 'เอือ', 'ไม้โท', 'เสื้อ']) - self.assertEqual(spell_word("คน"),['คอ', 'นอ', 'คน']) - self.assertEqual(spell_word("คนดี"),['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']) + self.assertEqual(spell_word("เสือ"), ['สอ', 'เอือ', 'เสือ']) + self.assertEqual(spell_word("เสื้อ"), ['สอ', 'เอือ', 'ไม้โท', 'เสื้อ']) + self.assertEqual(spell_word("คน"), ['คอ', 'นอ', 'คน']) + self.assertEqual(spell_word("คนดี"), [ + 'คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']) def test_rhyme(self): self.assertIsInstance(rhyme("แมว"), list) self.assertTrue(len(rhyme("แมว")) > 2) + def test_remove_repeat_consonants(self): + # update of pythainlp.copus.thai_words() able to break this + self.assertEqual( + remove_trailing_repeat_consonants('เริ่ดดดดดดดด'), + 'เริ่ด' + ) + self.assertEqual( + remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม'), + 'อืมมม' + ) + + custom_dictionary = dict_trie(["อืมมมมม"]) + self.assertEqual( + remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary), + 'อืมมมมม' + ) + + self.assertEqual( + remove_trailing_repeat_consonants( + 'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด ' + 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ' + ), + 'อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ' + ) + # def test_abbreviation_to_full_text(self): # self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list))