Merge pull request #862 from konbraphat51/dev

bact · web-flow · commit c981042eb264 · 2023-11-13T08:49:00.000Z
Add: remove_trailing_repeat_consonants()
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
@@ -21,19 +21,21 @@
     "abbreviation_to_full_text",
     "arabic_digit_to_thai_digit",
     "bahttext",
-    "convert_years",
     "collate",
-    "countthai",
+    "convert_years",
     "count_thai_chars",
+    "countthai",
     "dict_trie",
     "digit_to_text",
     "display_thai_char",
     "emoji_to_thai",
     "eng_to_thai",
     "find_keyword",
+    "ipa_to_rtgs",
     "is_native_thai",
     "isthai",
     "isthaichar",
+    "nectec_to_ipa",
     "normalize",
     "now_reign_year",
     "num_to_thaiword",
@@ -42,11 +44,18 @@
     "remove_dangling",
     "remove_dup_spaces",
     "remove_repeat_vowels",
+    "remove_tone_ipa",
     "remove_tonemark",
+    "remove_trailing_repeat_consonants",
     "remove_zw",
     "reorder_vowels",
     "rhyme",
+    "sound_syllable",
+    "spell_words",
+    "syllable_length",
+    "syllable_open_close_detector",
     "text_to_arabic_digit",
+    "text_to_num",
     "text_to_thai_digit",
     "thai_digit_to_arabic_digit",
     "thai_keyboard_dist",
@@ -58,17 +67,9 @@
     "thaiword_to_num",
     "thaiword_to_time",
     "time_to_thaiword",
-    "text_to_num",
+    "tis620_to_utf8",
     "tone_detector",
     "words_to_num",
-    "sound_syllable",
-    "syllable_length",
-    "syllable_open_close_detector",
-    "nectec_to_ipa",
-    "ipa_to_rtgs",
-    "remove_tone_ipa",
-    "tis620_to_utf8",
-    "spell_words",
 ]
 
 from pythainlp.util.collate import collate
@@ -103,6 +104,9 @@
     remove_zw,
     reorder_vowels,
 )
+from pythainlp.util.remove_trailing_repeat_consonants import (
+    remove_trailing_repeat_consonants,
+)
 from pythainlp.util.numtoword import bahttext, num_to_thaiword
 from pythainlp.util.strftime import thai_strftime
 from pythainlp.util.thai import (
diff --git a/pythainlp/util/remove_trailing_repeat_consonants.py b/pythainlp/util/remove_trailing_repeat_consonants.py
@@ -0,0 +1,256 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Removement of repeated consonants at the end of words
+"""
+from pythainlp.corpus import thai_words
+from pythainlp.util.trie import Trie
+from pythainlp import thai_consonants as consonants
+from typing import Tuple, List
+
+# used by remove_trailing_repeat_consonants()
+# contains all words that has repeating consonants at the end
+# for each consonant
+# when dictionary updated, this should be updated too
+# key: consonant
+# value: list of words that has repeating consonants at the end
+last_consonants_repeaters = {}
+
+
+def remove_trailing_repeat_consonants(
+    text: str, dictionary: Trie = None, has_dictionary_updated: bool = True
+) -> str:
+    """
+    Remove repeating consonants at the last of the sentence.
+
+    This function will remove the repeating consonants
+    before a whitespace, new line or at the last
+    so that the last word matches a word in the given dictionary.
+    If there is no match, the repeating consonants will be
+    reduced to one.
+    If there are several match, the longest word will be used.
+    Since this function uses a dictionary, the result may differs
+    depending on the dictionary used.
+    Plus, it is recommended to use normalize() to have a better result.
+
+    :param str text: input text
+    :param Trie dictionary: Trie dictionary to check the last word.
+    If None, pythainlp.corpus.thai_words() will be used
+    :param bool has_dictionary_updated: If the dictionary is updated 
+    or the first time using in the kernel, set this true.
+    If not, set this false to save time.
+    :return: text without repeating Thai consonants
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.util import remove_trailing_repeat_consonants
+        from pythainlp.util import dict_trie
+
+        # use default dictionary (pythainlp.corpus.thai_words())
+        remove_trailing_repeat_consonants('เริ่ดดดดดดดด')
+        # output: เริ่ด
+
+        remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม')
+        # output: อืมมม
+        # "อืมมม" is in the default dictionary
+
+        # use custom dictionary
+        custom_dictionary = dict_trie(["อืมมมมม"])
+        remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary)
+        # output: อืมมมมม
+
+        # long text
+        remove_trailing_repeat_consonants('อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '\
+        'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ')
+        # output: อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ
+        #         นี่เป็นความลับ
+    """
+    # use default dictionary if not given
+    if dictionary is None:
+        dictionary = thai_words()
+
+    # update repeaters dictionary if not updated
+    if has_dictionary_updated:
+        _update_consonant_repeaters(dictionary)
+
+    # seperate by newline
+    modified_lines = []
+    for line in text.split("\n"):
+        segments = line.split(" ")
+
+        for cnt, segment in enumerate(segments):
+            segments[cnt] = _remove_repeat_trailing_consonants_from_segment(
+                segment
+            )
+
+        # revert spaces
+        modified_line = " ".join(segments)
+        modified_lines.append(modified_line)
+
+    # revert newlines
+    modified_text = "\n".join(modified_lines)
+
+    return modified_text
+
+
+def _remove_repeat_trailing_consonants_from_segment(segment: str) -> str:
+    """
+    Remove repeating consonants at the last of the segment.
+
+    This function process only at the last of the given text.
+    Details is same as remove_repeat_consonants().
+
+    :param str segment: segment of text
+    :return: segment without repeating Thai consonants
+    :rtype: str
+    """
+    # skip if the segment is not the target
+    if not (
+        # the segment is long enough
+        (len(segment) > 1)
+        # last is Thai consonant
+        and (segment[-1] in consonants)
+        # has repiitition
+        and (segment[-1] == segment[-2])
+    ):
+        # no need to process
+        return segment
+
+    # duplicating character
+    dup = segment[-1]
+
+    # find the words that has 2 or more duplication of
+    # this character at the end.
+    repeaters = last_consonants_repeaters[dup]
+
+    # remove all of the last repeating character
+    segment_head = _remove_all_last_consonants(segment, dup)
+
+    # find the longest word that matches the segment
+    longest_word, repetition = _find_longest_consonant_repeaters_match(
+        segment_head, repeaters
+    )
+
+    if len(longest_word) > 0:
+        # if there is a match, use it
+        segment = segment_head + (dup * repetition)
+    else:
+        # if none found,
+        # the chance is that the correct is one character,
+        # or it's not in the dictionary.
+
+        # make the repition to once
+        segment = segment_head + (dup * 1)
+
+    return segment
+
+
+def _remove_all_last_consonants(text: str, dup: str) -> str:
+    """
+    Reduce repeating characters at the end of the text.
+
+    This function will remove the repeating characters at the last.
+    The text just before the repeating characters will be returned.
+
+    :param str text: input text
+    :param str dup: repeating character to be removed
+    :return: text without repeating characters at the end
+    :rtype: str
+    """
+    removed = text
+    while (len(removed) > 0) and (removed[-1] == dup):
+        removed = removed[:-1]
+
+    return removed
+
+
+def _update_consonant_repeaters(dictionary: Trie) -> None:
+    """
+    Update dictionary of all words that has
+    repeating consonants at the end from the dictionary.
+
+    Search all words in the dictionary that has more than 1 consonants
+    repeating at the end and store them in the global dictionary.
+
+    :param str consonant: consonant to be searched
+    :param Trie dictionary: Trie dictionary to search
+    :rtype: None
+    """
+    # initialize dictionary
+    for consonant in list(consonants):
+        last_consonants_repeaters[consonant] = []
+
+    # register
+    for word in dictionary:
+        if _is_last_consonant_repeater(word):
+            last_consonants_repeaters[word[-1]].append(word)
+
+    return
+
+
+def _is_last_consonant_repeater(word: str) -> bool:
+    """
+    Check if the word has repeating consonants at the end.
+
+    This function checks if the word has
+    more than 1 repeating consonants at the end.
+
+    :param str word: word to be checked
+    :return: True if the word has repeating consonants at the end.
+    :rtype: bool
+    """
+    return (
+        (len(word) > 1) and (word[-1] == word[-2]) and (word[-1] in consonants)
+    )
+
+
+def _find_longest_consonant_repeaters_match(
+    segment_head: str, repeaters: List[str]
+) -> Tuple[str, int]:
+    """
+    Find the longest word that matches the segment.
+
+    Find the longest word that matches the last
+    of the segment from the given repeaters list.
+    This returns the word and
+    how much the last character is repeated correctly.
+
+    :param str segment: segment of text
+    :param List[str] repeaters: list of words
+    that has repeating consonants at the end
+    :return: "tuple of the word" and
+    "how much the last character is repeated correctly"
+    If none, ("", 0) will be returned.
+    :rtype: Tuple[str, int]
+    """
+    longest_word = ""  # the longest word that matches the segment
+    repetition = 0  # how much the last character is repeated correctly
+    for repeater in repeaters:
+        # remove all of the last repeating character
+        repeater_head = _remove_all_last_consonants(repeater, repeater[-1])
+
+        # check match
+        if (
+            (len(segment_head) >= len(repeater_head))
+            and (segment_head[-len(repeater_head) :] == repeater_head)
+            # matched confirmed, check it's longer
+            and (len(repeater) > len(longest_word))
+        ):
+            longest_word = repeater
+            repetition = len(repeater) - len(repeater_head)
+
+    return longest_word, repetition
diff --git a/tests/test_util.py b/tests/test_util.py
@@ -60,6 +60,7 @@
     ipa_to_rtgs,
     remove_tone_ipa,
     tis620_to_utf8,
+    remove_trailing_repeat_consonants
 )
 from pythainlp.util.spell_words import spell_word
 
@@ -832,7 +833,8 @@ def test_convert_years(self):
         self.assertEqual(convert_years("242", src="re", target="ad"), "2023")
         self.assertEqual(convert_years("242", src="re", target="ah"), "1444")
         with self.assertRaises(NotImplementedError):
-            self.assertIsNotNone(convert_years("2023", src="cat", target="dog"))
+            self.assertIsNotNone(convert_years(
+                "2023", src="cat", target="dog"))
 
     def test_nectec_to_ipa(self):
         self.assertEqual(nectec_to_ipa("kl-uua-j^-2"), 'kl uua j ˥˩')
@@ -846,17 +848,44 @@ def test_remove_tone_ipa(self):
         self.assertEqual(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"), "laː.sa.maj")
 
     def test_tis620_to_utf8(self):
-        self.assertEqual(tis620_to_utf8("¡ÃÐ·ÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม")
+        self.assertEqual(tis620_to_utf8(
+            "¡ÃÐ·ÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม")
 
     def test_spell_word(self):
-        self.assertEqual(spell_word("เสือ"),['สอ', 'เอือ', 'เสือ'])
-        self.assertEqual(spell_word("เสื้อ"),['สอ', 'เอือ', 'ไม้โท', 'เสื้อ'])
-        self.assertEqual(spell_word("คน"),['คอ', 'นอ', 'คน'])
-        self.assertEqual(spell_word("คนดี"),['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี'])
+        self.assertEqual(spell_word("เสือ"), ['สอ', 'เอือ', 'เสือ'])
+        self.assertEqual(spell_word("เสื้อ"), ['สอ', 'เอือ', 'ไม้โท', 'เสื้อ'])
+        self.assertEqual(spell_word("คน"), ['คอ', 'นอ', 'คน'])
+        self.assertEqual(spell_word("คนดี"), [
+                         'คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี'])
 
     def test_rhyme(self):
         self.assertIsInstance(rhyme("แมว"), list)
         self.assertTrue(len(rhyme("แมว")) > 2)
 
+    def test_remove_repeat_consonants(self):
+        # update of pythainlp.copus.thai_words() able to break this
+        self.assertEqual(
+            remove_trailing_repeat_consonants('เริ่ดดดดดดดด'),
+            'เริ่ด'
+        )
+        self.assertEqual(
+            remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม'),
+            'อืมมม'
+        )
+
+        custom_dictionary = dict_trie(["อืมมมมม"])
+        self.assertEqual(
+            remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary),
+            'อืมมมมม'
+        )
+
+        self.assertEqual(
+            remove_trailing_repeat_consonants(
+                'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '
+                'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ'
+            ),
+            'อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ'
+        )
+
     # def test_abbreviation_to_full_text(self):
     #     self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list))