From 6ea41811f7b9837dfbb727b0bf39b319de8ff29e Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 00:09:55 +0900 Subject: [PATCH 01/36] documentation --- pythainlp/util/normalize.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index a8cacae22..bffe6d278 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -23,8 +23,10 @@ from pythainlp import thai_follow_vowels as follow_v from pythainlp import thai_lead_vowels as lead_v from pythainlp import thai_tonemarks as tonemarks +from pythainlp import thai_consonants as consonants from pythainlp.tokenize import word_tokenize - +from pythainlp.corpus import thai_words +from pythainlp.util.trie import Trie _DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e" _RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+") @@ -218,6 +220,27 @@ def remove_repeat_vowels(text: str) -> str: return text +def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: + """ + Remove repeating consonants at the last of the sentence. + + This function will remove the repeating consonants + before a whitespace or new line until the last word matches + a word in the given dictionary. + If there is no match, the repeating consonants will be + reduced to one. + Since this function uses a dictionary, the result may differs + depending on the dictionary used. + Plus, it is recommended to use normalize() to have a better result. + + :param str text: input text + :param Trie dictionary: Trie dictionary to check the last word. + If None, pythainlp.corpus.thai_words() will be used + :return: text without repeating Thai consonants + :rtype: str + """ + + def normalize(text: str) -> str: """ Normalize and clean Thai text with normalizing rules as follows: From be29c00551da590560b752e3c865523d6ab1ffac Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 01:22:57 +0900 Subject: [PATCH 02/36] Add: implemation tested --- pythainlp/util/normalize.py | 99 ++++++++++++++++++++++++++++++++++++- 1 file changed, 97 insertions(+), 2 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index bffe6d278..2beac8a38 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -26,6 +26,7 @@ from pythainlp import thai_consonants as consonants from pythainlp.tokenize import word_tokenize from pythainlp.corpus import thai_words +from pythainlp.util import isthaichar from pythainlp.util.trie import Trie _DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e" @@ -225,10 +226,11 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: Remove repeating consonants at the last of the sentence. This function will remove the repeating consonants - before a whitespace or new line until the last word matches - a word in the given dictionary. + before a whitespace, new line or at the last + so that the last word matches a word in the given dictionary. If there is no match, the repeating consonants will be reduced to one. + If there are several match, the longest word will be used. Since this function uses a dictionary, the result may differs depending on the dictionary used. Plus, it is recommended to use normalize() to have a better result. @@ -238,7 +240,100 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: If None, pythainlp.corpus.thai_words() will be used :return: text without repeating Thai consonants :rtype: str + + :Example: + :: + + from pythainlp.util import remove_repeat_consonants + from pythainlp.util import dict_trie + + # use default dictionary (pythainlp.corpus.thai_words()) + remove_repeat_consonants('เริ่ดดดดดดดด') + # output: เริ่ด + + remove_repeat_consonants('อืมมมมมมมมมมมมมมม') + # output: อืมมม + # "อืมมม" is in the default dictionary + + # use custom dictionary + custom_dictionary = dict_trie(["อืมมมมม"]) + remove_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary) + # output: อืมมมมม + + # long text + remove_repeat_consonants('อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '\ + 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ') + # output: อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ """ + # use default dictionary if not given + if dictionary is None: + dictionary = thai_words() + + # seperate by newline + modified_lines = [] + for line in text.split("\n"): + segments = line.split(" ") + + for segment in segments: + # skip if the segment is not the target + if (not + ((len(segment) > 1) # the segment is long enough + and (isthaichar(segment[-1])) # the last is Thai + and (segment[-1] == segment[-2]))): # has repiitition + + # skip + continue + + # duplicating character + dup = segment[-1] + + # find the words that has 2 or more duplication of + # this character at the end. + # TODO: This maybe slow if the dictionary is large. + # If the dictionary not changed, this could be done + # only once in the kernel. + # But it will requires a global variable. + repeaters = [] + for word in dictionary: + if (len(word) > 1) and (word[-1] == word[-2] == dup): + repeaters.append(word) + + # remove all of the last repeating character + segment_head = segment + while ((len(segment) > 0) and (segment[-1] == dup)): + segment = segment[:-1] + + # find the longest word that matches the segment + longest_word = "" + repetition = 0 + for repeater in repeaters: + # remove all of the last repeating character + repeater_head = repeater + while ((len(repeater) > 0) and (repeater[-1] == dup)): + repeater = repeater[:-1] + + # check match + if ((len(segment) >= len(repeater)) + and (segment[-len(repeater):] == repeater)): + # matched + if len(repeater) > len(longest_word): + longest_word = repeater + + if len(longest_word) > 0: + # if there is a match, use it + segment = segment_head + (dup * repetition) + else: + # if none found, make the repition to once + segment = segment_head + (dup * 1) + + # revert spaces + modified_line = " ".join(segments) + modified_lines.append(modified_line) + + # revert newlines + modified_text = "\n".join(modified_lines) + + return modified_text def normalize(text: str) -> str: From 3e94234cfaa9ea876a8e7eaa804cd3bcaae7867d Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 01:32:37 +0900 Subject: [PATCH 03/36] Add: test code --- tests/test_util.py | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/tests/test_util.py b/tests/test_util.py index 1840e2dc0..b3165e25a 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -60,6 +60,7 @@ ipa_to_rtgs, remove_tone_ipa, tis620_to_utf8, + remove_repeat_consonants ) from pythainlp.util.spell_words import spell_word @@ -832,7 +833,8 @@ def test_convert_years(self): self.assertEqual(convert_years("242", src="re", target="ad"), "2023") self.assertEqual(convert_years("242", src="re", target="ah"), "1444") with self.assertRaises(NotImplementedError): - self.assertIsNotNone(convert_years("2023", src="cat", target="dog")) + self.assertIsNotNone(convert_years( + "2023", src="cat", target="dog")) def test_nectec_to_ipa(self): self.assertEqual(nectec_to_ipa("kl-uua-j^-2"), 'kl uua j ˥˩') @@ -846,17 +848,44 @@ def test_remove_tone_ipa(self): self.assertEqual(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"), "laː.sa.maj") def test_tis620_to_utf8(self): - self.assertEqual(tis620_to_utf8("¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม") + self.assertEqual(tis620_to_utf8( + "¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม") def test_spell_word(self): - self.assertEqual(spell_word("เสือ"),['สอ', 'เอือ', 'เสือ']) - self.assertEqual(spell_word("เสื้อ"),['สอ', 'เอือ', 'ไม้โท', 'เสื้อ']) - self.assertEqual(spell_word("คน"),['คอ', 'นอ', 'คน']) - self.assertEqual(spell_word("คนดี"),['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']) + self.assertEqual(spell_word("เสือ"), ['สอ', 'เอือ', 'เสือ']) + self.assertEqual(spell_word("เสื้อ"), ['สอ', 'เอือ', 'ไม้โท', 'เสื้อ']) + self.assertEqual(spell_word("คน"), ['คอ', 'นอ', 'คน']) + self.assertEqual(spell_word("คนดี"), [ + 'คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']) def test_rhyme(self): self.assertIsInstance(rhyme("แมว"), list) self.assertTrue(len(rhyme("แมว")) > 2) + def test_remove_repeat_consonants(self): + # update of pythainlp.copus.thai_words() able to break this + self.assertEqual( + remove_repeat_consonants('เริ่ดดดดดดดด'), + 'เริ่ด' + ) + self.assertEqual( + remove_repeat_consonants('อืมมมมมมมมมมมมมมม'), + 'อืมมม' + ) + + custom_dictionary = dict_trie(["อืมมมมม"]) + self.assertEqual( + remove_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary), + 'อืมมมมม' + ) + + self.assertEqual( + remove_repeat_consonants( + 'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด ' + 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ' + ), + 'ออืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ' + ) + # def test_abbreviation_to_full_text(self): # self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list)) From ca6cd941d9e9edfc43dc6fcaf5ac9678d9cc9d03 Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Fri, 10 Nov 2023 01:35:21 +0900 Subject: [PATCH 04/36] Add: remove_repeat_consonants() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit function to remove consonants เริ่ดดดดดดดด -> เริ่ด implementation + test code written. Test passed --- pythainlp/util/normalize.py | 120 +++++++++++++++++++++++++++++++++++- tests/test_util.py | 41 ++++++++++-- 2 files changed, 154 insertions(+), 7 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index a8cacae22..2beac8a38 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -23,8 +23,11 @@ from pythainlp import thai_follow_vowels as follow_v from pythainlp import thai_lead_vowels as lead_v from pythainlp import thai_tonemarks as tonemarks +from pythainlp import thai_consonants as consonants from pythainlp.tokenize import word_tokenize - +from pythainlp.corpus import thai_words +from pythainlp.util import isthaichar +from pythainlp.util.trie import Trie _DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e" _RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+") @@ -218,6 +221,121 @@ def remove_repeat_vowels(text: str) -> str: return text +def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: + """ + Remove repeating consonants at the last of the sentence. + + This function will remove the repeating consonants + before a whitespace, new line or at the last + so that the last word matches a word in the given dictionary. + If there is no match, the repeating consonants will be + reduced to one. + If there are several match, the longest word will be used. + Since this function uses a dictionary, the result may differs + depending on the dictionary used. + Plus, it is recommended to use normalize() to have a better result. + + :param str text: input text + :param Trie dictionary: Trie dictionary to check the last word. + If None, pythainlp.corpus.thai_words() will be used + :return: text without repeating Thai consonants + :rtype: str + + :Example: + :: + + from pythainlp.util import remove_repeat_consonants + from pythainlp.util import dict_trie + + # use default dictionary (pythainlp.corpus.thai_words()) + remove_repeat_consonants('เริ่ดดดดดดดด') + # output: เริ่ด + + remove_repeat_consonants('อืมมมมมมมมมมมมมมม') + # output: อืมมม + # "อืมมม" is in the default dictionary + + # use custom dictionary + custom_dictionary = dict_trie(["อืมมมมม"]) + remove_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary) + # output: อืมมมมม + + # long text + remove_repeat_consonants('อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '\ + 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ') + # output: อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ + """ + # use default dictionary if not given + if dictionary is None: + dictionary = thai_words() + + # seperate by newline + modified_lines = [] + for line in text.split("\n"): + segments = line.split(" ") + + for segment in segments: + # skip if the segment is not the target + if (not + ((len(segment) > 1) # the segment is long enough + and (isthaichar(segment[-1])) # the last is Thai + and (segment[-1] == segment[-2]))): # has repiitition + + # skip + continue + + # duplicating character + dup = segment[-1] + + # find the words that has 2 or more duplication of + # this character at the end. + # TODO: This maybe slow if the dictionary is large. + # If the dictionary not changed, this could be done + # only once in the kernel. + # But it will requires a global variable. + repeaters = [] + for word in dictionary: + if (len(word) > 1) and (word[-1] == word[-2] == dup): + repeaters.append(word) + + # remove all of the last repeating character + segment_head = segment + while ((len(segment) > 0) and (segment[-1] == dup)): + segment = segment[:-1] + + # find the longest word that matches the segment + longest_word = "" + repetition = 0 + for repeater in repeaters: + # remove all of the last repeating character + repeater_head = repeater + while ((len(repeater) > 0) and (repeater[-1] == dup)): + repeater = repeater[:-1] + + # check match + if ((len(segment) >= len(repeater)) + and (segment[-len(repeater):] == repeater)): + # matched + if len(repeater) > len(longest_word): + longest_word = repeater + + if len(longest_word) > 0: + # if there is a match, use it + segment = segment_head + (dup * repetition) + else: + # if none found, make the repition to once + segment = segment_head + (dup * 1) + + # revert spaces + modified_line = " ".join(segments) + modified_lines.append(modified_line) + + # revert newlines + modified_text = "\n".join(modified_lines) + + return modified_text + + def normalize(text: str) -> str: """ Normalize and clean Thai text with normalizing rules as follows: diff --git a/tests/test_util.py b/tests/test_util.py index 1840e2dc0..b3165e25a 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -60,6 +60,7 @@ ipa_to_rtgs, remove_tone_ipa, tis620_to_utf8, + remove_repeat_consonants ) from pythainlp.util.spell_words import spell_word @@ -832,7 +833,8 @@ def test_convert_years(self): self.assertEqual(convert_years("242", src="re", target="ad"), "2023") self.assertEqual(convert_years("242", src="re", target="ah"), "1444") with self.assertRaises(NotImplementedError): - self.assertIsNotNone(convert_years("2023", src="cat", target="dog")) + self.assertIsNotNone(convert_years( + "2023", src="cat", target="dog")) def test_nectec_to_ipa(self): self.assertEqual(nectec_to_ipa("kl-uua-j^-2"), 'kl uua j ˥˩') @@ -846,17 +848,44 @@ def test_remove_tone_ipa(self): self.assertEqual(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"), "laː.sa.maj") def test_tis620_to_utf8(self): - self.assertEqual(tis620_to_utf8("¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม") + self.assertEqual(tis620_to_utf8( + "¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม") def test_spell_word(self): - self.assertEqual(spell_word("เสือ"),['สอ', 'เอือ', 'เสือ']) - self.assertEqual(spell_word("เสื้อ"),['สอ', 'เอือ', 'ไม้โท', 'เสื้อ']) - self.assertEqual(spell_word("คน"),['คอ', 'นอ', 'คน']) - self.assertEqual(spell_word("คนดี"),['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']) + self.assertEqual(spell_word("เสือ"), ['สอ', 'เอือ', 'เสือ']) + self.assertEqual(spell_word("เสื้อ"), ['สอ', 'เอือ', 'ไม้โท', 'เสื้อ']) + self.assertEqual(spell_word("คน"), ['คอ', 'นอ', 'คน']) + self.assertEqual(spell_word("คนดี"), [ + 'คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']) def test_rhyme(self): self.assertIsInstance(rhyme("แมว"), list) self.assertTrue(len(rhyme("แมว")) > 2) + def test_remove_repeat_consonants(self): + # update of pythainlp.copus.thai_words() able to break this + self.assertEqual( + remove_repeat_consonants('เริ่ดดดดดดดด'), + 'เริ่ด' + ) + self.assertEqual( + remove_repeat_consonants('อืมมมมมมมมมมมมมมม'), + 'อืมมม' + ) + + custom_dictionary = dict_trie(["อืมมมมม"]) + self.assertEqual( + remove_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary), + 'อืมมมมม' + ) + + self.assertEqual( + remove_repeat_consonants( + 'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด ' + 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ' + ), + 'ออืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ' + ) + # def test_abbreviation_to_full_text(self): # self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list)) From 702be9a3cd60629b045808b803e39e39532b23a1 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 01:48:41 +0900 Subject: [PATCH 05/36] Fix: push miss this is the right commit --- pythainlp/util/normalize.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 2beac8a38..a2614aa21 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -274,7 +274,9 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: for line in text.split("\n"): segments = line.split(" ") - for segment in segments: + for cnt in range(len(segments)): + segment = segments[cnt] + # skip if the segment is not the target if (not ((len(segment) > 1) # the segment is long enough @@ -300,32 +302,38 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: # remove all of the last repeating character segment_head = segment - while ((len(segment) > 0) and (segment[-1] == dup)): - segment = segment[:-1] + while ((len(segment_head) > 0) and (segment_head[-1] == dup)): + segment_head = segment_head[:-1] # find the longest word that matches the segment longest_word = "" - repetition = 0 + repetition = 0 # how much the last character is repeated correctly for repeater in repeaters: # remove all of the last repeating character repeater_head = repeater - while ((len(repeater) > 0) and (repeater[-1] == dup)): - repeater = repeater[:-1] + while ((len(repeater_head) > 0) and (repeater_head[-1] == dup)): + repeater_head = repeater_head[:-1] # check match - if ((len(segment) >= len(repeater)) - and (segment[-len(repeater):] == repeater)): + if ((len(segment_head) >= len(repeater_head)) + and (segment_head[-len(repeater_head):] == repeater_head)): # matched if len(repeater) > len(longest_word): longest_word = repeater + repetition = len(repeater) - len(repeater_head) if len(longest_word) > 0: # if there is a match, use it segment = segment_head + (dup * repetition) else: - # if none found, make the repition to once + # if none found, the chance is that the correct is one character, + # or it's not in the dictionary. + + # make the repition to once segment = segment_head + (dup * 1) + segments[cnt] = segment + # revert spaces modified_line = " ".join(segments) modified_lines.append(modified_line) From 130b1ecf4e069e5887872bb009b309d967375ae0 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 01:53:23 +0900 Subject: [PATCH 06/36] Fix: divide the exceeding length code --- pythainlp/util/normalize.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index a2614aa21..c338e8105 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -263,7 +263,8 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: # long text remove_repeat_consonants('อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '\ 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ') - # output: อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ + # output: อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ + # นี่เป็นความลับ """ # use default dictionary if not given if dictionary is None: @@ -311,12 +312,14 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: for repeater in repeaters: # remove all of the last repeating character repeater_head = repeater - while ((len(repeater_head) > 0) and (repeater_head[-1] == dup)): + while ((len(repeater_head) > 0) + and (repeater_head[-1] == dup)): repeater_head = repeater_head[:-1] # check match if ((len(segment_head) >= len(repeater_head)) - and (segment_head[-len(repeater_head):] == repeater_head)): + and (segment_head[-len(repeater_head):] + == repeater_head)): # matched if len(repeater) > len(longest_word): longest_word = repeater @@ -326,7 +329,8 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: # if there is a match, use it segment = segment_head + (dup * repetition) else: - # if none found, the chance is that the correct is one character, + # if none found, + # the chance is that the correct is one character, # or it's not in the dictionary. # make the repition to once From ef8ac0fd700da8934e18bb7617753201efdd045f Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 01:56:28 +0900 Subject: [PATCH 07/36] Refac: remove last white space --- pythainlp/util/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index c338e8105..d9d7c309e 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -226,7 +226,7 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: Remove repeating consonants at the last of the sentence. This function will remove the repeating consonants - before a whitespace, new line or at the last + before a whitespace, new line or at the last so that the last word matches a word in the given dictionary. If there is no match, the repeating consonants will be reduced to one. From 2df4d371644c99916bf77c839bacf786c0970ebf Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 01:58:13 +0900 Subject: [PATCH 08/36] Fix: restrict only to consonants --- pythainlp/util/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index d9d7c309e..788f173c1 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -281,7 +281,7 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: # skip if the segment is not the target if (not ((len(segment) > 1) # the segment is long enough - and (isthaichar(segment[-1])) # the last is Thai + and (segment[-1] in consonants) # last is Thai consonant and (segment[-1] == segment[-2]))): # has repiitition # skip From 16c3154aa2893236553ac74d836421f82b8768d2 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 01:59:21 +0900 Subject: [PATCH 09/36] Refac: Remove unused import --- pythainlp/util/normalize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 788f173c1..259760759 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -26,7 +26,6 @@ from pythainlp import thai_consonants as consonants from pythainlp.tokenize import word_tokenize from pythainlp.corpus import thai_words -from pythainlp.util import isthaichar from pythainlp.util.trie import Trie _DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e" From cc62a95ba5a696935b1f18a3cb83ca9d20d95cd2 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 08:34:14 +0900 Subject: [PATCH 10/36] Refac: Use enumerate pointed out by codacy --- pythainlp/util/normalize.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 259760759..8e97995ab 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -274,9 +274,7 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: for line in text.split("\n"): segments = line.split(" ") - for cnt in range(len(segments)): - segment = segments[cnt] - + for cnt, segment in enumerate(segments): # skip if the segment is not the target if (not ((len(segment) > 1) # the segment is long enough From d74af323a11865ebd083eddecfb00f4ba7fa2561 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 08:36:13 +0900 Subject: [PATCH 11/36] Fix: add the function in init --- pythainlp/util/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 2b2ff40e4..05432ecea 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -102,6 +102,7 @@ remove_tonemark, remove_zw, reorder_vowels, + remove_repeat_consonants ) from pythainlp.util.numtoword import bahttext, num_to_thaiword from pythainlp.util.strftime import thai_strftime From 5bfa50d552b962ba915c2c70e6ee357e510dfa0f Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 08:48:50 +0900 Subject: [PATCH 12/36] Refac: use black --- pythainlp/util/normalize.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 8e97995ab..858477f61 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -276,11 +276,14 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: for cnt, segment in enumerate(segments): # skip if the segment is not the target - if (not - ((len(segment) > 1) # the segment is long enough - and (segment[-1] in consonants) # last is Thai consonant - and (segment[-1] == segment[-2]))): # has repiitition - + if not ( + # the segment is long enough + (len(segment) > 1) + # last is Thai consonant + and (segment[-1] in consonants) + # has repiitition + and (segment[-1] == segment[-2]) + ): # skip continue @@ -300,7 +303,7 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: # remove all of the last repeating character segment_head = segment - while ((len(segment_head) > 0) and (segment_head[-1] == dup)): + while (len(segment_head) > 0) and (segment_head[-1] == dup): segment_head = segment_head[:-1] # find the longest word that matches the segment @@ -309,14 +312,13 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: for repeater in repeaters: # remove all of the last repeating character repeater_head = repeater - while ((len(repeater_head) > 0) - and (repeater_head[-1] == dup)): + while (len(repeater_head) > 0) and (repeater_head[-1] == dup): repeater_head = repeater_head[:-1] # check match - if ((len(segment_head) >= len(repeater_head)) - and (segment_head[-len(repeater_head):] - == repeater_head)): + if (len(segment_head) >= len(repeater_head)) and ( + segment_head[-len(repeater_head):] == repeater_head + ): # matched if len(repeater) > len(longest_word): longest_word = repeater From 28b6006693ccd0b4e9cb5b0c28b8119e51a32d27 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 08:52:24 +0900 Subject: [PATCH 13/36] Refac: repeatedly used black "1 file left unchanged." shown --- pythainlp/util/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 858477f61..9746689a6 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -317,7 +317,7 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: # check match if (len(segment_head) >= len(repeater_head)) and ( - segment_head[-len(repeater_head):] == repeater_head + segment_head[-len(repeater_head) :] == repeater_head ): # matched if len(repeater) > len(longest_word): From c6b564dad962caa3fda7a7d6ee0f1da41a6fd44f Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 09:01:01 +0900 Subject: [PATCH 14/36] Refac: resolve nested if pointed out by codeclimate --- pythainlp/util/normalize.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 9746689a6..0fcfb9764 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -316,13 +316,14 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: repeater_head = repeater_head[:-1] # check match - if (len(segment_head) >= len(repeater_head)) and ( - segment_head[-len(repeater_head) :] == repeater_head + if ( + (len(segment_head) >= len(repeater_head)) + and (segment_head[-len(repeater_head):] == repeater_head) + # matched confirmed, check it's longer + and (len(repeater) > len(longest_word)) ): - # matched - if len(repeater) > len(longest_word): - longest_word = repeater - repetition = len(repeater) - len(repeater_head) + longest_word = repeater + repetition = len(repeater) - len(repeater_head) if len(longest_word) > 0: # if there is a match, use it From 8d09323ac8b4d64e3ccdf8442cccc8ad5543bc28 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 10:45:38 +0900 Subject: [PATCH 15/36] Fix test case --- tests/test_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_util.py b/tests/test_util.py index b3165e25a..3c1618201 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -884,7 +884,7 @@ def test_remove_repeat_consonants(self): 'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด ' 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ' ), - 'ออืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ' + 'อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ' ) # def test_abbreviation_to_full_text(self): From 946f59c0ea61934a4291610c2345e407356b605e Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 10:56:39 +0900 Subject: [PATCH 16/36] Refac: seperate function Cognitive complexity pointed out by CodeClimate Black used --- pythainlp/util/normalize.py | 144 ++++++++++++++++++++---------------- 1 file changed, 79 insertions(+), 65 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 0fcfb9764..b3d36693a 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -51,9 +51,7 @@ ] # VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan -_NOREPEAT_CHARS = ( - f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e" -) +_NOREPEAT_CHARS = f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e" _NOREPEAT_PAIRS = list( zip([f"({ch}[ ]*)+{ch}" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS) ) @@ -275,68 +273,8 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: segments = line.split(" ") for cnt, segment in enumerate(segments): - # skip if the segment is not the target - if not ( - # the segment is long enough - (len(segment) > 1) - # last is Thai consonant - and (segment[-1] in consonants) - # has repiitition - and (segment[-1] == segment[-2]) - ): - # skip - continue - - # duplicating character - dup = segment[-1] - - # find the words that has 2 or more duplication of - # this character at the end. - # TODO: This maybe slow if the dictionary is large. - # If the dictionary not changed, this could be done - # only once in the kernel. - # But it will requires a global variable. - repeaters = [] - for word in dictionary: - if (len(word) > 1) and (word[-1] == word[-2] == dup): - repeaters.append(word) - - # remove all of the last repeating character - segment_head = segment - while (len(segment_head) > 0) and (segment_head[-1] == dup): - segment_head = segment_head[:-1] - - # find the longest word that matches the segment - longest_word = "" - repetition = 0 # how much the last character is repeated correctly - for repeater in repeaters: - # remove all of the last repeating character - repeater_head = repeater - while (len(repeater_head) > 0) and (repeater_head[-1] == dup): - repeater_head = repeater_head[:-1] - - # check match - if ( - (len(segment_head) >= len(repeater_head)) - and (segment_head[-len(repeater_head):] == repeater_head) - # matched confirmed, check it's longer - and (len(repeater) > len(longest_word)) - ): - longest_word = repeater - repetition = len(repeater) - len(repeater_head) - - if len(longest_word) > 0: - # if there is a match, use it - segment = segment_head + (dup * repetition) - else: - # if none found, - # the chance is that the correct is one character, - # or it's not in the dictionary. - - # make the repition to once - segment = segment_head + (dup * 1) - - segments[cnt] = segment + segments[cnt] = _remove_repeat_consonants_from_segment( + segment, dictionary) # revert spaces modified_line = " ".join(segments) @@ -348,6 +286,82 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: return modified_text +def _remove_repeat_consonants_from_segment(segment: str, dictionary: Trie) -> str: + """ + Remove repeating consonants at the last of the segment. + + This function process only at the last of the given text. + Details is same as remove_repeat_consonants(). + + :param str segment: segment of text + :param Trie dictionary: Trie dictionary to check the last word. + :return: segment without repeating Thai consonants + :rtype: str + """ + # skip if the segment is not the target + if not ( + # the segment is long enough + (len(segment) > 1) + # last is Thai consonant + and (segment[-1] in consonants) + # has repiitition + and (segment[-1] == segment[-2]) + ): + # no need to process + return segment + + # duplicating character + dup = segment[-1] + + # find the words that has 2 or more duplication of + # this character at the end. + # TODO: This maybe slow if the dictionary is large. + # If the dictionary not changed, this could be done + # only once in the kernel. + # But it will requires a global variable. + repeaters = [] + for word in dictionary: + if (len(word) > 1) and (word[-1] == word[-2] == dup): + repeaters.append(word) + + # remove all of the last repeating character + segment_head = segment + while (len(segment_head) > 0) and (segment_head[-1] == dup): + segment_head = segment_head[:-1] + + # find the longest word that matches the segment + longest_word = "" + repetition = 0 # how much the last character is repeated correctly + for repeater in repeaters: + # remove all of the last repeating character + repeater_head = repeater + while (len(repeater_head) > 0) and (repeater_head[-1] == dup): + repeater_head = repeater_head[:-1] + + # check match + if ( + (len(segment_head) >= len(repeater_head)) + and (segment_head[-len(repeater_head):] == repeater_head) + # matched confirmed, check it's longer + and (len(repeater) > len(longest_word)) + ): + longest_word = repeater + repetition = len(repeater) - len(repeater_head) + + if len(longest_word) > 0: + # if there is a match, use it + segment = segment_head + (dup * repetition) + else: + # if none found, + # the chance is that the correct is one character, + # or it's not in the dictionary. + + # make the repition to once + segment = segment_head + (dup * 1) + + return segment + + def normalize(text: str) -> str: """ Normalize and clean Thai text with normalizing rules as follows: From a5153e0b57f56f040423352d3aacb5a38ff6b4d4 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 10:58:45 +0900 Subject: [PATCH 17/36] Refac: reduce line length pointed out by Lint black used --- pythainlp/util/normalize.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index b3d36693a..2c56f813c 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -51,7 +51,9 @@ ] # VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan -_NOREPEAT_CHARS = f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e" +_NOREPEAT_CHARS = ( + f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e" +) _NOREPEAT_PAIRS = list( zip([f"({ch}[ ]*)+{ch}" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS) ) @@ -274,7 +276,8 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: for cnt, segment in enumerate(segments): segments[cnt] = _remove_repeat_consonants_from_segment( - segment, dictionary) + segment, dictionary + ) # revert spaces modified_line = " ".join(segments) @@ -286,7 +289,9 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: return modified_text -def _remove_repeat_consonants_from_segment(segment: str, dictionary: Trie) -> str: +def _remove_repeat_consonants_from_segment( + segment: str, dictionary: Trie +) -> str: """ Remove repeating consonants at the last of the segment. From 43dfd25f742fee532c4ef2a366761447d9389f91 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 11:08:19 +0900 Subject: [PATCH 18/36] Refac: seperate 2 functions --- pythainlp/util/normalize.py | 54 ++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 2c56f813c..67948cce6 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -324,24 +324,17 @@ def _remove_repeat_consonants_from_segment( # If the dictionary not changed, this could be done # only once in the kernel. # But it will requires a global variable. - repeaters = [] - for word in dictionary: - if (len(word) > 1) and (word[-1] == word[-2] == dup): - repeaters.append(word) + repeaters = _get_all_last_consonant_repeaters(dup, dictionary) # remove all of the last repeating character - segment_head = segment - while (len(segment_head) > 0) and (segment_head[-1] == dup): - segment_head = segment_head[:-1] + segment_head = _get_repitition_head(segment, dup) # find the longest word that matches the segment longest_word = "" repetition = 0 # how much the last character is repeated correctly for repeater in repeaters: # remove all of the last repeating character - repeater_head = repeater - while (len(repeater_head) > 0) and (repeater_head[-1] == dup): - repeater_head = repeater_head[:-1] + repeater_head = _get_repitition_head(repeater, dup) # check match if ( @@ -367,6 +360,47 @@ def _remove_repeat_consonants_from_segment( return segment +def _get_repitition_head(text: str, dup: str) -> str: + """ + Reduce repeating characters at the end of the text. + + This function will remove the repeating characters at the last. + The text just before the repeating characters will be returned. + + :param str text: input text + :param str dup: repeating character to be removed + :return: text without repeating characters at the end + :rtype: str + """ + head = text + while (len(head) > 0) and (head[-1] == dup): + head = head[:-1] + + return head + + +def _get_all_last_consonant_repeaters( + consonant: str, dictionary: Trie +) -> List[str]: + """ + Get all words that has repeating consonants at the end from the dictionary. + + Search all words in the dictionary that has more than 1 given consonants + repeating at the end. + + :param str consonant: consonant to be searched + :param Trie dictionary: Trie dictionary to search + :return: list of words that has repeating consonants at the end + :rtype: List[str] + """ + repeaters = [] + for word in dictionary: + if (len(word) > 1) and (word[-1] == word[-2] == consonant): + repeaters.append(word) + + return repeaters + + def normalize(text: str) -> str: """ Normalize and clean Thai text with normalizing rules as follows: From d9ae5343a7069e02de73121528555e54f6d07327 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 11:11:05 +0900 Subject: [PATCH 19/36] Refac: use black vscode autopep8 and black has been conflicting. So autopep8 cutted --- pythainlp/util/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 67948cce6..c25b6c58a 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -339,7 +339,7 @@ def _remove_repeat_consonants_from_segment( # check match if ( (len(segment_head) >= len(repeater_head)) - and (segment_head[-len(repeater_head):] == repeater_head) + and (segment_head[-len(repeater_head) :] == repeater_head) # matched confirmed, check it's longer and (len(repeater) > len(longest_word)) ): From 844c21d8f09ad357216ff5d6fea08d801737a662 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 11:27:04 +0900 Subject: [PATCH 20/36] Refac: seperate match finding method cognitive complexity pointed out by CodeClimate. Black used. --- pythainlp/util/normalize.py | 58 +++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index c25b6c58a..76dc8f3a0 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -16,7 +16,7 @@ Text normalization """ import re -from typing import List, Union +from typing import List, Tuple, Union from pythainlp import thai_above_vowels as above_v from pythainlp import thai_below_vowels as below_v @@ -330,21 +330,9 @@ def _remove_repeat_consonants_from_segment( segment_head = _get_repitition_head(segment, dup) # find the longest word that matches the segment - longest_word = "" - repetition = 0 # how much the last character is repeated correctly - for repeater in repeaters: - # remove all of the last repeating character - repeater_head = _get_repitition_head(repeater, dup) - - # check match - if ( - (len(segment_head) >= len(repeater_head)) - and (segment_head[-len(repeater_head) :] == repeater_head) - # matched confirmed, check it's longer - and (len(repeater) > len(longest_word)) - ): - longest_word = repeater - repetition = len(repeater) - len(repeater_head) + longest_word, repetition = _find_longest_consonant_repeaters_match( + segment_head, repeaters + ) if len(longest_word) > 0: # if there is a match, use it @@ -401,6 +389,44 @@ def _get_all_last_consonant_repeaters( return repeaters +def _find_longest_consonant_repeaters_match( + segment_head: str, repeaters: List[str] +) -> Tuple[str, int]: + """ + Find the longest word that matches the segment. + + Find the longest word that matches the last + of the segment from the given repeaters list. + This returns the word and + how much the last character is repeated correctly. + + :param str segment: segment of text + :param List[str] repeaters: list of words + that has repeating consonants at the end + :return: "tuple of the word" and + "how much the last character is repeated correctly" + If none, ("", 0) will be returned. + :rtype: Tuple[str, int] + """ + longest_word = "" # the longest word that matches the segment + repetition = 0 # how much the last character is repeated correctly + for repeater in repeaters: + # remove all of the last repeating character + repeater_head = _get_repitition_head(repeater, repeater[-1]) + + # check match + if ( + (len(segment_head) >= len(repeater_head)) + and (segment_head[-len(repeater_head) :] == repeater_head) + # matched confirmed, check it's longer + and (len(repeater) > len(longest_word)) + ): + longest_word = repeater + repetition = len(repeater) - len(repeater_head) + + return longest_word, repetition + + def normalize(text: str) -> str: """ Normalize and clean Thai text with normalizing rules as follows: From 1e1631f3378cb0965d6730eb3c7da62fdd7cd825 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 11:43:32 +0900 Subject: [PATCH 21/36] Improve: save consonants repeaters for improve speed TODO resolved, black used, test passed --- pythainlp/util/normalize.py | 55 +++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 76dc8f3a0..4939ad062 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -33,6 +33,14 @@ _ZERO_WIDTH_CHARS = "\u200b\u200c" # ZWSP, ZWNJ +# used by remove_repeat_consonants() +# contains all words that has repeating consonants at the end +# for each consonant +# when dictionary updated, this should be updated too +# key: consonant +# value: list of words that has repeating consonants at the end +consonants_repeaters = {} + _REORDER_PAIRS = [ ("\u0e40\u0e40", "\u0e41"), # Sara E + Sara E -> Sara Ae ( @@ -220,7 +228,9 @@ def remove_repeat_vowels(text: str) -> str: return text -def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: +def remove_repeat_consonants( + text: str, dictionary: Trie = None, dictionary_updated: bool = True +) -> str: """ Remove repeating consonants at the last of the sentence. @@ -237,6 +247,9 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: :param str text: input text :param Trie dictionary: Trie dictionary to check the last word. If None, pythainlp.corpus.thai_words() will be used + :param bool dictionary_updated: If the dictionary is updated + or the first time using in the kernel, set this true. + If not, set this false to save time. :return: text without repeating Thai consonants :rtype: str @@ -269,6 +282,10 @@ def remove_repeat_consonants(text: str, dictionary: Trie = None) -> str: if dictionary is None: dictionary = thai_words() + # update repeaters dictionary if not updated + if dictionary_updated: + _update_consonant_repeaters(dictionary) + # seperate by newline modified_lines = [] for line in text.split("\n"): @@ -320,11 +337,7 @@ def _remove_repeat_consonants_from_segment( # find the words that has 2 or more duplication of # this character at the end. - # TODO: This maybe slow if the dictionary is large. - # If the dictionary not changed, this could be done - # only once in the kernel. - # But it will requires a global variable. - repeaters = _get_all_last_consonant_repeaters(dup, dictionary) + repeaters = consonants_repeaters[dup] # remove all of the last repeating character segment_head = _get_repitition_head(segment, dup) @@ -367,26 +380,32 @@ def _get_repitition_head(text: str, dup: str) -> str: return head -def _get_all_last_consonant_repeaters( - consonant: str, dictionary: Trie -) -> List[str]: +def _update_consonant_repeaters(dictionary: Trie) -> None: """ - Get all words that has repeating consonants at the end from the dictionary. + Update dictionary of all words that has + repeating consonants at the end from the dictionary. - Search all words in the dictionary that has more than 1 given consonants - repeating at the end. + Search all words in the dictionary that has more than 1 consonants + repeating at the end and store them in the global dictionary. :param str consonant: consonant to be searched :param Trie dictionary: Trie dictionary to search - :return: list of words that has repeating consonants at the end - :rtype: List[str] + :rtype: None """ - repeaters = [] + # initialize dictionary + for consonant in list(consonants): + consonants_repeaters[consonant] = [] + + # register for word in dictionary: - if (len(word) > 1) and (word[-1] == word[-2] == consonant): - repeaters.append(word) + if ( + (len(word) > 1) + and (word[-1] == word[-2]) + and (word[-1] in consonants) + ): + consonants_repeaters[word[-1]].append(word) - return repeaters + return def _find_longest_consonant_repeaters_match( From ceb9d76a4fe935e4bd8f46dabebaccfdfa9938f0 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Fri, 10 Nov 2023 11:51:40 +0900 Subject: [PATCH 22/36] Refac: make repeater checking function Code complexity pointed out by CodeClimate, black used --- pythainlp/util/normalize.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 4939ad062..9c47f407e 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -398,16 +398,28 @@ def _update_consonant_repeaters(dictionary: Trie) -> None: # register for word in dictionary: - if ( - (len(word) > 1) - and (word[-1] == word[-2]) - and (word[-1] in consonants) - ): + if _is_consonant_repeater(word): consonants_repeaters[word[-1]].append(word) return +def _is_consonant_repeater(word: str) -> bool: + """ + Check if the word has repeating consonants at the end. + + This function checks if the word has + more than 1 repeating consonants at the end. + + :param str word: word to be checked + :return: True if the word has repeating consonants at the end. + :rtype: bool + """ + return ( + (len(word) > 1) and (word[-1] == word[-2]) and (word[-1] in consonants) + ) + + def _find_longest_consonant_repeaters_match( segment_head: str, repeaters: List[str] ) -> Tuple[str, int]: From 6509e0da3478bf01a2cf248d96f00b8030c6fae9 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Sat, 11 Nov 2023 23:11:16 +0900 Subject: [PATCH 23/36] Refac: seperate function --- pythainlp/util/__init__.py | 2 +- pythainlp/util/normalize.py | 248 +--------------------- pythainlp/util/removerepeatconsonants.py | 253 +++++++++++++++++++++++ 3 files changed, 257 insertions(+), 246 deletions(-) create mode 100644 pythainlp/util/removerepeatconsonants.py diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 05432ecea..99bc46621 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -102,8 +102,8 @@ remove_tonemark, remove_zw, reorder_vowels, - remove_repeat_consonants ) +from pythainlp.util.removerepeatconsonants import remove_repeat_consonants from pythainlp.util.numtoword import bahttext, num_to_thaiword from pythainlp.util.strftime import thai_strftime from pythainlp.util.thai import ( diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 9c47f407e..825ed79eb 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -16,31 +16,21 @@ Text normalization """ import re -from typing import List, Tuple, Union +from typing import List, Union from pythainlp import thai_above_vowels as above_v from pythainlp import thai_below_vowels as below_v from pythainlp import thai_follow_vowels as follow_v from pythainlp import thai_lead_vowels as lead_v from pythainlp import thai_tonemarks as tonemarks -from pythainlp import thai_consonants as consonants from pythainlp.tokenize import word_tokenize -from pythainlp.corpus import thai_words -from pythainlp.util.trie import Trie + _DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e" _RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+") _ZERO_WIDTH_CHARS = "\u200b\u200c" # ZWSP, ZWNJ -# used by remove_repeat_consonants() -# contains all words that has repeating consonants at the end -# for each consonant -# when dictionary updated, this should be updated too -# key: consonant -# value: list of words that has repeating consonants at the end -consonants_repeaters = {} - _REORDER_PAIRS = [ ("\u0e40\u0e40", "\u0e41"), # Sara E + Sara E -> Sara Ae ( @@ -59,9 +49,7 @@ ] # VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan -_NOREPEAT_CHARS = ( - f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e" -) +_NOREPEAT_CHARS = f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e" _NOREPEAT_PAIRS = list( zip([f"({ch}[ ]*)+{ch}" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS) ) @@ -228,236 +216,6 @@ def remove_repeat_vowels(text: str) -> str: return text -def remove_repeat_consonants( - text: str, dictionary: Trie = None, dictionary_updated: bool = True -) -> str: - """ - Remove repeating consonants at the last of the sentence. - - This function will remove the repeating consonants - before a whitespace, new line or at the last - so that the last word matches a word in the given dictionary. - If there is no match, the repeating consonants will be - reduced to one. - If there are several match, the longest word will be used. - Since this function uses a dictionary, the result may differs - depending on the dictionary used. - Plus, it is recommended to use normalize() to have a better result. - - :param str text: input text - :param Trie dictionary: Trie dictionary to check the last word. - If None, pythainlp.corpus.thai_words() will be used - :param bool dictionary_updated: If the dictionary is updated - or the first time using in the kernel, set this true. - If not, set this false to save time. - :return: text without repeating Thai consonants - :rtype: str - - :Example: - :: - - from pythainlp.util import remove_repeat_consonants - from pythainlp.util import dict_trie - - # use default dictionary (pythainlp.corpus.thai_words()) - remove_repeat_consonants('เริ่ดดดดดดดด') - # output: เริ่ด - - remove_repeat_consonants('อืมมมมมมมมมมมมมมม') - # output: อืมมม - # "อืมมม" is in the default dictionary - - # use custom dictionary - custom_dictionary = dict_trie(["อืมมมมม"]) - remove_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary) - # output: อืมมมมม - - # long text - remove_repeat_consonants('อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '\ - 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ') - # output: อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ - # นี่เป็นความลับ - """ - # use default dictionary if not given - if dictionary is None: - dictionary = thai_words() - - # update repeaters dictionary if not updated - if dictionary_updated: - _update_consonant_repeaters(dictionary) - - # seperate by newline - modified_lines = [] - for line in text.split("\n"): - segments = line.split(" ") - - for cnt, segment in enumerate(segments): - segments[cnt] = _remove_repeat_consonants_from_segment( - segment, dictionary - ) - - # revert spaces - modified_line = " ".join(segments) - modified_lines.append(modified_line) - - # revert newlines - modified_text = "\n".join(modified_lines) - - return modified_text - - -def _remove_repeat_consonants_from_segment( - segment: str, dictionary: Trie -) -> str: - """ - Remove repeating consonants at the last of the segment. - - This function process only at the last of the given text. - Details is same as remove_repeat_consonants(). - - :param str segment: segment of text - :param Trie dictionary: Trie dictionary to check the last word. - :return: segment without repeating Thai consonants - :rtype: str - """ - # skip if the segment is not the target - if not ( - # the segment is long enough - (len(segment) > 1) - # last is Thai consonant - and (segment[-1] in consonants) - # has repiitition - and (segment[-1] == segment[-2]) - ): - # no need to process - return segment - - # duplicating character - dup = segment[-1] - - # find the words that has 2 or more duplication of - # this character at the end. - repeaters = consonants_repeaters[dup] - - # remove all of the last repeating character - segment_head = _get_repitition_head(segment, dup) - - # find the longest word that matches the segment - longest_word, repetition = _find_longest_consonant_repeaters_match( - segment_head, repeaters - ) - - if len(longest_word) > 0: - # if there is a match, use it - segment = segment_head + (dup * repetition) - else: - # if none found, - # the chance is that the correct is one character, - # or it's not in the dictionary. - - # make the repition to once - segment = segment_head + (dup * 1) - - return segment - - -def _get_repitition_head(text: str, dup: str) -> str: - """ - Reduce repeating characters at the end of the text. - - This function will remove the repeating characters at the last. - The text just before the repeating characters will be returned. - - :param str text: input text - :param str dup: repeating character to be removed - :return: text without repeating characters at the end - :rtype: str - """ - head = text - while (len(head) > 0) and (head[-1] == dup): - head = head[:-1] - - return head - - -def _update_consonant_repeaters(dictionary: Trie) -> None: - """ - Update dictionary of all words that has - repeating consonants at the end from the dictionary. - - Search all words in the dictionary that has more than 1 consonants - repeating at the end and store them in the global dictionary. - - :param str consonant: consonant to be searched - :param Trie dictionary: Trie dictionary to search - :rtype: None - """ - # initialize dictionary - for consonant in list(consonants): - consonants_repeaters[consonant] = [] - - # register - for word in dictionary: - if _is_consonant_repeater(word): - consonants_repeaters[word[-1]].append(word) - - return - - -def _is_consonant_repeater(word: str) -> bool: - """ - Check if the word has repeating consonants at the end. - - This function checks if the word has - more than 1 repeating consonants at the end. - - :param str word: word to be checked - :return: True if the word has repeating consonants at the end. - :rtype: bool - """ - return ( - (len(word) > 1) and (word[-1] == word[-2]) and (word[-1] in consonants) - ) - - -def _find_longest_consonant_repeaters_match( - segment_head: str, repeaters: List[str] -) -> Tuple[str, int]: - """ - Find the longest word that matches the segment. - - Find the longest word that matches the last - of the segment from the given repeaters list. - This returns the word and - how much the last character is repeated correctly. - - :param str segment: segment of text - :param List[str] repeaters: list of words - that has repeating consonants at the end - :return: "tuple of the word" and - "how much the last character is repeated correctly" - If none, ("", 0) will be returned. - :rtype: Tuple[str, int] - """ - longest_word = "" # the longest word that matches the segment - repetition = 0 # how much the last character is repeated correctly - for repeater in repeaters: - # remove all of the last repeating character - repeater_head = _get_repitition_head(repeater, repeater[-1]) - - # check match - if ( - (len(segment_head) >= len(repeater_head)) - and (segment_head[-len(repeater_head) :] == repeater_head) - # matched confirmed, check it's longer - and (len(repeater) > len(longest_word)) - ): - longest_word = repeater - repetition = len(repeater) - len(repeater_head) - - return longest_word, repetition - - def normalize(text: str) -> str: """ Normalize and clean Thai text with normalizing rules as follows: diff --git a/pythainlp/util/removerepeatconsonants.py b/pythainlp/util/removerepeatconsonants.py new file mode 100644 index 000000000..966712b37 --- /dev/null +++ b/pythainlp/util/removerepeatconsonants.py @@ -0,0 +1,253 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Removement of repeated consonants +""" +from pythainlp.corpus import thai_words +from pythainlp.util.trie import Trie +from pythainlp import thai_consonants as consonants +from typing import Tuple, List + +# used by remove_repeat_consonants() +# contains all words that has repeating consonants at the end +# for each consonant +# when dictionary updated, this should be updated too +# key: consonan +# value: list of words that has repeating consonants at the end +consonants_repeaters = {} + + +def remove_repeat_consonants( + text: str, dictionary: Trie = None, dictionary_updated: bool = True +) -> str: + """ + Remove repeating consonants at the last of the sentence. + + This function will remove the repeating consonants + before a whitespace, new line or at the last + so that the last word matches a word in the given dictionary. + If there is no match, the repeating consonants will be + reduced to one. + If there are several match, the longest word will be used. + Since this function uses a dictionary, the result may differs + depending on the dictionary used. + Plus, it is recommended to use normalize() to have a better result. + + :param str text: input text + :param Trie dictionary: Trie dictionary to check the last word. + If None, pythainlp.corpus.thai_words() will be used + :param bool dictionary_updated: If the dictionary is updated + or the first time using in the kernel, set this true. + If not, set this false to save time. + :return: text without repeating Thai consonants + :rtype: str + + :Example: + :: + + from pythainlp.util import remove_repeat_consonants + from pythainlp.util import dict_trie + + # use default dictionary (pythainlp.corpus.thai_words()) + remove_repeat_consonants('เริ่ดดดดดดดด') + # output: เริ่ด + + remove_repeat_consonants('อืมมมมมมมมมมมมมมม') + # output: อืมมม + # "อืมมม" is in the default dictionary + + # use custom dictionary + custom_dictionary = dict_trie(["อืมมมมม"]) + remove_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary) + # output: อืมมมมม + + # long text + remove_repeat_consonants('อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '\ + 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ') + # output: อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ + # นี่เป็นความลับ + """ + # use default dictionary if not given + if dictionary is None: + dictionary = thai_words() + + # update repeaters dictionary if not updated + if dictionary_updated: + _update_consonant_repeaters(dictionary) + + # seperate by newline + modified_lines = [] + for line in text.split("\n"): + segments = line.split(" ") + + for cnt, segment in enumerate(segments): + segments[cnt] = _remove_repeat_consonants_from_segment(segment, dictionary) + + # revert spaces + modified_line = " ".join(segments) + modified_lines.append(modified_line) + + # revert newlines + modified_text = "\n".join(modified_lines) + + return modified_text + + +def _remove_repeat_consonants_from_segment(segment: str, dictionary: Trie) -> str: + """ + Remove repeating consonants at the last of the segment. + + This function process only at the last of the given text. + Details is same as remove_repeat_consonants(). + + :param str segment: segment of text + :param Trie dictionary: Trie dictionary to check the last word. + :return: segment without repeating Thai consonants + :rtype: str + """ + # skip if the segment is not the target + if not ( + # the segment is long enough + (len(segment) > 1) + # last is Thai consonant + and (segment[-1] in consonants) + # has repiitition + and (segment[-1] == segment[-2]) + ): + # no need to process + return segment + + # duplicating character + dup = segment[-1] + + # find the words that has 2 or more duplication of + # this character at the end. + repeaters = consonants_repeaters[dup] + + # remove all of the last repeating character + segment_head = _get_repitition_head(segment, dup) + + # find the longest word that matches the segment + longest_word, repetition = _find_longest_consonant_repeaters_match( + segment_head, repeaters + ) + + if len(longest_word) > 0: + # if there is a match, use it + segment = segment_head + (dup * repetition) + else: + # if none found, + # the chance is that the correct is one character, + # or it's not in the dictionary. + + # make the repition to once + segment = segment_head + (dup * 1) + + return segment + + +def _get_repitition_head(text: str, dup: str) -> str: + """ + Reduce repeating characters at the end of the text. + + This function will remove the repeating characters at the last. + The text just before the repeating characters will be returned. + + :param str text: input text + :param str dup: repeating character to be removed + :return: text without repeating characters at the end + :rtype: str + """ + head = text + while (len(head) > 0) and (head[-1] == dup): + head = head[:-1] + + return head + + +def _update_consonant_repeaters(dictionary: Trie) -> None: + """ + Update dictionary of all words that has + repeating consonants at the end from the dictionary. + + Search all words in the dictionary that has more than 1 consonants + repeating at the end and store them in the global dictionary. + + :param str consonant: consonant to be searched + :param Trie dictionary: Trie dictionary to search + :rtype: None + """ + # initialize dictionary + for consonant in list(consonants): + consonants_repeaters[consonant] = [] + + # register + for word in dictionary: + if _is_consonant_repeater(word): + consonants_repeaters[word[-1]].append(word) + + return + + +def _is_consonant_repeater(word: str) -> bool: + """ + Check if the word has repeating consonants at the end. + + This function checks if the word has + more than 1 repeating consonants at the end. + + :param str word: word to be checked + :return: True if the word has repeating consonants at the end. + :rtype: bool + """ + return (len(word) > 1) and (word[-1] == word[-2]) and (word[-1] in consonants) + + +def _find_longest_consonant_repeaters_match( + segment_head: str, repeaters: List[str] +) -> Tuple[str, int]: + """ + Find the longest word that matches the segment. + + Find the longest word that matches the last + of the segment from the given repeaters list. + This returns the word and + how much the last character is repeated correctly. + + :param str segment: segment of text + :param List[str] repeaters: list of words + that has repeating consonants at the end + :return: "tuple of the word" and + "how much the last character is repeated correctly" + If none, ("", 0) will be returned. + :rtype: Tuple[str, int] + """ + longest_word = "" # the longest word that matches the segment + repetition = 0 # how much the last character is repeated correctly + for repeater in repeaters: + # remove all of the last repeating character + repeater_head = _get_repitition_head(repeater, repeater[-1]) + + # check match + if ( + (len(segment_head) >= len(repeater_head)) + and (segment_head[-len(repeater_head) :] == repeater_head) + # matched confirmed, check it's longer + and (len(repeater) > len(longest_word)) + ): + longest_word = repeater + repetition = len(repeater) - len(repeater_head) + + return longest_word, repetition From 9c1a34ca39a3c11d6d27ca8c254c7fd92c93a2d6 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Sat, 11 Nov 2023 23:15:59 +0900 Subject: [PATCH 24/36] Improve: Rename method suggested by https://github.com/PyThaiNLP/pythainlp/pull/862#issuecomment-1805830606 --- pythainlp/util/__init__.py | 2 +- pythainlp/util/removerepeatconsonants.py | 2 +- tests/test_util.py | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 99bc46621..3b03f8367 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -103,7 +103,7 @@ remove_zw, reorder_vowels, ) -from pythainlp.util.removerepeatconsonants import remove_repeat_consonants +from pythainlp.util.removerepeatconsonants import remove_trailing_repeat_consonants from pythainlp.util.numtoword import bahttext, num_to_thaiword from pythainlp.util.strftime import thai_strftime from pythainlp.util.thai import ( diff --git a/pythainlp/util/removerepeatconsonants.py b/pythainlp/util/removerepeatconsonants.py index 966712b37..32b0cf472 100644 --- a/pythainlp/util/removerepeatconsonants.py +++ b/pythainlp/util/removerepeatconsonants.py @@ -29,7 +29,7 @@ consonants_repeaters = {} -def remove_repeat_consonants( +def remove_trailing_repeat_consonants( text: str, dictionary: Trie = None, dictionary_updated: bool = True ) -> str: """ diff --git a/tests/test_util.py b/tests/test_util.py index 3c1618201..e45319c99 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -60,7 +60,7 @@ ipa_to_rtgs, remove_tone_ipa, tis620_to_utf8, - remove_repeat_consonants + remove_trailing_repeat_consonants ) from pythainlp.util.spell_words import spell_word @@ -865,22 +865,22 @@ def test_rhyme(self): def test_remove_repeat_consonants(self): # update of pythainlp.copus.thai_words() able to break this self.assertEqual( - remove_repeat_consonants('เริ่ดดดดดดดด'), + remove_trailing_repeat_consonants('เริ่ดดดดดดดด'), 'เริ่ด' ) self.assertEqual( - remove_repeat_consonants('อืมมมมมมมมมมมมมมม'), + remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม'), 'อืมมม' ) custom_dictionary = dict_trie(["อืมมมมม"]) self.assertEqual( - remove_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary), + remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary), 'อืมมมมม' ) self.assertEqual( - remove_repeat_consonants( + remove_trailing_repeat_consonants( 'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด ' 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ' ), From 24c30500ab3616c4a74ef204b6b86884c55bcb97 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Sat, 11 Nov 2023 23:27:58 +0900 Subject: [PATCH 25/36] Refac: make names more clear --- pythainlp/util/removerepeatconsonants.py | 39 ++++++++++++------------ 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/pythainlp/util/removerepeatconsonants.py b/pythainlp/util/removerepeatconsonants.py index 32b0cf472..aad8f2638 100644 --- a/pythainlp/util/removerepeatconsonants.py +++ b/pythainlp/util/removerepeatconsonants.py @@ -26,11 +26,11 @@ # when dictionary updated, this should be updated too # key: consonan # value: list of words that has repeating consonants at the end -consonants_repeaters = {} +last_consonants_repeaters = {} def remove_trailing_repeat_consonants( - text: str, dictionary: Trie = None, dictionary_updated: bool = True + text: str, dictionary: Trie = None, has_dictionary_updated: bool = True ) -> str: """ Remove repeating consonants at the last of the sentence. @@ -48,7 +48,7 @@ def remove_trailing_repeat_consonants( :param str text: input text :param Trie dictionary: Trie dictionary to check the last word. If None, pythainlp.corpus.thai_words() will be used - :param bool dictionary_updated: If the dictionary is updated + :param bool has_dictionary_updated: If the dictionary is updated or the first time using in the kernel, set this true. If not, set this false to save time. :return: text without repeating Thai consonants @@ -84,7 +84,7 @@ def remove_trailing_repeat_consonants( dictionary = thai_words() # update repeaters dictionary if not updated - if dictionary_updated: + if has_dictionary_updated: _update_consonant_repeaters(dictionary) # seperate by newline @@ -93,7 +93,9 @@ def remove_trailing_repeat_consonants( segments = line.split(" ") for cnt, segment in enumerate(segments): - segments[cnt] = _remove_repeat_consonants_from_segment(segment, dictionary) + segments[cnt] = _remove_repeat_trailing_consonants_from_segment( + segment, dictionary + ) # revert spaces modified_line = " ".join(segments) @@ -105,7 +107,7 @@ def remove_trailing_repeat_consonants( return modified_text -def _remove_repeat_consonants_from_segment(segment: str, dictionary: Trie) -> str: +def _remove_repeat_trailing_consonants_from_segment(segment: str) -> str: """ Remove repeating consonants at the last of the segment. @@ -113,7 +115,6 @@ def _remove_repeat_consonants_from_segment(segment: str, dictionary: Trie) -> st Details is same as remove_repeat_consonants(). :param str segment: segment of text - :param Trie dictionary: Trie dictionary to check the last word. :return: segment without repeating Thai consonants :rtype: str """ @@ -134,10 +135,10 @@ def _remove_repeat_consonants_from_segment(segment: str, dictionary: Trie) -> st # find the words that has 2 or more duplication of # this character at the end. - repeaters = consonants_repeaters[dup] + repeaters = last_consonants_repeaters[dup] # remove all of the last repeating character - segment_head = _get_repitition_head(segment, dup) + segment_head = _remove_all_last_consonants(segment, dup) # find the longest word that matches the segment longest_word, repetition = _find_longest_consonant_repeaters_match( @@ -158,7 +159,7 @@ def _remove_repeat_consonants_from_segment(segment: str, dictionary: Trie) -> st return segment -def _get_repitition_head(text: str, dup: str) -> str: +def _remove_all_last_consonants(text: str, dup: str) -> str: """ Reduce repeating characters at the end of the text. @@ -170,11 +171,11 @@ def _get_repitition_head(text: str, dup: str) -> str: :return: text without repeating characters at the end :rtype: str """ - head = text - while (len(head) > 0) and (head[-1] == dup): - head = head[:-1] + removed = text + while (len(removed) > 0) and (removed[-1] == dup): + removed = removed[:-1] - return head + return removed def _update_consonant_repeaters(dictionary: Trie) -> None: @@ -191,17 +192,17 @@ def _update_consonant_repeaters(dictionary: Trie) -> None: """ # initialize dictionary for consonant in list(consonants): - consonants_repeaters[consonant] = [] + last_consonants_repeaters[consonant] = [] # register for word in dictionary: - if _is_consonant_repeater(word): - consonants_repeaters[word[-1]].append(word) + if _is_last_consonant_repeater(word): + last_consonants_repeaters[word[-1]].append(word) return -def _is_consonant_repeater(word: str) -> bool: +def _is_last_consonant_repeater(word: str) -> bool: """ Check if the word has repeating consonants at the end. @@ -238,7 +239,7 @@ def _find_longest_consonant_repeaters_match( repetition = 0 # how much the last character is repeated correctly for repeater in repeaters: # remove all of the last repeating character - repeater_head = _get_repitition_head(repeater, repeater[-1]) + repeater_head = _remove_all_last_consonants(repeater, repeater[-1]) # check match if ( From 13cf54ad1657e820f7bd9d34105abbcfc51b3cbf Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Sat, 11 Nov 2023 23:38:15 +0900 Subject: [PATCH 26/36] Refac: reflect method name change --- pythainlp/util/__init__.py | 2 +- ...sonants.py => remove_trailing_repeat_consonants.py} | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) rename pythainlp/util/{removerepeatconsonants.py => remove_trailing_repeat_consonants.py} (94%) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 3b03f8367..d05d15c3a 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -103,7 +103,7 @@ remove_zw, reorder_vowels, ) -from pythainlp.util.removerepeatconsonants import remove_trailing_repeat_consonants +from pythainlp.util.remove_trailing_repeat_consonants import remove_trailing_repeat_consonants from pythainlp.util.numtoword import bahttext, num_to_thaiword from pythainlp.util.strftime import thai_strftime from pythainlp.util.thai import ( diff --git a/pythainlp/util/removerepeatconsonants.py b/pythainlp/util/remove_trailing_repeat_consonants.py similarity index 94% rename from pythainlp/util/removerepeatconsonants.py rename to pythainlp/util/remove_trailing_repeat_consonants.py index aad8f2638..33ca1c692 100644 --- a/pythainlp/util/removerepeatconsonants.py +++ b/pythainlp/util/remove_trailing_repeat_consonants.py @@ -57,24 +57,24 @@ def remove_trailing_repeat_consonants( :Example: :: - from pythainlp.util import remove_repeat_consonants + from pythainlp.util import remove_trailing_repeat_consonants from pythainlp.util import dict_trie # use default dictionary (pythainlp.corpus.thai_words()) - remove_repeat_consonants('เริ่ดดดดดดดด') + remove_trailing_repeat_consonants('เริ่ดดดดดดดด') # output: เริ่ด - remove_repeat_consonants('อืมมมมมมมมมมมมมมม') + remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม') # output: อืมมม # "อืมมม" is in the default dictionary # use custom dictionary custom_dictionary = dict_trie(["อืมมมมม"]) - remove_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary) + remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary) # output: อืมมมมม # long text - remove_repeat_consonants('อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '\ + remove_trailing_repeat_consonants('อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '\ 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ') # output: อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ # นี่เป็นความลับ From a94fccbd8bcab9db092a2bdc12f0d854d6f8f0f2 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Sat, 11 Nov 2023 23:39:22 +0900 Subject: [PATCH 27/36] Fix: argument inconsistence --- pythainlp/util/remove_trailing_repeat_consonants.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pythainlp/util/remove_trailing_repeat_consonants.py b/pythainlp/util/remove_trailing_repeat_consonants.py index 33ca1c692..6f2d92c39 100644 --- a/pythainlp/util/remove_trailing_repeat_consonants.py +++ b/pythainlp/util/remove_trailing_repeat_consonants.py @@ -93,9 +93,7 @@ def remove_trailing_repeat_consonants( segments = line.split(" ") for cnt, segment in enumerate(segments): - segments[cnt] = _remove_repeat_trailing_consonants_from_segment( - segment, dictionary - ) + segments[cnt] = _remove_repeat_trailing_consonants_from_segment(segment) # revert spaces modified_line = " ".join(segments) From 832d28c2767f91f7881243dbf0bc1084b2b4508a Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Sat, 11 Nov 2023 23:46:13 +0900 Subject: [PATCH 28/36] Refac: revert to the first place because this PR inplemention seperated from normalize.py --- pythainlp/util/normalize.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 825ed79eb..b7e0f558b 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -49,7 +49,9 @@ ] # VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan -_NOREPEAT_CHARS = f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e" +_NOREPEAT_CHARS = ( + f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e" +) _NOREPEAT_PAIRS = list( zip([f"({ch}[ ]*)+{ch}" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS) ) @@ -297,4 +299,4 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]: i += 1 _list_word.append(text) i += 1 - return _list_word + return _list_word \ No newline at end of file From 95761ea55f88466095d1f114a97a9f1680fe8615 Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Sat, 11 Nov 2023 23:46:33 +0900 Subject: [PATCH 29/36] Refac: use black line-length=79 --- pythainlp/util/remove_trailing_repeat_consonants.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pythainlp/util/remove_trailing_repeat_consonants.py b/pythainlp/util/remove_trailing_repeat_consonants.py index 6f2d92c39..7ccec361e 100644 --- a/pythainlp/util/remove_trailing_repeat_consonants.py +++ b/pythainlp/util/remove_trailing_repeat_consonants.py @@ -93,7 +93,9 @@ def remove_trailing_repeat_consonants( segments = line.split(" ") for cnt, segment in enumerate(segments): - segments[cnt] = _remove_repeat_trailing_consonants_from_segment(segment) + segments[cnt] = _remove_repeat_trailing_consonants_from_segment( + segment + ) # revert spaces modified_line = " ".join(segments) @@ -211,7 +213,9 @@ def _is_last_consonant_repeater(word: str) -> bool: :return: True if the word has repeating consonants at the end. :rtype: bool """ - return (len(word) > 1) and (word[-1] == word[-2]) and (word[-1] in consonants) + return ( + (len(word) > 1) and (word[-1] == word[-2]) and (word[-1] in consonants) + ) def _find_longest_consonant_repeaters_match( From cefc4e7246114867f8c5ec0827e236161fdbbafa Mon Sep 17 00:00:00 2001 From: konbraphat51 Date: Sat, 11 Nov 2023 23:54:28 +0900 Subject: [PATCH 30/36] Refac: reduce col length used black line-length=79 --- pythainlp/util/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index d05d15c3a..f6f5e373c 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -103,7 +103,9 @@ remove_zw, reorder_vowels, ) -from pythainlp.util.remove_trailing_repeat_consonants import remove_trailing_repeat_consonants +from pythainlp.util.remove_trailing_repeat_consonants import ( + remove_trailing_repeat_consonants, +) from pythainlp.util.numtoword import bahttext, num_to_thaiword from pythainlp.util.strftime import thai_strftime from pythainlp.util.thai import ( From fd2896b0cc1aa889b21e461687f23bbfbc08c9d4 Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:32:03 +0900 Subject: [PATCH 31/36] Refac: add last new line --- pythainlp/util/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index b7e0f558b..a8cacae22 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -299,4 +299,4 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]: i += 1 _list_word.append(text) i += 1 - return _list_word \ No newline at end of file + return _list_word From ee492f14c4ec992c152869c595c8eea624416aac Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:48:17 +0900 Subject: [PATCH 32/36] Update commentation Update responding to method rename --- pythainlp/util/remove_trailing_repeat_consonants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/util/remove_trailing_repeat_consonants.py b/pythainlp/util/remove_trailing_repeat_consonants.py index 7ccec361e..60634b236 100644 --- a/pythainlp/util/remove_trailing_repeat_consonants.py +++ b/pythainlp/util/remove_trailing_repeat_consonants.py @@ -20,7 +20,7 @@ from pythainlp import thai_consonants as consonants from typing import Tuple, List -# used by remove_repeat_consonants() +# used by remove_trailing_repeat_consonants() # contains all words that has repeating consonants at the end # for each consonant # when dictionary updated, this should be updated too From 4212ff3e1c4a776735f0dfc91ad19291302bfc8e Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:51:32 +0900 Subject: [PATCH 33/36] Refac: clearify commentation --- pythainlp/util/remove_trailing_repeat_consonants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/util/remove_trailing_repeat_consonants.py b/pythainlp/util/remove_trailing_repeat_consonants.py index 60634b236..51e269575 100644 --- a/pythainlp/util/remove_trailing_repeat_consonants.py +++ b/pythainlp/util/remove_trailing_repeat_consonants.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Removement of repeated consonants +Removement of repeated consonants at the end of words """ from pythainlp.corpus import thai_words from pythainlp.util.trie import Trie From abd47025d7ff4ecc7bdc40df16cd5bea6075e458 Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:55:04 +0900 Subject: [PATCH 34/36] Refac: fix typi --- pythainlp/util/remove_trailing_repeat_consonants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/util/remove_trailing_repeat_consonants.py b/pythainlp/util/remove_trailing_repeat_consonants.py index 51e269575..7aae7e519 100644 --- a/pythainlp/util/remove_trailing_repeat_consonants.py +++ b/pythainlp/util/remove_trailing_repeat_consonants.py @@ -24,7 +24,7 @@ # contains all words that has repeating consonants at the end # for each consonant # when dictionary updated, this should be updated too -# key: consonan +# key: consonant # value: list of words that has repeating consonants at the end last_consonants_repeaters = {} From 740c5e5e4b46176819049745f8529cf413e28e9b Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:58:16 +0900 Subject: [PATCH 35/36] Add: __all__ --- pythainlp/util/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index f6f5e373c..ddcb9b62b 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -69,6 +69,7 @@ "remove_tone_ipa", "tis620_to_utf8", "spell_words", + "remove_trailing_repeat_consonants", ] from pythainlp.util.collate import collate From 3315cb026fc2e2b8b4518c3ae3a9b04c50738b10 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 13 Nov 2023 07:49:54 +0000 Subject: [PATCH 36/36] Sort export names in __all__ --- pythainlp/util/__init__.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index ddcb9b62b..55302507b 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -21,19 +21,21 @@ "abbreviation_to_full_text", "arabic_digit_to_thai_digit", "bahttext", - "convert_years", "collate", - "countthai", + "convert_years", "count_thai_chars", + "countthai", "dict_trie", "digit_to_text", "display_thai_char", "emoji_to_thai", "eng_to_thai", "find_keyword", + "ipa_to_rtgs", "is_native_thai", "isthai", "isthaichar", + "nectec_to_ipa", "normalize", "now_reign_year", "num_to_thaiword", @@ -42,11 +44,18 @@ "remove_dangling", "remove_dup_spaces", "remove_repeat_vowels", + "remove_tone_ipa", "remove_tonemark", + "remove_trailing_repeat_consonants", "remove_zw", "reorder_vowels", "rhyme", + "sound_syllable", + "spell_words", + "syllable_length", + "syllable_open_close_detector", "text_to_arabic_digit", + "text_to_num", "text_to_thai_digit", "thai_digit_to_arabic_digit", "thai_keyboard_dist", @@ -58,18 +67,9 @@ "thaiword_to_num", "thaiword_to_time", "time_to_thaiword", - "text_to_num", + "tis620_to_utf8", "tone_detector", "words_to_num", - "sound_syllable", - "syllable_length", - "syllable_open_close_detector", - "nectec_to_ipa", - "ipa_to_rtgs", - "remove_tone_ipa", - "tis620_to_utf8", - "spell_words", - "remove_trailing_repeat_consonants", ] from pythainlp.util.collate import collate