From 04b22d88ac631a41a067a83c9e8b8b7991aa7e3a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 20 Oct 2018 09:33:28 +0700 Subject: [PATCH 1/2] Small variable rename and handle engine not found case - add __ prefix for private class members - summarize_text() will always return something (if summarization engine not found, return first n sentences) --- pythainlp/summarize/__init__.py | 77 ++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/pythainlp/summarize/__init__.py b/pythainlp/summarize/__init__.py index 0c413bd7b..20d5ce72f 100644 --- a/pythainlp/summarize/__init__.py +++ b/pythainlp/summarize/__init__.py @@ -1,51 +1,68 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals -from pythainlp.corpus import stopwords -from string import punctuation + +from __future__ import absolute_import, unicode_literals + from collections import defaultdict -from pythainlp.tokenize import sent_tokenize, word_tokenize from heapq import nlargest +from string import punctuation + +from pythainlp.corpus import stopwords +from pythainlp.tokenize import sent_tokenize, word_tokenize + + class FrequencySummarizer: def __init__(self, min_cut=0.1, max_cut=0.9): - self._min_cut = min_cut - self._max_cut = max_cut - self._stopwords = set(stopwords.words('thai') + list(punctuation)) + self.__min_cut = min_cut + self.__max_cut = max_cut + self.__stopwords = set(stopwords.words("thai") + list(punctuation)) - def _compute_frequencies(self, word_sent): + def __compute_frequencies(self, word_sent): freq = defaultdict(int) for s in word_sent: for word in s: - if word not in self._stopwords: + if word not in self.__stopwords: freq[word] += 1 + m = float(max(freq.values())) for w in list(freq): - freq[w] = freq[w]/m - if freq[w] >= self._max_cut or freq[w] <= self._min_cut: + freq[w] = freq[w] / m + if freq[w] >= self.__max_cut or freq[w] <= self.__min_cut: del freq[w] + return freq - def _rank(self, ranking, n): + def __rank(self, ranking, n): return nlargest(n, ranking, key=ranking.get) - def summarize(self, text, n,tokenize): + def summarize(self, text, n, tokenizer): sents = sent_tokenize(text) - word_sent = [word_tokenize(s,tokenize) for s in sents] - self._freq = self._compute_frequencies(word_sent) + word_sent = [word_tokenize(s, tokenizer) for s in sents] + self.__freq = self.__compute_frequencies(word_sent) ranking = defaultdict(int) + for i, sent in enumerate(word_sent): for w in sent: - if w in self._freq: - ranking[i] += self._freq[w] - sents_idx = self._rank(ranking,n) - return [sents[j] for j in sents_idx] -def summarize_text(text,n,engine='frequency',tokenize='newmm'): - ''' - Thai text summarize. - :param str text: thai text - :param int n: sent number - :param str engine: Thai text summarize engine. - :param str tokenize: thai word tokenize. - ''' - if engine=='frequency': - data=FrequencySummarizer().summarize(text,n,tokenize) - return data + if w in self.__freq: + ranking[i] += self.__freq[w] + summaries_idx = self.__rank(ranking, n) + + return [sents[j] for j in summaries_idx] + + +def summarize_text(text, n, engine="frequency", tokenizer="newmm"): + """ + Thai text summarization + :param str text: text to be summarized + :param int n: number of sentences to be included in the summary + :param str engine: text summarization engine + :param str tokenizer: word tokenizer + :return List[str] summary: list of selected sentences + """ + sents = [] + + if engine == "frequency": + sents = FrequencySummarizer().summarize(text, n, tokenizer) + else: # if engine not found, return first n sentences + sents = sent_tokenize(text)[:n] + + return sents From c1296d7a957b4c4d62543ac74883b67d9fb34f9f Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 20 Oct 2018 09:57:10 +0700 Subject: [PATCH 2/2] more comprehensible variable names --- pythainlp/summarize/__init__.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pythainlp/summarize/__init__.py b/pythainlp/summarize/__init__.py index 20d5ce72f..f21fb84af 100644 --- a/pythainlp/summarize/__init__.py +++ b/pythainlp/summarize/__init__.py @@ -16,31 +16,31 @@ def __init__(self, min_cut=0.1, max_cut=0.9): self.__max_cut = max_cut self.__stopwords = set(stopwords.words("thai") + list(punctuation)) - def __compute_frequencies(self, word_sent): - freq = defaultdict(int) - for s in word_sent: - for word in s: + def __compute_frequencies(self, word_tokenized_sents): + word_freqs = defaultdict(int) + for sent in word_tokenized_sents: + for word in sent: if word not in self.__stopwords: - freq[word] += 1 + word_freqs[word] += 1 - m = float(max(freq.values())) - for w in list(freq): - freq[w] = freq[w] / m - if freq[w] >= self.__max_cut or freq[w] <= self.__min_cut: - del freq[w] + max_freq = float(max(word_freqs.values())) + for w in list(word_freqs): + word_freqs[w] = word_freqs[w] / max_freq + if word_freqs[w] >= self.__max_cut or word_freqs[w] <= self.__min_cut: + del word_freqs[w] - return freq + return word_freqs def __rank(self, ranking, n): return nlargest(n, ranking, key=ranking.get) def summarize(self, text, n, tokenizer): sents = sent_tokenize(text) - word_sent = [word_tokenize(s, tokenizer) for s in sents] - self.__freq = self.__compute_frequencies(word_sent) + word_tokenized_sents = [word_tokenize(sent, tokenizer) for sent in sents] + self.__freq = self.__compute_frequencies(word_tokenized_sents) ranking = defaultdict(int) - for i, sent in enumerate(word_sent): + for i, sent in enumerate(word_tokenized_sents): for w in sent: if w in self.__freq: ranking[i] += self.__freq[w]