diff --git a/pythainlp/summarize/__init__.py b/pythainlp/summarize/__init__.py
index 0c413bd7b..f21fb84af 100644
--- a/pythainlp/summarize/__init__.py
+++ b/pythainlp/summarize/__init__.py
@@ -1,51 +1,68 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,unicode_literals
-from pythainlp.corpus import stopwords
-from string import punctuation
+
+from __future__ import absolute_import, unicode_literals
+
 from collections import defaultdict
-from pythainlp.tokenize import sent_tokenize, word_tokenize
 from heapq import nlargest
+from string import punctuation
+
+from pythainlp.corpus import stopwords
+from pythainlp.tokenize import sent_tokenize, word_tokenize
+
+
 class FrequencySummarizer:
     def __init__(self, min_cut=0.1, max_cut=0.9):
-        self._min_cut = min_cut
-        self._max_cut = max_cut
-        self._stopwords = set(stopwords.words('thai') + list(punctuation))
-
-    def _compute_frequencies(self, word_sent):
-        freq = defaultdict(int)
-        for s in word_sent:
-            for word in s:
-                if word not in self._stopwords:
-                    freq[word] += 1
-        m = float(max(freq.values()))
-        for w in list(freq):
-            freq[w] = freq[w]/m
-            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
-                del freq[w]
-        return freq
-
-    def _rank(self, ranking, n):
+        self.__min_cut = min_cut
+        self.__max_cut = max_cut
+        self.__stopwords = set(stopwords.words("thai") + list(punctuation))
+
+    def __compute_frequencies(self, word_tokenized_sents):
+        word_freqs = defaultdict(int)
+        for sent in word_tokenized_sents:
+            for word in sent:
+                if word not in self.__stopwords:
+                    word_freqs[word] += 1
+
+        max_freq = float(max(word_freqs.values()))
+        for w in list(word_freqs):
+            word_freqs[w] = word_freqs[w] / max_freq
+            if word_freqs[w] >= self.__max_cut or word_freqs[w] <= self.__min_cut:
+                del word_freqs[w]
+
+        return word_freqs
+
+    def __rank(self, ranking, n):
         return nlargest(n, ranking, key=ranking.get)
 
-    def summarize(self, text, n,tokenize):
+    def summarize(self, text, n, tokenizer):
         sents = sent_tokenize(text)
-        word_sent = [word_tokenize(s,tokenize) for s in sents]
-        self._freq = self._compute_frequencies(word_sent)
+        word_tokenized_sents = [word_tokenize(sent, tokenizer) for sent in sents]
+        self.__freq = self.__compute_frequencies(word_tokenized_sents)
         ranking = defaultdict(int)
-        for i, sent in enumerate(word_sent):
+
+        for i, sent in enumerate(word_tokenized_sents):
             for w in sent:
-                if w in self._freq:
-                    ranking[i] += self._freq[w]
-        sents_idx = self._rank(ranking,n)
-        return [sents[j] for j in sents_idx]
-def summarize_text(text,n,engine='frequency',tokenize='newmm'):
-    '''
-    Thai text summarize.
-    :param str text: thai text
-    :param int n: sent number
-    :param str engine: Thai text summarize engine.
-    :param str tokenize: thai word tokenize.
-    '''
-    if engine=='frequency':
-        data=FrequencySummarizer().summarize(text,n,tokenize)
-        return data
+                if w in self.__freq:
+                    ranking[i] += self.__freq[w]
+        summaries_idx = self.__rank(ranking, n)
+
+        return [sents[j] for j in summaries_idx]
+
+
+def summarize_text(text, n, engine="frequency", tokenizer="newmm"):
+    """
+    Thai text summarization
+    :param str text: text to be summarized
+    :param int n: number of sentences to be included in the summary
+    :param str engine: text summarization engine
+    :param str tokenizer: word tokenizer
+    :return List[str] summary: list of selected sentences
+    """
+    sents = []
+
+    if engine == "frequency":
+        sents = FrequencySummarizer().summarize(text, n, tokenizer)
+    else:  # if engine not found, return first n sentences
+        sents = sent_tokenize(text)[:n]
+
+    return sents
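
For reference, a minimal usage sketch of the refactored summarize_text entry point. It only relies on the signature and fallback behavior shown in this diff; the sample Thai text is an illustrative assumption, not part of the change.

    # Usage sketch, assuming the module layout in this diff
    # (pythainlp.summarize.summarize_text). The sample text below
    # is an assumption for demonstration only.
    from pythainlp.summarize import summarize_text

    text = (
        "อาหารไทยมีชื่อเสียงไปทั่วโลก "
        "ต้มยำกุ้งเป็นเมนูที่ได้รับความนิยมมาก "
        "นักท่องเที่ยวจำนวนมากเดินทางมาเพื่อลิ้มลองอาหารไทย"
    )

    # Select the 2 highest-scoring sentences with the frequency engine
    # and the default "newmm" word tokenizer.
    summary = summarize_text(text, n=2, engine="frequency", tokenizer="newmm")
    print(summary)  # list of up to 2 selected sentences

    # With an unrecognized engine, the function now falls back to the
    # first n sentences instead of implicitly returning None.
    fallback = summarize_text(text, n=1, engine="no-such-engine")

The fallback branch is the main behavioral change for callers: the old code returned None for any engine other than "frequency", while the new code always returns a list of sentences.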