From 04b22d88ac631a41a067a83c9e8b8b7991aa7e3a Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sat, 20 Oct 2018 09:33:28 +0700
Subject: [PATCH 1/2] Rename variables and handle the engine-not-found case:
 add __ prefix to private class members; summarize_text() now always returns
 a result (the first n sentences if the summarization engine is not found)

---
 pythainlp/summarize/__init__.py | 77 ++++++++++++++++++++-------------
 1 file changed, 47 insertions(+), 30 deletions(-)

diff --git a/pythainlp/summarize/__init__.py b/pythainlp/summarize/__init__.py
index 0c413bd7b..20d5ce72f 100644
--- a/pythainlp/summarize/__init__.py
+++ b/pythainlp/summarize/__init__.py
@@ -1,51 +1,68 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,unicode_literals
-from pythainlp.corpus import stopwords
-from string import punctuation
+
+from __future__ import absolute_import, unicode_literals
+
 from collections import defaultdict
-from pythainlp.tokenize import sent_tokenize, word_tokenize
 from heapq import nlargest
+from string import punctuation
+
+from pythainlp.corpus import stopwords
+from pythainlp.tokenize import sent_tokenize, word_tokenize
+
+
 class FrequencySummarizer:
     def __init__(self, min_cut=0.1, max_cut=0.9):
-        self._min_cut = min_cut
-        self._max_cut = max_cut
-        self._stopwords = set(stopwords.words('thai') + list(punctuation))
+        self.__min_cut = min_cut
+        self.__max_cut = max_cut
+        self.__stopwords = set(stopwords.words("thai") + list(punctuation))
 
-    def _compute_frequencies(self, word_sent):
+    def __compute_frequencies(self, word_sent):
         freq = defaultdict(int)
         for s in word_sent:
             for word in s:
-                if word not in self._stopwords:
+                if word not in self.__stopwords:
                     freq[word] += 1
+
         m = float(max(freq.values()))
         for w in list(freq):
-            freq[w] = freq[w]/m
-            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
+            freq[w] = freq[w] / m
+            if freq[w] >= self.__max_cut or freq[w] <= self.__min_cut:
                 del freq[w]
+
         return freq
 
-    def _rank(self, ranking, n):
+    def __rank(self, ranking, n):
         return nlargest(n, ranking, key=ranking.get)
 
-    def summarize(self, text, n,tokenize):
+    def summarize(self, text, n, tokenizer):
         sents = sent_tokenize(text)
-        word_sent = [word_tokenize(s,tokenize) for s in sents]
-        self._freq = self._compute_frequencies(word_sent)
+        word_sent = [word_tokenize(s, tokenizer) for s in sents]
+        self.__freq = self.__compute_frequencies(word_sent)
         ranking = defaultdict(int)
+
         for i, sent in enumerate(word_sent):
             for w in sent:
-                if w in self._freq:
-                    ranking[i] += self._freq[w]
-        sents_idx = self._rank(ranking,n)
-        return [sents[j] for j in sents_idx]
-def summarize_text(text,n,engine='frequency',tokenize='newmm'):
-    '''
-    Thai text summarize.
-    :param str text: thai text
-    :param int n: sent number
-    :param str engine: Thai text summarize engine.
-    :param str tokenize: thai word tokenize.
-    '''
-    if engine=='frequency':
-        data=FrequencySummarizer().summarize(text,n,tokenize)
-    return data
+                if w in self.__freq:
+                    ranking[i] += self.__freq[w]
+        summaries_idx = self.__rank(ranking, n)
+
+        return [sents[j] for j in summaries_idx]
+
+
+def summarize_text(text, n, engine="frequency", tokenizer="newmm"):
+    """
+    Thai text summarization
+    :param str text: text to be summarized
+    :param int n: number of sentences to be included in the summary
+    :param str engine: text summarization engine
+    :param str tokenizer: word tokenizer
+    :return List[str] summary: list of selected sentences
+    """
+    sents = []
+
+    if engine == "frequency":
+        sents = FrequencySummarizer().summarize(text, n, tokenizer)
+    else:  # if engine not found, return first n sentences
+        sents = sent_tokenize(text)[:n]
+
+    return sents

From c1296d7a957b4c4d62543ac74883b67d9fb34f9f Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sat, 20 Oct 2018 09:57:10 +0700
Subject: [PATCH 2/2] more comprehensible variable names

---
 pythainlp/summarize/__init__.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/pythainlp/summarize/__init__.py b/pythainlp/summarize/__init__.py
index 20d5ce72f..f21fb84af 100644
--- a/pythainlp/summarize/__init__.py
+++ b/pythainlp/summarize/__init__.py
@@ -16,31 +16,31 @@ def __init__(self, min_cut=0.1, max_cut=0.9):
         self.__max_cut = max_cut
         self.__stopwords = set(stopwords.words("thai") + list(punctuation))
 
-    def __compute_frequencies(self, word_sent):
-        freq = defaultdict(int)
-        for s in word_sent:
-            for word in s:
+    def __compute_frequencies(self, word_tokenized_sents):
+        word_freqs = defaultdict(int)
+        for sent in word_tokenized_sents:
+            for word in sent:
                 if word not in self.__stopwords:
-                    freq[word] += 1
+                    word_freqs[word] += 1
 
-        m = float(max(freq.values()))
-        for w in list(freq):
-            freq[w] = freq[w] / m
-            if freq[w] >= self.__max_cut or freq[w] <= self.__min_cut:
-                del freq[w]
+        max_freq = float(max(word_freqs.values()))
+        for w in list(word_freqs):
+            word_freqs[w] = word_freqs[w] / max_freq
+            if word_freqs[w] >= self.__max_cut or word_freqs[w] <= self.__min_cut:
+                del word_freqs[w]
 
-        return freq
+        return word_freqs
 
     def __rank(self, ranking, n):
         return nlargest(n, ranking, key=ranking.get)
 
     def summarize(self, text, n, tokenizer):
         sents = sent_tokenize(text)
-        word_sent = [word_tokenize(s, tokenizer) for s in sents]
-        self.__freq = self.__compute_frequencies(word_sent)
+        word_tokenized_sents = [word_tokenize(sent, tokenizer) for sent in sents]
+        self.__freq = self.__compute_frequencies(word_tokenized_sents)
         ranking = defaultdict(int)
 
-        for i, sent in enumerate(word_sent):
+        for i, sent in enumerate(word_tokenized_sents):
             for w in sent:
                 if w in self.__freq:
                     ranking[i] += self.__freq[w]