scraper.py
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


class ScrapingResult:
    """Holds a scraped URL together with its generated summary."""

    def __init__(self):
        self.url = None
        self.summary = None


LANGUAGE = "english"
SENTENCES_COUNT = 2


class Scraper:
    def scrape(self, url):
        complete_url = url
        try:
            # Get a summary of the page using sumy's LexRank summarizer.
            print("Retrieving page summary of %s... " % url)
            parser = HtmlParser.from_url(complete_url, Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)
            # Join the top-ranked sentences with a space so they read as prose.
            url_summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT))
        except Exception as e:
            url_summary = "Could not scrape summary. Reason: %s" % e
        print("Done: %s = %s" % (url, url_summary))
        # Create the scraping result.
        scraping_result = ScrapingResult()
        scraping_result.summary = url_summary
        scraping_result.url = url
        return scraping_result
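

# Usage sketch: a minimal example of calling Scraper directly when this file
# is run as a script. The URL below is an arbitrary placeholder for
# illustration, not one taken from this project.
if __name__ == "__main__":
    scraper = Scraper()
    result = scraper.scrape("https://en.wikipedia.org/wiki/Web_scraping")
    print("URL:     %s" % result.url)
    print("Summary: %s" % result.summary)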