Skip to content

Commit f00b1fe

Browse files
committed
Initial commit
0 parents  commit f00b1fe

File tree

5 files changed

+1226
-0
lines changed

5 files changed

+1226
-0
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
node_modules

analyzer.py

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import nltk
2+
import json
3+
import fileinput
4+
5+
# Punkt sentence tokenizer model. It is not referenced again in the visible
# part of this script; the module-level name is kept in case other code uses it.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

# Read every input line (files named on the command line, or stdin when none
# are given) and join once instead of growing a string with repeated `+=`.
text = ''.join(fileinput.input())

# Each step rebinds `sentences`: raw sentence strings -> token lists ->
# lists of (token, part-of-speech tag) pairs.
sentences = nltk.sent_tokenize(text)
sentences = [nltk.word_tokenize(sent) for sent in sentences]
sentences = [nltk.pos_tag(sent) for sent in sentences]

# NLTK 3 renamed `batch_ne_chunk` to `ne_chunk_sents`; use whichever this
# installation provides so the script runs on both old and new releases.
_ne_chunk_sents = getattr(nltk, 'ne_chunk_sents', None) or nltk.batch_ne_chunk

# binary=True asks the chunker for a single generic entity label ('NE')
# rather than per-type labels.
chunked_sentences = _ne_chunk_sents(sentences, binary=True)
16+
17+
def extract_entity_names(t):
    """Return the named-entity strings found in a chunked parse tree.

    Walks ``t`` depth-first. Every subtree labelled ``'NE'`` contributes one
    string: the first element of each of its children joined by single
    spaces. Any other labelled subtree is searched recursively.

    Args:
        t: A tree node or a leaf. A tree node is an iterable of children
            that exposes its label either through a ``label()`` method
            (NLTK 3 style) or a ``node`` attribute (NLTK 2 style). Anything
            without a truthy label is treated as a leaf.

    Returns:
        A list of entity name strings in traversal order; empty for leaves
        and for trees that contain no ``'NE'`` subtree.
    """
    # Prefer the method-based accessor; fall back to the legacy attribute so
    # trees from older NLTK releases keep working.
    label_getter = getattr(t, 'label', None)
    if callable(label_getter):
        label = label_getter()
    else:
        label = getattr(t, 'node', None)

    # Leaves (e.g. (token, tag) tuples) carry no label and hold no entities.
    if not label:
        return []

    if label == 'NE':
        # Children of an NE subtree are (token, tag) pairs; keep the tokens.
        return [' '.join(child[0] for child in t)]

    entity_names = []
    for child in t:
        entity_names.extend(extract_entity_names(child))
    return entity_names
28+
29+
# Collect the entity names found in every chunked sentence.
entity_names = []
for tree in chunked_sentences:
    entity_names.extend(extract_entity_names(tree))

# Single-argument print(...) produces the same output under Python 2 and also
# parses under Python 3, unlike the bare `print x` statement.
# First the de-duplicated entity names, then the POS-tagged sentences as JSON.
print(set(entity_names))
print(json.dumps(sentences))

0 commit comments

Comments
 (0)