File tree 5 files changed +1226
-0
lines changed
5 files changed +1226
-0
lines changed Original file line number Diff line number Diff line change
1
+ node_modules
Original file line number Diff line number Diff line change
1
+ import nltk
2
+ import json
3
+ import fileinput
4
+
5
# Load the English Punkt sentence-tokenizer model.
# NOTE(review): `sent_detector` is not referenced again in the visible code;
# it is kept because dropping it would change the module's top-level behaviour.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

# Read the entire input in one pass. fileinput.input() yields the lines of the
# files named on the command line, or of stdin when none are given; joining
# them once avoids rebuilding the string on every line.
text = ''.join(fileinput.input())

# Sentence-split, word-tokenize, then POS-tag. `sentences` ends up holding one
# tagged-token list per sentence and is serialized to JSON later in the script.
sentences = [
    nltk.pos_tag(nltk.word_tokenize(sent))
    for sent in nltk.sent_tokenize(text)
]

# NLTK 3 renamed batch_ne_chunk to ne_chunk_sents; use whichever this
# installation provides so the script runs on both major versions.
_ne_chunk_sents = getattr(nltk, 'ne_chunk_sents', None) or nltk.batch_ne_chunk

# binary=True asks the chunker to mark entities with the single label 'NE'
# rather than per-type labels, which is what extract_entity_names looks for.
chunked_sentences = _ne_chunk_sents(sentences, binary=True)
16
+
17
def extract_entity_names(t):
    """Collect the named-entity strings found in a chunked parse tree.

    Walks ``t`` depth-first. A subtree labelled ``'NE'`` contributes one
    string: the first item of each of its children, joined by single spaces.
    Any other labelled subtree is searched recursively. An object with no
    label, or an empty one (for example a leaf token), contributes nothing.

    The label is read through ``label()`` when the object provides that
    method (the NLTK 3 tree API) and through the legacy ``node`` attribute
    otherwise (the NLTK 2 tree API), so trees from either version work.

    Args:
        t: A tree node, or a leaf of such a tree.

    Returns:
        A list of entity-name strings in traversal order; duplicates are kept.
    """
    # Prefer label(): probing the old `node` attribute on a newer tree is not
    # safe, so it is only consulted when label() is absent.
    label_getter = getattr(t, 'label', None)
    if callable(label_getter):
        label = label_getter()
    else:
        label = getattr(t, 'node', None)

    # Leaves and unlabelled nodes hold no entities.
    if not label:
        return []

    # An 'NE' subtree is itself one entity: join the words of its children.
    if label == 'NE':
        return [' '.join(child[0] for child in t)]

    # Any other labelled subtree: gather entities from each child in order.
    entity_names = []
    for child in t:
        entity_names.extend(extract_entity_names(child))
    return entity_names
28
+
29
# Gather the entity names from every chunked sentence, in sentence order.
entity_names = [
    name
    for tree in chunked_sentences
    for name in extract_entity_names(tree)
]

# Output: first the de-duplicated set of entity names, then the POS-tagged
# sentences as JSON. A parenthesised single argument is valid both as a
# Python 2 print statement and as a Python 3 print() call.
print(set(entity_names))
print(json.dumps(sentences))
You can’t perform that action at this time.
0 commit comments