This repository was archived by the owner on Mar 13, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparser.py
97 lines (77 loc) · 2.64 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import nltk
import sys
import re
TERMINALS = """
Adj -> "country" | "dreadful" | "enigmatical" | "little" | "moist" | "red"
Adv -> "down" | "here" | "never"
Conj -> "and"
Det -> "a" | "an" | "his" | "my" | "the"
N -> "armchair" | "companion" | "day" | "door" | "hand" | "he" | "himself"
N -> "holmes" | "home" | "i" | "mess" | "paint" | "palm" | "pipe" | "she"
N -> "smile" | "thursday" | "walk" | "we" | "word"
P -> "at" | "before" | "in" | "of" | "on" | "to" | "until"
V -> "arrived" | "came" | "chuckled" | "had" | "lit" | "said" | "sat"
V -> "smiled" | "tell" | "were"
"""
NONTERMINALS = """
S -> NP VP | S Conj S | NP VP Conj VP
AP -> Adj | Adj AP
NP -> N | Det NP | AP NP | NP PP
PP -> P NP | P S
VP -> V | V NP | V NP PP | V PP | VP Adv | Adv VP
"""
grammar = nltk.CFG.fromstring(NONTERMINALS + TERMINALS)
parser = nltk.ChartParser(grammar)
def main():
# If filename specified, read sentence from file
if len(sys.argv) == 2:
with open(sys.argv[1]) as f:
s = f.read()
# Otherwise, get sentence as input
else:
s = input("Sentence: ")
# Convert input into list of words
s = preprocess(s)
# Attempt to parse sentence
try:
trees = list(parser.parse(s))
except ValueError as e:
print(e)
return
if not trees:
print("Could not parse sentence.")
return
# Print each tree with noun phrase chunks
for tree in trees:
tree.pretty_print()
print("Noun Phrase Chunks")
for np in np_chunk(tree):
print(" ".join(np.flatten()))
def preprocess(sentence):
"""
Convert `sentence` to a list of its words.
Pre-process sentence by converting all characters to lowercase
and removing any word that does not contain at least one alphabetic
character.
"""
sentence = sentence.lower()
words = nltk.word_tokenize(sentence)
return [word for word in words if re.match('[a-z]', word)]
def np_chunk(tree):
"""
Return a list of all noun phrase chunks in the sentence tree.
A noun phrase chunk is defined as any subtree of the sentence
whose label is "NP" that does not itself contain any other
noun phrases as subtrees.
"""
return [subtree for subtree in tree.subtrees(check_np_chunk)]
def check_np_chunk(tree):
"""
Returns true if given tree is a NP chunk.
A noun phrase chunk is defined as any subtree of the sentence
whose label is "NP" that does not itself contain any other
noun phrases as subtrees.
"""
return True if tree.label() == 'NP' and not list(tree.subtrees(lambda t: t.label() == 'NP' and t != tree)) else False
if __name__ == "__main__":
main()