Skip to content

Commit 5aa7f07

Browse files
committed
Add custom argument parser for cell magic
1 parent cc274f7 commit 5aa7f07

File tree

4 files changed

+706
-0
lines changed

4 files changed

+706
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Public interface of the line argument parser subpackage: the lexer and
# parser for the cell magic's argument line, plus the AST visitor helpers.
from google.cloud.bigquery.ipython_magics.line_arg_parser.lexer import Lexer
from google.cloud.bigquery.ipython_magics.line_arg_parser.parser import Parser
from google.cloud.bigquery.ipython_magics.line_arg_parser import (
    visitors,
) # TODO: import all


__all__ = ("Lexer", "Parser", "visitors")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from collections import namedtuple
16+
from collections import OrderedDict
17+
import itertools
18+
import re
19+
20+
import enum
21+
22+
# A lexical token: the token type's name, the exact matched text, and the
# position of the match in the input line.
Token = namedtuple("Token", ["type_", "lexeme", "pos"])

# A signal yielded by a state handler instead of a token: switch the lexer to
# ``new_state`` and resume scanning the input at character ``total_offset``.
StateTransition = namedtuple("StateTransition", ["new_state", "total_offset"])
# Token patterns recognized in each lexer state. Ordering matters: the named
# groups of one state are OR-ed together into a single "grand" pattern, and
# earlier alternatives win, so e.g. the GOTO_* lookaheads must be tried first.
token_types = OrderedDict(
    state_1=OrderedDict(
        GOTO_STATE_2=r"(?P<GOTO_STATE_2>(?=--))",  # double dash - starting the options list
        DEST_VAR=r"(?P<DEST_VAR>[^\d\W]\w*)",  # essentially a Python ID
    ),
    state_2=OrderedDict(
        GOTO_STATE_3=r"(?P<GOTO_STATE_3>(?=--params(?=\s|$)))",  # the --params option
        OPTION_SPEC=r"(?P<OPTION_SPEC>--\w+)",
        # NOTE: currently the only valid value for a non "--params" option is project ID
        OPT_VAL=r"(?P<OPT_VAL>[^_\d\W](?:\w|\.)+)",
    ),
    state_3=OrderedDict(
        # FIX: the escape alternative must be a backslash followed by any
        # character (``\\.``), not a literal dot (``\.``) -- with the literal
        # dot, strings containing escaped quotes (e.g. "he said \"hi\"")
        # were not matched at all.
        PY_STRING=r"(?P<PY_STRING>(?:{})|(?:{}))".format(
            r"'(?:[^'\\]|\\.)*'", r'"(?:[^"\\]|\\.)*"'  # single and double quoted strings
        ),
        PARAMS_OPT_SPEC=r"(?P<PARAMS_OPT_SPEC>--params(?=\s|$))",
        GOTO_STATE_2=r"(?P<GOTO_STATE_2>(?=--\w+))",  # found another option spec
        PY_BOOL=r"(?P<PY_BOOL>True|False)",
        DOLLAR_PY_ID=r"(?P<DOLLAR_PY_ID>\$[^\d\W]\w*)",
        PY_ID=r"(?P<PY_ID>[^\d\W]\w*)",
        # TODO: supporting only ints or floats, add floats in scientific notation, too?
        # NOTE(review): this pattern does not match "0" or numbers with a
        # leading zero such as "0.5" (they fall through to UNKNOWN) --
        # confirm whether that is intentional.
        PY_NUMBER=r"(?P<PY_NUMBER>-?[1-9]\d*(?:\.\d+)?)",
        SQUOTE=r"(?P<SQUOTE>')",
        DQUOTE=r'(?P<DQUOTE>")',
        COLON=r"(?P<COLON>:)",
        COMMA=r"(?P<COMMA>,)",
        LCURL=r"(?P<LCURL>\{)",
        RCURL=r"(?P<RCURL>})",
        LSQUARE=r"(?P<LSQUARE>\[)",
        RSQUARE=r"(?P<RSQUARE>])",
        LPAREN=r"(?P<LPAREN>\()",
        RPAREN=r"(?P<RPAREN>\))",
    ),
    # Patterns shared by all states; appended after each state's own patterns.
    common=OrderedDict(
        WS=r"(?P<WS>\s+)",
        EOL=r"(?P<EOL>$)",
        UNKNOWN=r"(?P<UNKNOWN>\S+)",  # anything not a whitespace or matched by something else
    )
)
66+
67+
68+
class AutoStrEnum(str, enum.Enum):
    """Base Enum class whose ``enum.auto()`` members get their own name as value."""

    def _generate_next_value_(name, start, count, last_values):
        # Hook invoked by the enum machinery for each auto() member (and by
        # the functional API when members are given as a list of names);
        # returning ``name`` makes every member's value equal its own name.
        # NOTE: the first parameter is the member name, not ``self`` -- this
        # is the exact signature the enum protocol expects.
        return name
71+
72+
# Enum of all concrete token types across every lexer state. The GOTO_STATE_*
# names are excluded -- those are internal zero-width markers used only to
# signal lexer state transitions, never emitted as tokens.
TokenType = AutoStrEnum(
    "TokenType",
    [
        token_name
        for state_tokens in token_types.values()
        for token_name in state_tokens
        if not token_name.startswith("GOTO_STATE")
    ],
)
80+
81+
class LexerState(AutoStrEnum):
    """The states of the lexer's hand-rolled state machine.

    Values are spelled out explicitly, but they are identical to what
    ``enum.auto()`` would produce via ``AutoStrEnum`` (value == member name).
    """

    STATE_1 = "STATE_1"  # parsing positional arguments
    STATE_2 = "STATE_2"  # parsing options other than "--params"
    STATE_3 = "STATE_3"  # parsing the "--params" option
    STATE_END = "STATE_END"  # all input consumed, stop iterating
87+
88+
class Lexer(object):
    """Lexical analyzer for tokenizing the cell magic input line.

    Iterating over an instance yields ``Token`` instances. Since
    ``re``'s scanner does not support manipulating its internal state, lexer
    state transitions are implemented manually: each state's grand pattern
    contains special zero-width lookahead groups (``GOTO_STATE_*``) that
    signal when a different pattern set must take over, and scanning is then
    restarted at the same input offset with the new state's pattern.
    """

    # One "grand" pattern per lexer state: the union of that state's token
    # patterns followed by the patterns common to all states (order matters,
    # since earlier alternatives in the union win).
    _GRAND_PATTERNS = {
        state: re.compile(
            "|".join(
                itertools.chain(
                    token_types[state_key].values(), token_types["common"].values(),
                )
            )
        )
        for state, state_key in (
            (LexerState.STATE_1, "state_1"),
            (LexerState.STATE_2, "state_2"),
            (LexerState.STATE_3, "state_3"),
        )
    }

    # For each state: the lookahead token type that signals leaving the state,
    # and the state to transition into when that token is matched. This table
    # replaces three near-identical per-state handler methods.
    _STATE_TRANSITIONS = {
        LexerState.STATE_1: ("GOTO_STATE_2", LexerState.STATE_2),
        LexerState.STATE_2: ("GOTO_STATE_3", LexerState.STATE_3),
        LexerState.STATE_3: ("GOTO_STATE_2", LexerState.STATE_2),
    }

    def __init__(self, input_text):
        """
        Args:
            input_text (str): The cell magic argument line to tokenize.
        """
        self._text = input_text

    def __iter__(self):
        """Yield the tokens of the input line, ending with an EOL token.

        Whitespace tokens are filtered out. When a state-transition marker is
        encountered, scanning restarts with the new state's pattern at the
        offset where the marker matched.
        """
        state = LexerState.STATE_1
        offset = 0  # the number of characters processed so far

        while state != LexerState.STATE_END:
            token_generator = self._get_state_token_generator(state, offset)

            for maybe_token in token_generator:
                if isinstance(maybe_token, StateTransition):
                    state = maybe_token.new_state
                    offset = maybe_token.total_offset
                    break

                if maybe_token.type_ != TokenType.WS:
                    yield maybe_token

                if maybe_token.type_ == TokenType.EOL:
                    state = LexerState.STATE_END
                    break

    def _get_state_token_generator(self, state, current_offset):
        """Create the token generator for ``state``.

        A scanner over the input text is built from the state's grand pattern,
        positioned at ``current_offset``, and handed to the shared scanning
        helper together with the state's transition info.
        """
        goto_token, new_state = self._STATE_TRANSITIONS[state]
        scanner = self._GRAND_PATTERNS[state].scanner(self._text, pos=current_offset)
        return self._scan(scanner, goto_token, new_state)

    def _scan(self, scanner, goto_token, new_state):
        """Yield tokens from ``scanner``; on the ``goto_token`` lookahead,
        first yield a ``StateTransition`` to ``new_state`` (the consumer
        abandons the generator at that point, so the zero-width goto match
        itself is never emitted as a consumed token).
        """
        for match in iter(scanner.match, None):
            token_type = match.lastgroup

            if token_type == goto_token:
                yield StateTransition(
                    new_state=new_state, total_offset=match.start(),
                )

            yield Token(token_type, match.group(), match.start())

0 commit comments

Comments
 (0)