Commit dcfbac2

feat: add custom cell magic parser to handle complex --params values (#213)
* chore: Move cell magic code into its own directory
* Add custom argument parser for cell magic
* Add AST node visitor
* Use a custom parser for cell magic arguments
* Improve cell magic parser test coverage
* Generalize valid option values. The parser should accept as wide a range of values as possible and let the code that deals with the semantics decide whether the values are good or not.
* Fix recognizing the --params option in state 3. The --params option spec must be followed by a non-alphanumeric character, otherwise it's a different option spec (e.g. --paramsX).
* Fix typo in comment
* Cover missing parser code path with a test
* Preserve the cell magic context's import path. The context still needs to be importable from the old path.
* Clarify lexer states
* Replace re.scanner with finditer()
* Fix typo in docstring
* Simplify string literal in a single line. Apparently black just places all implicitly concatenated string literals on a single line when short enough, without replacing them with a single string literal.
* Explain the visitors module
* Pass pos as a positional arg to finditer(). This is necessary to retain Python 2 compatibility.
* Resolve coverage complaint about a code path. The tokens are designed in a way that the scanner *always* returns some match, even if just UNKNOWN or EOL. The "no matches" code path can thus never be taken, but the coverage check can't know that.
1 parent aa1613c commit dcfbac2
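
For context, the new parser targets invocations where the --params value is itself structured Python and can contain spaces, quotes, and nesting. An illustrative cell (the destination variable, parameter names, and query are made up):

    %%bigquery my_df --params {"corpus": "hamlet", "min_count": 10}
    SELECT word, word_count
    FROM `bigquery-public-data.samples.shakespeare`
    WHERE corpus = @corpus AND word_count >= @min_count
    ORDER BY word_count DESC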

14 files changed (+1644, -65 lines)


docs/magics.rst (+1, -1)

@@ -1,5 +1,5 @@
 IPython Magics for BigQuery
 ===========================

-.. automodule:: google.cloud.bigquery.magics
+.. automodule:: google.cloud.bigquery.magics.magics
     :members:

google/cloud/bigquery/__init__.py (+1, -1)

@@ -150,7 +150,7 @@

 def load_ipython_extension(ipython):
     """Called by IPython when this module is loaded as an IPython extension."""
-    from google.cloud.bigquery.magics import _cell_magic
+    from google.cloud.bigquery.magics.magics import _cell_magic

     ipython.register_magic_function(
         _cell_magic, magic_kind="cell", magic_name="bigquery"

google/cloud/bigquery/magics/__init__.py (new file, +20)

@@ -0,0 +1,20 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.cloud.bigquery.magics.magics import context


# For backwards compatibility we need to make the context available in the path
# google.cloud.bigquery.magics.context
__all__ = ("context",)

google/cloud/bigquery/magics/line_arg_parser/__init__.py (new file, +34)

@@ -0,0 +1,34 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.cloud.bigquery.magics.line_arg_parser.exceptions import ParseError
from google.cloud.bigquery.magics.line_arg_parser.exceptions import (
    DuplicateQueryParamsError,
    QueryParamsParseError,
)
from google.cloud.bigquery.magics.line_arg_parser.lexer import Lexer
from google.cloud.bigquery.magics.line_arg_parser.lexer import TokenType
from google.cloud.bigquery.magics.line_arg_parser.parser import Parser
from google.cloud.bigquery.magics.line_arg_parser.visitors import QueryParamsExtractor


__all__ = (
    "DuplicateQueryParamsError",
    "Lexer",
    "Parser",
    "ParseError",
    "QueryParamsExtractor",
    "QueryParamsParseError",
    "TokenType",
)
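
The exports above hint at the intended pipeline: the Lexer tokenizes the magic's argument line, the Parser builds a small AST, and the QueryParamsExtractor visitor pulls the raw --params value out of it. The parser and visitors modules are not part of this excerpt, so the Parser(...).input_line() and QueryParamsExtractor().visit(...) calls in the sketch below are assumptions, shown only to illustrate how the exported names fit together:

    # Illustrative sketch only: Parser and QueryParamsExtractor come from modules
    # not shown in this excerpt, so the exact method names here are assumptions.
    from google.cloud.bigquery.magics import line_arg_parser as lap

    lexer = lap.Lexer("result_df --params {'limit': 10}")
    tree = lap.Parser(lexer).input_line()  # assumed parser entry point
    params_text, remaining_args = lap.QueryParamsExtractor().visit(tree)  # assumed visitor API
    # params_text would hold the raw --params value, e.g. "{'limit': 10}"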

google/cloud/bigquery/magics/line_arg_parser/exceptions.py (new file, +25)

@@ -0,0 +1,25 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class ParseError(Exception):
    pass


class QueryParamsParseError(ParseError):
    """Raised when --params option is syntactically incorrect."""


class DuplicateQueryParamsError(ParseError):
    pass
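
Because both concrete errors derive from ParseError, callers can handle any argument-parsing failure with a single except clause; a minimal sketch:

    # Minimal sketch: QueryParamsParseError and DuplicateQueryParamsError both
    # subclass ParseError, so catching the base class covers every parsing failure.
    from google.cloud.bigquery.magics.line_arg_parser import ParseError, QueryParamsParseError

    try:
        raise QueryParamsParseError("the --params value is not valid Python")
    except ParseError as exc:
        print("cell magic argument error:", exc)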

google/cloud/bigquery/magics/line_arg_parser/lexer.py (new file, +268)

@@ -0,0 +1,268 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple
from collections import OrderedDict
import itertools
import re

import enum


Token = namedtuple("Token", ("type_", "lexeme", "pos"))
StateTransition = namedtuple("StateTransition", ("new_state", "total_offset"))

# Pattern matching is done with regexes, and the order in which the token patterns are
# defined is important.
#
# Suppose we had the following token definitions:
# * INT - a token matching integers,
# * FLOAT - a token matching floating point numbers,
# * DOT - a token matching a single literal dot character, i.e. "."
#
# The FLOAT token would have to be defined first, since we would want the input "1.23"
# to be tokenized as a single FLOAT token, and *not* three tokens (INT, DOT, INT).
#
# Sometimes, however, different tokens match too similar patterns, and it is not
# possible to define them in an order that would avoid any ambiguity. One such case is
# the OPT_VAL and PY_NUMBER token pair, as both can match an integer literal, say "42".
#
# In order to avoid these dilemmas, the lexer implements a concept of STATES. States are
# used to split token definitions into subgroups, and in each lexer state only a single
# subgroup is used for tokenizing the input. Lexer states can therefore be thought of as
# token namespaces.
#
# For example, while parsing the value of the "--params" option, we do not want to
# "recognize" it as a single OPT_VAL token, but instead want to parse it as a Python
# dictionary and verify its syntactical correctness. On the other hand, while parsing
# the value of an option other than "--params", we do not really care about its
# structure, and thus do not want to use any of the "Python tokens" for pattern matching.
#
# Since token definition order is important, an OrderedDict is needed with tightly
# controlled member definitions (i.e. passed as a sequence, and *not* via kwargs).
token_types = OrderedDict(
    [
        (
            "state_parse_pos_args",
            OrderedDict(
                [
                    (
                        "GOTO_PARSE_NON_PARAMS_OPTIONS",
                        r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--))",  # double dash - starting the options list
                    ),
                    (
                        "DEST_VAR",
                        r"(?P<DEST_VAR>[^\d\W]\w*)",  # essentially a Python ID
                    ),
                ]
            ),
        ),
        (
            "state_parse_non_params_options",
            OrderedDict(
                [
                    (
                        "GOTO_PARSE_PARAMS_OPTION",
                        r"(?P<GOTO_PARSE_PARAMS_OPTION>(?=--params(?:\s|=|--|$)))",  # the --params option
                    ),
                    ("OPTION_SPEC", r"(?P<OPTION_SPEC>--\w+)"),
                    ("OPTION_EQ", r"(?P<OPTION_EQ>=)"),
                    ("OPT_VAL", r"(?P<OPT_VAL>\S+?(?=\s|--|$))"),
                ]
            ),
        ),
        (
            "state_parse_params_option",
            OrderedDict(
                [
                    (
                        "PY_STRING",
                        r"(?P<PY_STRING>(?:{})|(?:{}))".format(
                            r"'(?:[^'\\]|\\.)*'",
                            r'"(?:[^"\\]|\\.)*"',  # single and double quoted strings
                        ),
                    ),
                    ("PARAMS_OPT_SPEC", r"(?P<PARAMS_OPT_SPEC>--params(?=\s|=|--|$))"),
                    ("PARAMS_OPT_EQ", r"(?P<PARAMS_OPT_EQ>=)"),
                    (
                        "GOTO_PARSE_NON_PARAMS_OPTIONS",
                        r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--\w+))",  # found another option spec
                    ),
                    ("PY_BOOL", r"(?P<PY_BOOL>True|False)"),
                    ("DOLLAR_PY_ID", r"(?P<DOLLAR_PY_ID>\$[^\d\W]\w*)"),
                    (
                        "PY_NUMBER",
                        r"(?P<PY_NUMBER>-?[1-9]\d*(?:\.\d+)?(:?[e|E][+-]?\d+)?)",
                    ),
                    ("SQUOTE", r"(?P<SQUOTE>')"),
                    ("DQUOTE", r'(?P<DQUOTE>")'),
                    ("COLON", r"(?P<COLON>:)"),
                    ("COMMA", r"(?P<COMMA>,)"),
                    ("LCURL", r"(?P<LCURL>\{)"),
                    ("RCURL", r"(?P<RCURL>})"),
                    ("LSQUARE", r"(?P<LSQUARE>\[)"),
                    ("RSQUARE", r"(?P<RSQUARE>])"),
                    ("LPAREN", r"(?P<LPAREN>\()"),
                    ("RPAREN", r"(?P<RPAREN>\))"),
                ]
            ),
        ),
        (
            "common",
            OrderedDict(
                [
                    ("WS", r"(?P<WS>\s+)"),
                    ("EOL", r"(?P<EOL>$)"),
                    (
                        # anything not a whitespace or matched by something else
                        "UNKNOWN",
                        r"(?P<UNKNOWN>\S+)",
                    ),
                ]
            ),
        ),
    ]
)


# The _generate_next_value_() enum hook is only available in Python 3.6+, thus we
# need to do some acrobatics to implement an "auto str enum" base class. Implementation
# based on the recipe provided by the very author of the Enum library:
# https://stackoverflow.com/a/32313954/5040035
class StrEnumMeta(enum.EnumMeta):
    @classmethod
    def __prepare__(metacls, name, bases, **kwargs):
        # Having deterministic enum members definition order is nice.
        return OrderedDict()

    def __new__(metacls, name, bases, oldclassdict):
        # Scan through the declared enum members and convert any value that is a plain
        # empty tuple into a `str` of the name instead.
        newclassdict = enum._EnumDict()
        for key, val in oldclassdict.items():
            if val == ():
                val = key
            newclassdict[key] = val
        return super(StrEnumMeta, metacls).__new__(metacls, name, bases, newclassdict)


# The @six.add_metaclass decorator does not work, Enum complains about _sunder_ names,
# and we cannot use class syntax directly, because the Python 3 version would cause
# a syntax error under Python 2.
AutoStrEnum = StrEnumMeta(
    "AutoStrEnum",
    (str, enum.Enum),
    {"__doc__": "Base enum class for name=value str enums."},
)

TokenType = AutoStrEnum(
    "TokenType",
    [
        (name, name)
        for name in itertools.chain.from_iterable(token_types.values())
        if not name.startswith("GOTO_")
    ],
)


class LexerState(AutoStrEnum):
    PARSE_POS_ARGS = ()  # parsing positional arguments
    PARSE_NON_PARAMS_OPTIONS = ()  # parsing options other than "--params"
    PARSE_PARAMS_OPTION = ()  # parsing the "--params" option
    STATE_END = ()


class Lexer(object):
    """Lexical analyzer for tokenizing the cell magic input line."""

    _GRAND_PATTERNS = {
        LexerState.PARSE_POS_ARGS: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_pos_args"].values(),
                    token_types["common"].values(),
                )
            )
        ),
        LexerState.PARSE_NON_PARAMS_OPTIONS: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_non_params_options"].values(),
                    token_types["common"].values(),
                )
            )
        ),
        LexerState.PARSE_PARAMS_OPTION: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_params_option"].values(),
                    token_types["common"].values(),
                )
            )
        ),
    }

    def __init__(self, input_text):
        self._text = input_text

    def __iter__(self):
        # Since re.scanner does not seem to support manipulating inner scanner states,
        # we need to implement lexer state transitions manually using special
        # non-capturing lookahead token patterns to signal when a state transition
        # should be made.
        # Since we don't have "nested" states, we don't really need a stack and
        # this simple mechanism is sufficient.
        state = LexerState.PARSE_POS_ARGS
        offset = 0  # the number of characters processed so far

        while state != LexerState.STATE_END:
            token_stream = self._find_state_tokens(state, offset)

            for maybe_token in token_stream:  # pragma: NO COVER
                if isinstance(maybe_token, StateTransition):
                    state = maybe_token.new_state
                    offset = maybe_token.total_offset
                    break

                if maybe_token.type_ != TokenType.WS:
                    yield maybe_token

                if maybe_token.type_ == TokenType.EOL:
                    state = LexerState.STATE_END
                    break

    def _find_state_tokens(self, state, current_offset):
        """Scan the input for current state's tokens starting at ``current_offset``.

        Args:
            state (LexerState): The current lexer state.
            current_offset (int): The offset in the input text, i.e. the number
                of characters already scanned so far.

        Yields:
            The next ``Token`` or ``StateTransition`` instance.
        """
        pattern = self._GRAND_PATTERNS[state]
        scanner = pattern.finditer(self._text, current_offset)

        for match in scanner:  # pragma: NO COVER
            token_type = match.lastgroup

            if token_type.startswith("GOTO_"):
                yield StateTransition(
                    new_state=getattr(LexerState, token_type[5:]),  # w/o "GOTO_" prefix
                    total_offset=match.start(),
                )

            yield Token(token_type, match.group(), match.start())
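
The Lexer can be driven directly to watch the state machine at work; a minimal sketch over a made-up input line (the expected token stream is inferred from the patterns defined above):

    # Minimal sketch: iterate the lexer over an illustrative input line.
    from google.cloud.bigquery.magics.line_arg_parser.lexer import Lexer

    line = "my_df --params {'num_items': 17}"

    # __iter__() drops whitespace tokens, so the stream should look roughly like:
    #   DEST_VAR 'my_df', PARAMS_OPT_SPEC '--params', LCURL '{',
    #   PY_STRING "'num_items'", COLON ':', PY_NUMBER '17', RCURL '}', EOL ''
    for token in Lexer(line):
        print(token.type_, repr(token.lexeme), token.pos)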
