Skip to content

Commit 53ea608

Browse files
authored
Remove the lexer hack in Painless (#56293)
Background: The lexer hack (https://en.wikipedia.org/wiki/Lexer_hack) was used in Painless to add contextual information around types during lexing. This made the lexer/grammar much simpler, and allowed us to build a user tree with full typing information at creation. Limitations: Forcing the lexer to know about type information up front requires that we know all types when building the user tree, which prevents a number of things, including the following: * good, consistent error messages, because type failures currently surface during lexing and produce very poor error messages, whereas lexing should only report pure syntax errors * compiling against different contexts, as a possible use case for ensuring a stored script is valid for at least one context * pursuing new front ends, such as a possible template language based on Painless * using the existing grammar to introduce an auto-completion API * the potential of adding new features with specialized types Results: This change removes the lexer hack by doing all type checking as part of the semantic pass against the user tree. The lexer now treats the primitive types and def as keywords, while the user tree uses a symbol node, which can potentially be a type, a piece of a package name, or a variable, in place of a variable node. One important change is that the dot operator is now overloaded. This means that a dot operator may indicate a piece of a package name or a field (a.b could be an access of field b from variable a, or could be the static type a.b). Logic has been introduced to account for this overloaded operator, including checks in the expression nodes to ensure they aren't mistakenly using an unexpected static type or an unknown partial type.
1 parent c98ceb8 commit 53ea608

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+2740
-1804
lines changed

modules/lang-painless/src/main/antlr/PainlessLexer.g4

Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,8 @@
2020
lexer grammar PainlessLexer;
2121

2222
@members {
23-
/**
24-
* Check against the current whitelist to determine whether a token is a type
25-
* or not. Called by the {@code TYPE} token defined in {@code PainlessLexer.g4}.
26-
* See also
27-
* <a href="https://en.wikipedia.org/wiki/The_lexer_hack">The lexer hack</a>.
28-
*/
29-
protected abstract boolean isType(String name);
30-
31-
/**
32-
* Is the preceding {@code /} the beginning of a regex (true) or a division
33-
* (false).
34-
*/
35-
protected abstract boolean slashIsRegex();
23+
/** Is the preceding {@code /} the beginning of a regex (true) or a division (false)? */
24+
protected abstract boolean isSlashRegex();
3625
}
3726

3827
WS: [ \t\n\r]+ -> skip;
@@ -71,7 +60,7 @@ INSTANCEOF: 'instanceof';
7160
BOOLNOT: '!';
7261
BWNOT: '~';
7362
MUL: '*';
74-
DIV: '/' { false == slashIsRegex() }?;
63+
DIV: '/' { isSlashRegex() == false }?;
7564
REM: '%';
7665
ADD: '+';
7766
SUB: '-';
@@ -120,23 +109,19 @@ INTEGER: ( '0' | [1-9] [0-9]* ) [lLfFdD]?;
120109
DECIMAL: ( '0' | [1-9] [0-9]* ) (DOT [0-9]+)? ( [eE] [+\-]? [0-9]+ )? [fFdD]?;
121110

122111
STRING: ( '"' ( '\\"' | '\\\\' | ~[\\"] )*? '"' ) | ( '\'' ( '\\\'' | '\\\\' | ~[\\'] )*? '\'' );
123-
REGEX: '/' ( '\\' ~'\n' | ~('/' | '\n') )+? '/' [cilmsUux]* { slashIsRegex() }?;
112+
REGEX: '/' ( '\\' ~'\n' | ~('/' | '\n') )+? '/' [cilmsUux]* { isSlashRegex() }?;
124113

125114
TRUE: 'true';
126115
FALSE: 'false';
127116

128117
NULL: 'null';
129118

130-
// The predicate here allows us to remove ambiguities when
131-
// dealing with types versus identifiers. We check against
132-
// the current whitelist to determine whether a token is a type
133-
// or not. Note this works by processing one character at a time
134-
// and the rule is added or removed as this happens. This is also known
135-
// as "the lexer hack." See (https://en.wikipedia.org/wiki/The_lexer_hack).
136-
TYPE: ID ( DOT ID )* { isType(getText()) }?;
119+
PRIMITIVE: 'boolean' | 'byte' | 'short' | 'char' | 'int' | 'long' | 'float' | 'double';
120+
DEF: 'def';
121+
137122
ID: [_a-zA-Z] [_a-zA-Z0-9]*;
138123

139124
mode AFTER_DOT;
140125

141-
DOTINTEGER: ( '0' | [1-9] [0-9]* ) -> mode(DEFAULT_MODE);
142-
DOTID: [_a-zA-Z] [_a-zA-Z0-9]* -> mode(DEFAULT_MODE);
126+
DOTINTEGER: ( '0' | [1-9] [0-9]* ) -> mode(DEFAULT_MODE);
127+
DOTID: [_a-zA-Z] [_a-zA-Z0-9]* -> mode(DEFAULT_MODE);

modules/lang-painless/src/main/antlr/PainlessLexer.tokens

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,11 @@ REGEX=77
7878
TRUE=78
7979
FALSE=79
8080
NULL=80
81-
TYPE=81
82-
ID=82
83-
DOTINTEGER=83
84-
DOTID=84
81+
PRIMITIVE=81
82+
DEF=82
83+
ID=83
84+
DOTINTEGER=84
85+
DOTID=85
8586
'{'=3
8687
'}'=4
8788
'['=5
@@ -154,3 +155,4 @@ DOTID=84
154155
'true'=78
155156
'false'=79
156157
'null'=80
158+
'def'=82

modules/lang-painless/src/main/antlr/PainlessParser.g4

Lines changed: 54 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -87,32 +87,38 @@ declaration
8787
;
8888

8989
decltype
90-
: TYPE (LBRACE RBRACE)*
90+
: type (LBRACE RBRACE)*
91+
;
92+
93+
type
94+
: DEF
95+
| PRIMITIVE
96+
| ID (DOT DOTID)*
9197
;
9298

9399
declvar
94100
: ID ( ASSIGN expression )?
95101
;
96102

97103
trap
98-
: CATCH LP TYPE ID RP block
104+
: CATCH LP type ID RP block
99105
;
100106

101107
noncondexpression
102-
: unary # single
103-
| noncondexpression ( MUL | DIV | REM ) noncondexpression # binary
104-
| noncondexpression ( ADD | SUB ) noncondexpression # binary
105-
| noncondexpression ( FIND | MATCH ) noncondexpression # binary
106-
| noncondexpression ( LSH | RSH | USH ) noncondexpression # binary
107-
| noncondexpression ( LT | LTE | GT | GTE ) noncondexpression # comp
108-
| noncondexpression INSTANCEOF decltype # instanceof
109-
| noncondexpression ( EQ | EQR | NE | NER ) noncondexpression # comp
110-
| noncondexpression BWAND noncondexpression # binary
111-
| noncondexpression XOR noncondexpression # binary
112-
| noncondexpression BWOR noncondexpression # binary
113-
| noncondexpression BOOLAND noncondexpression # bool
114-
| noncondexpression BOOLOR noncondexpression # bool
115-
| <assoc=right> noncondexpression ELVIS noncondexpression # elvis
108+
: unary # single
109+
| noncondexpression ( MUL | DIV | REM ) noncondexpression # binary
110+
| noncondexpression ( ADD | SUB ) noncondexpression # binary
111+
| noncondexpression ( FIND | MATCH ) noncondexpression # binary
112+
| noncondexpression ( LSH | RSH | USH ) noncondexpression # binary
113+
| noncondexpression ( LT | LTE | GT | GTE ) noncondexpression # comp
114+
| noncondexpression INSTANCEOF decltype # instanceof
115+
| noncondexpression ( EQ | EQR | NE | NER ) noncondexpression # comp
116+
| noncondexpression BWAND noncondexpression # binary
117+
| noncondexpression XOR noncondexpression # binary
118+
| noncondexpression BWOR noncondexpression # binary
119+
| noncondexpression BOOLAND noncondexpression # bool
120+
| noncondexpression BOOLOR noncondexpression # bool
121+
| <assoc=right> noncondexpression ELVIS noncondexpression # elvis
116122
;
117123

118124
expression
@@ -124,17 +130,37 @@ expression
124130
;
125131

126132
unary
127-
: ( INCR | DECR ) chain # pre
128-
| chain (INCR | DECR ) # post
129-
| chain # read
130-
| ( BOOLNOT | BWNOT | ADD | SUB ) unary # operator
131-
| LP decltype RP unary # cast
133+
: ( INCR | DECR ) chain # pre
134+
| ( ADD | SUB ) unary # addsub
135+
| unarynotaddsub # notaddsub
136+
;
137+
138+
unarynotaddsub
139+
: chain # read
140+
| chain (INCR | DECR ) # post
141+
| ( BOOLNOT | BWNOT ) unary # not
142+
| castexpression # cast
143+
;
144+
145+
castexpression
146+
: LP primordefcasttype RP unary # primordefcast
147+
| LP refcasttype RP unarynotaddsub # refcast
148+
;
149+
150+
primordefcasttype
151+
: DEF
152+
| PRIMITIVE
153+
;
154+
155+
refcasttype
156+
: DEF (LBRACE RBRACE)+
157+
| PRIMITIVE (LBRACE RBRACE)+
158+
| ID (DOT DOTID)* (LBRACE RBRACE)*
132159
;
133160

134161
chain
135-
: primary postfix* # dynamic
136-
| decltype postdot postfix* # static
137-
| arrayinitializer # newarray
162+
: primary postfix* # dynamic
163+
| arrayinitializer # newarray
138164
;
139165

140166
primary
@@ -149,7 +175,7 @@ primary
149175
| mapinitializer # mapinit
150176
| ID # variable
151177
| ID arguments # calllocal
152-
| NEW TYPE arguments # newobject
178+
| NEW type arguments # newobject
153179
;
154180

155181
postfix
@@ -176,8 +202,8 @@ braceaccess
176202
;
177203

178204
arrayinitializer
179-
: NEW TYPE ( LBRACE expression RBRACE )+ ( postdot postfix* )? # newstandardarray
180-
| NEW TYPE LBRACE RBRACE LBRACK ( expression ( COMMA expression )* )? RBRACK postfix* # newinitializedarray
205+
: NEW type ( LBRACE expression RBRACE )+ ( postdot postfix* )? # newstandardarray
206+
| NEW type LBRACE RBRACE LBRACK ( expression ( COMMA expression )* )? RBRACK postfix* # newinitializedarray
181207
;
182208

183209
listinitializer
@@ -213,8 +239,7 @@ lamtype
213239
;
214240

215241
funcref
216-
: TYPE REF ID # classfuncref
242+
: decltype REF ID # classfuncref
217243
| decltype REF NEW # constructorfuncref
218-
| ID REF ID # capturingfuncref
219244
| THIS REF ID # localfuncref
220245
;

modules/lang-painless/src/main/antlr/PainlessParser.tokens

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,11 @@ REGEX=77
7878
TRUE=78
7979
FALSE=79
8080
NULL=80
81-
TYPE=81
82-
ID=82
83-
DOTINTEGER=83
84-
DOTID=84
81+
PRIMITIVE=81
82+
DEF=82
83+
ID=83
84+
DOTINTEGER=84
85+
DOTID=85
8586
'{'=3
8687
'}'=4
8788
'['=5
@@ -154,3 +155,4 @@ DOTID=84
154155
'true'=78
155156
'false'=79
156157
'null'=80
158+
'def'=82

modules/lang-painless/src/main/java/org/elasticsearch/painless/Compiler.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ private static void addFactoryMethod(Map<String, Class<?>> additionalClasses, Cl
209209
ScriptRoot compile(Loader loader, String name, String source, CompilerSettings settings) {
210210
String scriptName = Location.computeSourceName(name);
211211
ScriptClassInfo scriptClassInfo = new ScriptClassInfo(painlessLookup, scriptClass);
212-
SClass root = Walker.buildPainlessTree(scriptClassInfo, scriptName, source, settings, painlessLookup);
212+
SClass root = Walker.buildPainlessTree(scriptClassInfo, scriptName, source, settings);
213213
ScriptRoot scriptRoot = new ScriptRoot(painlessLookup, settings, scriptClassInfo, scriptName, source);
214214
ClassNode classNode = root.writeClass(scriptRoot);
215215
DefBootstrapInjectionPhase.phase(classNode);
@@ -239,7 +239,7 @@ ScriptRoot compile(Loader loader, String name, String source, CompilerSettings s
239239
byte[] compile(String name, String source, CompilerSettings settings, Printer debugStream) {
240240
String scriptName = Location.computeSourceName(name);
241241
ScriptClassInfo scriptClassInfo = new ScriptClassInfo(painlessLookup, scriptClass);
242-
SClass root = Walker.buildPainlessTree(scriptClassInfo, scriptName, source, settings, painlessLookup);
242+
SClass root = Walker.buildPainlessTree(scriptClassInfo, scriptName, source, settings);
243243
ScriptRoot scriptRoot = new ScriptRoot(painlessLookup, settings, scriptClassInfo, scriptName, source);
244244
ClassNode classNode = root.writeClass(scriptRoot);
245245
classNode.setDebugStream(debugStream);

modules/lang-painless/src/main/java/org/elasticsearch/painless/Operation.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,7 @@ public enum Operation {
5151
EQ ( "==" , "equals" ),
5252
EQR ( "===" , "reference equals" ),
5353
NE ( "!=" , "not equals" ),
54-
NER ( "!==" , "reference not equals" ),
55-
INCR ( "++" , "increment" ),
56-
DECR ( "--" , "decrement" );
54+
NER ( "!==" , "reference not equals" );
5755

5856
public final String symbol;
5957
public final String name;

modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/EnhancedPainlessLexer.java

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
import org.antlr.v4.runtime.Token;
2525
import org.antlr.v4.runtime.misc.Interval;
2626
import org.elasticsearch.painless.Location;
27-
import org.elasticsearch.painless.lookup.PainlessLookup;
2827

2928
/**
3029
* A lexer that is customized for painless. It:
@@ -39,14 +38,11 @@
3938
*/
4039
final class EnhancedPainlessLexer extends PainlessLexer {
4140
private final String sourceName;
42-
private final PainlessLookup painlessLookup;
43-
4441
private Token current = null;
4542

46-
EnhancedPainlessLexer(CharStream charStream, String sourceName, PainlessLookup painlessLookup) {
43+
EnhancedPainlessLexer(CharStream charStream, String sourceName) {
4744
super(charStream);
4845
this.sourceName = sourceName;
49-
this.painlessLookup = painlessLookup;
5046
}
5147

5248
@Override
@@ -74,12 +70,7 @@ public void recover(final LexerNoViableAltException lnvae) {
7470
}
7571

7672
@Override
77-
protected boolean isType(String name) {
78-
return painlessLookup.isValidCanonicalClassName(name);
79-
}
80-
81-
@Override
82-
protected boolean slashIsRegex() {
73+
protected boolean isSlashRegex() {
8374
Token lastToken = current;
8475
if (lastToken == null) {
8576
return true;

0 commit comments

Comments
 (0)