@@ -13,24 +13,24 @@ use crate::diagnostics::SplitDiagnostic;
13
13
/// Main parser that exposes the `cstree` api, and collects errors and statements
14
14
/// It is modelled after a Pratt Parser. For a gentle introduction to Pratt Parsing, see https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html
15
15
pub struct Parser {
16
- /// The ranges of the statements
17
- ranges : Vec < ( usize , usize ) > ,
16
+ /// The statement ranges are defined by the indices of the start/end tokens
17
+ stmt_ranges : Vec < ( usize , usize ) > ,
18
+
18
19
/// The syntax errors accumulated during parsing
19
20
errors : Vec < SplitDiagnostic > ,
20
- /// The start of the current statement, if any
21
+
21
22
current_stmt_start : Option < usize > ,
22
- /// The tokens to parse
23
- pub tokens : Vec < Token > ,
23
+
24
+ tokens : Vec < Token > ,
24
25
25
26
eof_token : Token ,
26
27
27
- next_pos : usize ,
28
+ current_pos : usize ,
28
29
}
29
30
30
- /// Result of Building
31
31
#[ derive( Debug ) ]
32
- pub struct Parse {
33
- /// The ranges of the errors
32
+ pub struct ParserResult {
33
+ /// The ranges of the parsed statements
34
34
pub ranges : Vec < TextRange > ,
35
35
/// The syntax errors accumulated during parsing
36
36
pub errors : Vec < SplitDiagnostic > ,
@@ -41,40 +41,34 @@ impl Parser {
41
41
let eof_token = Token :: eof ( usize:: from (
42
42
tokens
43
43
. last ( )
44
- . map ( |t| t. span . start ( ) )
44
+ . map ( |t| t. span . end ( ) )
45
45
. unwrap_or ( TextSize :: from ( 0 ) ) ,
46
46
) ) ;
47
47
48
- // next_pos should be the initialised with the first valid token already
49
- let mut next_pos = 0 ;
50
- loop {
51
- let token = tokens. get ( next_pos) . unwrap_or ( & eof_token) ;
52
-
53
- if is_irrelevant_token ( token) {
54
- next_pos += 1 ;
55
- } else {
56
- break ;
57
- }
48
+ // Place `current_pos` on the first relevant token
49
+ let mut current_pos = 0 ;
50
+ while is_irrelevant_token ( tokens. get ( current_pos) . unwrap_or ( & eof_token) ) {
51
+ current_pos += 1 ;
58
52
}
59
53
60
54
Self {
61
- ranges : Vec :: new ( ) ,
55
+ stmt_ranges : Vec :: new ( ) ,
62
56
eof_token,
63
57
errors : Vec :: new ( ) ,
64
58
current_stmt_start : None ,
65
59
tokens,
66
- next_pos ,
60
+ current_pos ,
67
61
}
68
62
}
69
63
70
- pub fn finish ( self ) -> Parse {
71
- Parse {
64
+ pub fn finish ( self ) -> ParserResult {
65
+ ParserResult {
72
66
ranges : self
73
- . ranges
67
+ . stmt_ranges
74
68
. iter ( )
75
- . map ( |( start , end ) | {
76
- let from = self . tokens . get ( * start ) ;
77
- let to = self . tokens . get ( * end ) . unwrap_or ( & self . eof_token ) ;
69
+ . map ( |( start_token_pos , end_token_pos ) | {
70
+ let from = self . tokens . get ( * start_token_pos ) ;
71
+ let to = self . tokens . get ( * end_token_pos ) . unwrap_or ( & self . eof_token ) ;
78
72
79
73
TextRange :: new ( from. unwrap ( ) . span . start ( ) , to. span . end ( ) )
80
74
} )
@@ -83,124 +77,87 @@ impl Parser {
83
77
}
84
78
}
85
79
86
- /// Start statement
87
80
pub fn start_stmt ( & mut self ) {
88
81
assert ! (
89
82
self . current_stmt_start. is_none( ) ,
90
83
"cannot start statement within statement at {:?}" ,
91
84
self . tokens. get( self . current_stmt_start. unwrap( ) )
92
85
) ;
93
- self . current_stmt_start = Some ( self . next_pos ) ;
86
+ self . current_stmt_start = Some ( self . current_pos ) ;
94
87
}
95
88
96
- /// Close statement
97
89
pub fn close_stmt ( & mut self ) {
98
- assert ! ( self . next_pos > 0 ) ;
99
-
100
- // go back the positions until we find the first relevant token
101
- let mut end_token_pos = self . next_pos - 1 ;
102
- loop {
103
- let token = self . tokens . get ( end_token_pos) ;
90
+ assert ! (
91
+ self . current_stmt_start. is_some( ) ,
92
+ "Must start statement before closing it."
93
+ ) ;
104
94
105
- if end_token_pos == 0 || token. is_none ( ) {
106
- break ;
107
- }
95
+ let start_token_pos = self . current_stmt_start . unwrap ( ) ;
108
96
109
- if !is_irrelevant_token ( token. unwrap ( ) ) {
110
- break ;
111
- }
97
+ assert ! (
98
+ self . current_pos > start_token_pos,
99
+ "Must close the statement on a token that's later than the start token."
100
+ ) ;
112
101
113
- end_token_pos -= 1 ;
114
- }
102
+ let ( end_token_pos, _) = self . find_last_relevant ( ) . unwrap ( ) ;
115
103
116
- self . ranges . push ( (
117
- self . current_stmt_start . expect ( "Expected active statement" ) ,
118
- end_token_pos,
119
- ) ) ;
104
+ self . stmt_ranges . push ( ( start_token_pos, end_token_pos) ) ;
120
105
121
106
self . current_stmt_start = None ;
122
107
}
123
108
124
- fn advance ( & mut self ) -> & Token {
125
- let mut first_relevant_token = None ;
126
- loop {
127
- let token = self . tokens . get ( self . next_pos ) . unwrap_or ( & self . eof_token ) ;
128
-
129
- // we need to continue with next_pos until the next relevant token after we already
130
- // found the first one
131
- if !is_irrelevant_token ( token) {
132
- if let Some ( t) = first_relevant_token {
133
- return t;
134
- }
135
- first_relevant_token = Some ( token) ;
136
- }
137
-
138
- self . next_pos += 1 ;
139
- }
140
- }
141
-
142
- fn peek ( & self ) -> & Token {
143
- match self . tokens . get ( self . next_pos ) {
109
+ fn current ( & self ) -> & Token {
110
+ match self . tokens . get ( self . current_pos ) {
144
111
Some ( token) => token,
145
112
None => & self . eof_token ,
146
113
}
147
114
}
148
115
149
- /// Look ahead to the next relevant token
150
- /// Returns `None` if we are already at the last relevant token
151
- fn look_ahead ( & self ) -> Option < & Token > {
152
- // we need to look ahead to the next relevant token
153
- let mut look_ahead_pos = self . next_pos + 1 ;
154
- loop {
155
- let token = self . tokens . get ( look_ahead_pos) ?;
156
-
157
- if !is_irrelevant_token ( token) {
158
- return Some ( token) ;
159
- }
116
+ fn advance ( & mut self ) -> & Token {
117
+ // can't reuse any `find_next_relevant` logic because of Mr. Borrow Checker
118
+ let ( pos, token) = self
119
+ . tokens
120
+ . iter ( )
121
+ . enumerate ( )
122
+ . skip ( self . current_pos + 1 )
123
+ . find ( |( _, t) | is_relevant ( t) )
124
+ . unwrap_or ( ( self . tokens . len ( ) , & self . eof_token ) ) ;
125
+
126
+ self . current_pos = pos;
127
+ token
128
+ }
160
129
161
- look_ahead_pos += 1 ;
162
- }
130
+ fn look_ahead ( & self ) -> Option < & Token > {
131
+ self . tokens
132
+ . iter ( )
133
+ . skip ( self . current_pos + 1 )
134
+ . find ( |t| is_relevant ( t) )
163
135
}
164
136
165
137
/// Returns `None` if there are no previous relevant tokens
166
138
fn look_back ( & self ) -> Option < & Token > {
167
- // we need to look back to the last relevant token
168
- let mut look_back_pos = self . next_pos - 1 ;
169
- loop {
170
- let token = self . tokens . get ( look_back_pos) ;
171
-
172
- if look_back_pos == 0 || token. is_none ( ) {
173
- return None ;
174
- }
175
-
176
- if !is_irrelevant_token ( token. unwrap ( ) ) {
177
- return token;
178
- }
179
-
180
- look_back_pos -= 1 ;
181
- }
139
+ self . find_last_relevant ( ) . map ( |it| it. 1 )
182
140
}
183
141
184
- /// checks if the current token is of `kind` and advances if true
185
- /// returns true if the current token is of `kind`
186
- pub fn eat ( & mut self , kind : SyntaxKind ) -> bool {
187
- if self . peek ( ) . kind == kind {
142
+ /// Will advance if the `kind` matches the current token.
143
+ /// Otherwise, will add a diagnostic to the internal `errors`.
144
+ pub fn expect ( & mut self , kind : SyntaxKind ) {
145
+ if self . current ( ) . kind == kind {
188
146
self . advance ( ) ;
189
- true
190
147
} else {
191
- false
148
+ self . errors . push ( SplitDiagnostic :: new (
149
+ format ! ( "Expected {:#?}" , kind) ,
150
+ self . current ( ) . span ,
151
+ ) ) ;
192
152
}
193
153
}
194
154
195
- pub fn expect ( & mut self , kind : SyntaxKind ) {
196
- if self . eat ( kind) {
197
- return ;
198
- }
199
-
200
- self . errors . push ( SplitDiagnostic :: new (
201
- format ! ( "Expected {:#?}" , kind) ,
202
- self . peek ( ) . span ,
203
- ) ) ;
155
+ fn find_last_relevant ( & self ) -> Option < ( usize , & Token ) > {
156
+ self . tokens
157
+ . iter ( )
158
+ . enumerate ( )
159
+ . take ( self . current_pos )
160
+ . rfind ( |( _, t) | is_relevant ( t) )
204
161
}
205
162
}
206
163
@@ -219,3 +176,57 @@ fn is_irrelevant_token(t: &Token) -> bool {
219
176
WHITESPACE_TOKENS . contains ( & t. kind )
220
177
&& ( t. kind != SyntaxKind :: Newline || t. text . chars ( ) . count ( ) == 1 )
221
178
}
179
+
180
+ fn is_relevant ( t : & Token ) -> bool {
181
+ !is_irrelevant_token ( t)
182
+ }
183
+
184
#[cfg(test)]
mod tests {
    use pgt_lexer::SyntaxKind;

    use crate::parser::Parser;

    /// Walks a small `create table` statement with `advance` and checks, at every stop,
    /// both the kind of the current token and its index in the token list.
    #[test]
    fn advance_works_as_expected() {
        // The leading whitespace on every line is significant: each run of indentation
        // is a token of its own, and the expected indices below account for it.
        let sql = r#"
        create table users (
            id serial primary key,
            name text,
            email text
        );
        "#;
        let tokens = pgt_lexer::lex(sql).unwrap();
        let total_num_tokens = tokens.len();

        let mut parser = Parser::new(tokens);

        // (kind of the token, index of the token) for every stop `advance` should make.
        let expected_stops = vec![
            (SyntaxKind::Create, 2),
            (SyntaxKind::Table, 4),
            (SyntaxKind::Ident, 6),
            (SyntaxKind::Ascii40, 8),
            (SyntaxKind::Ident, 11),
            (SyntaxKind::Ident, 13),
            (SyntaxKind::Primary, 15),
            (SyntaxKind::Key, 17),
            (SyntaxKind::Ascii44, 18),
            (SyntaxKind::NameP, 21),
            (SyntaxKind::TextP, 23),
            (SyntaxKind::Ascii44, 24),
            (SyntaxKind::Ident, 27),
            (SyntaxKind::TextP, 29),
            (SyntaxKind::Ascii41, 32),
            (SyntaxKind::Ascii59, 33),
        ];

        for (expected_kind, expected_pos) in expected_stops {
            assert_eq!(parser.current().kind, expected_kind);
            assert_eq!(parser.current_pos, expected_pos);
            parser.advance();
        }

        // Once the tokens are exhausted the parser sits one past the last index on EOF.
        assert_eq!(parser.current().kind, SyntaxKind::Eof);
        assert_eq!(parser.current_pos, total_num_tokens);
    }
}
0 commit comments