Skip to content

Commit 4a0dd13

Browse files
committed
added custom file mapping for dockerfile
1 parent 6362d4c commit 4a0dd13

File tree

2 files changed

+346
-0
lines changed

2 files changed

+346
-0
lines changed

_automation/main.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,8 @@ func (s *UpdateService) downloadGrammar(ctx context.Context, g *Grammar) {
231231
s.makeDir(ctx, g.Language)
232232

233233
switch g.Language {
234+
case "dockerfile":
235+
s.downloadDockerfile(ctx, g)
234236
case "ocaml":
235237
s.downloadOcaml(ctx, g)
236238
case "typescript":
@@ -367,6 +369,32 @@ func (s *UpdateService) downloadPhp(ctx context.Context, g *Grammar) {
367369
}
368370
}
369371

372+
func (s *UpdateService) downloadDockerfile(ctx context.Context, g *Grammar) {
373+
fileMapping := map[string]string{
374+
"parser.h": "src/tree_sitter/parser.h",
375+
"parser.c": "src/parser.c",
376+
"scanner.c": "src/scanner.c",
377+
}
378+
379+
url := g.ContentURL()
380+
for _, f := range g.Files {
381+
fp, ok := fileMapping[f]
382+
if !ok {
383+
logAndExit(getLogger(ctx), "mapping for file not found", "file", f)
384+
}
385+
386+
s.downloadFile(
387+
ctx,
388+
fmt.Sprintf("%s/%s/%s", url, g.Revision, fp),
389+
fmt.Sprintf("%s/%s", g.Language, f),
390+
map[string]string{
391+
`"tree_sitter/parser.h"`: `"parser.h"`,
392+
`<tree_sitter/parser.h>`: `"parser.h"`,
393+
},
394+
)
395+
}
396+
}
397+
370398
// ocaml is special since its folder structure is different from the other ones
371399
func (s *UpdateService) downloadOcaml(ctx context.Context, g *Grammar) {
372400
fileMapping := map[string]string{

dockerfile/scanner.c

Lines changed: 318 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,318 @@
1+
#include <stdbool.h>
2+
#include <stdlib.h>
3+
#include <string.h>
4+
#include <wctype.h>
5+
6+
#include "parser.h"
7+
8+
// Capacity of the heredocs array in scanner_state: at most this many pending
// heredoc delimiters are tracked at once.
#define MAX_HEREDOCS 10
9+
// Size in bytes of the stack buffer scan_marker uses to collect a delimiter.
#define DEL_SPACE 512
10+
11+
typedef struct {
12+
bool in_heredoc;
13+
bool stripping_heredoc;
14+
unsigned heredoc_count;
15+
char *heredocs[MAX_HEREDOCS];
16+
} scanner_state;
17+
18+
// Tokens this scanner can emit; the values index into valid_symbols.
enum TokenType {
    HEREDOC_MARKER,  // emitted by scan_marker for a "<<" heredoc marker
    HEREDOC_LINE,    // emitted by scan_content for the rest of a body line
    HEREDOC_END,     // emitted by scan_content when the delimiter matches
    HEREDOC_NL,      // a line break consumed while heredocs are pending
    ERROR_SENTINEL,  // never emitted; only checked in valid_symbols
};
25+
26+
/**
 * Allocates a new scanner state with every field zeroed (no pending heredocs,
 * both flags false).
 *
 * @return the new state, or NULL if the allocation fails. The destroy function
 *         accepts NULL; the other entry points in this file dereference the
 *         payload without checking it.
 */
void *tree_sitter_dockerfile_external_scanner_create(void) {
    // calloc zero-fills the allocation and, unlike malloc followed by memset,
    // never writes through a NULL pointer when the allocation fails.
    return calloc(1, sizeof(scanner_state));
}
31+
32+
void tree_sitter_dockerfile_external_scanner_destroy(void *payload) {
33+
if (!payload)
34+
return;
35+
36+
scanner_state *state = payload;
37+
for (unsigned i = 0; i < MAX_HEREDOCS; i++) {
38+
if (state->heredocs[i]) {
39+
free(state->heredocs[i]);
40+
}
41+
}
42+
43+
free(state);
44+
}
45+
46+
unsigned tree_sitter_dockerfile_external_scanner_serialize(void *payload,
47+
char *buffer) {
48+
scanner_state *state = payload;
49+
50+
unsigned pos = 0;
51+
buffer[pos++] = state->in_heredoc;
52+
buffer[pos++] = state->stripping_heredoc;
53+
54+
for (unsigned i = 0; i < state->heredoc_count; i++) {
55+
// Add the ending null byte to the length since we'll have to copy it as
56+
// well.
57+
unsigned len = strlen(state->heredocs[i]) + 1;
58+
59+
// If we run out of space, just drop the heredocs that don't fit.
60+
// We need at least len + 1 bytes space since we'll copy len bytes below
61+
// and later add a null byte at the end.
62+
if (pos + len + 1 > TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
63+
break;
64+
}
65+
66+
memcpy(&buffer[pos], state->heredocs[i], len);
67+
pos += len;
68+
}
69+
70+
// Add a null byte at the end to make it easy to detect.
71+
buffer[pos++] = 0;
72+
return pos;
73+
}
74+
75+
void tree_sitter_dockerfile_external_scanner_deserialize(void *payload,
76+
const char *buffer,
77+
unsigned length) {
78+
scanner_state *state = payload;
79+
// Free all current heredocs to avoid leaking memory when we overwrite the
80+
// array later.
81+
for (unsigned i = 0; i < state->heredoc_count; i++) {
82+
free(state->heredocs[i]);
83+
state->heredocs[i] = NULL;
84+
}
85+
86+
if (length == 0) {
87+
state->in_heredoc = false;
88+
state->stripping_heredoc = false;
89+
state->heredoc_count = 0;
90+
} else {
91+
unsigned pos = 0;
92+
state->in_heredoc = buffer[pos++];
93+
state->stripping_heredoc = buffer[pos++];
94+
95+
unsigned heredoc_count = 0;
96+
for (unsigned i = 0; i < MAX_HEREDOCS; i++) {
97+
unsigned len = strlen(&buffer[pos]);
98+
99+
// We found the ending null byte which means that we're done.
100+
if (len == 0)
101+
break;
102+
103+
// Account for the ending null byte in strings (again).
104+
len++;
105+
char *heredoc = malloc(len);
106+
memcpy(heredoc, &buffer[pos], len);
107+
state->heredocs[i] = heredoc;
108+
heredoc_count++;
109+
110+
pos += len;
111+
}
112+
113+
state->heredoc_count = heredoc_count;
114+
}
115+
}
116+
117+
// Advances past whitespace on the current line, passing true as the second
// argument to advance. Stops at a newline, at a NUL lookahead, or at the first
// character for which iswspace is false.
static void skip_whitespace(TSLexer *lexer) {
    for (;;) {
        int32_t c = lexer->lookahead;
        if (c == '\0' || c == '\n' || !iswspace(c)) {
            return;
        }
        lexer->advance(lexer, true);
    }
}
122+
123+
/**
 * Scans a heredoc marker: "<<", an optional '-', then a delimiter that is
 * either quoted with ' or " or runs up to the next whitespace character.
 *
 * On success the delimiter is appended to state->heredocs, prefixed with one
 * byte that records whether the marker was "<<-" ('-') or not (' '). The
 * marker is still accepted, but the delimiter is not recorded, when the
 * delimiter does not fit in DEL_SPACE, when MAX_HEREDOCS delimiters are
 * already pending, or when the copy cannot be allocated.
 *
 * @param state scanner state that receives the delimiter.
 * @param lexer lexer positioned at or before the marker.
 * @return true with result_symbol set to HEREDOC_MARKER if a marker was
 *         scanned; false otherwise.
 */
static bool scan_marker(scanner_state *state, TSLexer *lexer) {
    skip_whitespace(lexer);

    if (lexer->lookahead != '<') {
        return false;
    }
    lexer->advance(lexer, false);

    if (lexer->lookahead != '<') {
        return false;
    }
    lexer->advance(lexer, false);

    bool stripping = false;
    if (lexer->lookahead == '-') {
        stripping = true;
        lexer->advance(lexer, false);
    }

    int32_t quote = 0;
    if (lexer->lookahead == '"' || lexer->lookahead == '\'') {
        quote = lexer->lookahead;
        lexer->advance(lexer, false);
    }

    // Stack buffer for the delimiter. Index 0 is reserved for the strip
    // indicator written further down, so recording starts at index 1.
    char delimiter[DEL_SPACE];
    unsigned del_idx = 1;

    while (lexer->lookahead != '\0' &&
           (quote ? lexer->lookahead != quote : !iswspace(lexer->lookahead))) {
        if (lexer->lookahead == '\\') {
            // Drop the backslash; the character after it is recorded verbatim.
            lexer->advance(lexer, false);

            if (lexer->lookahead == '\0') {
                return false;
            }
        }

        if (del_idx > 0) {
            delimiter[del_idx++] = lexer->lookahead;
        }
        lexer->advance(lexer, false);

        // If we run out of space, stop recording the delimiter (del_idx == 0
        // from here on) but keep advancing so the marker itself is still
        // consumed in full.
        if (del_idx >= DEL_SPACE - 2) {
            del_idx = 0;
        }
    }

    if (quote) {
        // A quoted delimiter must be closed by the same quote character.
        if (lexer->lookahead != quote) {
            return false;
        }
        lexer->advance(lexer, false);
    }

    lexer->result_symbol = HEREDOC_MARKER;

    // Over-long delimiter or no free slot: accept the marker untracked. The
    // capacity check comes first so no copy is allocated just to be freed.
    if (del_idx == 0 || state->heredoc_count >= MAX_HEREDOCS) {
        return true;
    }

    delimiter[0] = stripping ? '-' : ' ';
    delimiter[del_idx] = '\0';

    // The state outlives this call, so the delimiter is copied to the heap.
    char *del_copy = malloc(del_idx + 1);
    if (del_copy == NULL) {
        // Out of memory: treat like a delimiter that could not be recorded.
        return true;
    }
    memcpy(del_copy, delimiter, del_idx + 1);

    // The first pending heredoc decides whether content is stripped.
    if (state->heredoc_count == 0) {
        state->stripping_heredoc = stripping;
    }
    state->heredocs[state->heredoc_count++] = del_copy;

    return true;
}
215+
216+
/**
 * Scans inside a heredoc body: either the closing delimiter of the first
 * pending heredoc (HEREDOC_END) or the remainder of the current line
 * (HEREDOC_LINE).
 *
 * @param state         scanner state; the first pending delimiter is removed
 *                      when it is matched.
 * @param lexer         lexer positioned inside the heredoc body.
 * @param valid_symbols tokens the parser accepts here, indexed by TokenType.
 * @return true if a token was produced (result_symbol is set); false if no
 *         heredoc is pending or HEREDOC_LINE is not accepted.
 */
static bool scan_content(scanner_state *state, TSLexer *lexer,
                         const bool *valid_symbols) {
    if (state->heredoc_count == 0) {
        state->in_heredoc = false;
        return false;
    }

    state->in_heredoc = true;

    if (state->stripping_heredoc) {
        skip_whitespace(lexer);
    }

    if (valid_symbols[HEREDOC_END]) {
        // Byte 0 of the stored string is the strip indicator, so the
        // delimiter text to compare against starts at byte 1.
        const char *expected = state->heredocs[0] + 1;
        while (*expected != '\0' && lexer->lookahead == *expected) {
            lexer->advance(lexer, false);
            expected++;
        }

        // Reaching the terminator means the whole delimiter matched.
        if (*expected == '\0') {
            lexer->result_symbol = HEREDOC_END;

            // Remove the first delimiter and move the others up by one slot.
            free(state->heredocs[0]);
            unsigned remaining = state->heredoc_count - 1;
            memmove(&state->heredocs[0], &state->heredocs[1],
                    remaining * sizeof state->heredocs[0]);
            state->heredocs[remaining] = NULL;
            state->heredoc_count = remaining;

            if (remaining > 0) {
                // The next heredoc carries its own strip indicator.
                state->stripping_heredoc = state->heredocs[0][0] == '-';
            } else {
                state->in_heredoc = false;
            }

            return true;
        }
    }

    if (!valid_symbols[HEREDOC_LINE]) {
        return false;
    }

    lexer->result_symbol = HEREDOC_LINE;

    // Consume up to, but not including, the next newline. A NUL lookahead
    // ends the token only when the lexer reports end of input.
    while (lexer->lookahead != '\n') {
        if (lexer->lookahead == '\0' && lexer->eof(lexer)) {
            state->in_heredoc = false;
            return true;
        }
        lexer->advance(lexer, false);
    }

    return true;
}
285+
286+
/**
 * Entry point of the external scanner: chooses which token to try based on
 * valid_symbols and the current state.
 *
 * @param payload       scanner state.
 * @param lexer         lexer to read from.
 * @param valid_symbols tokens the parser accepts here, indexed by TokenType.
 * @return true if a token was recognized (lexer->result_symbol is set).
 */
bool tree_sitter_dockerfile_external_scanner_scan(void *payload, TSLexer *lexer,
                                                  const bool *valid_symbols) {
    scanner_state *state = payload;

    // When ERROR_SENTINEL is marked valid, valid_symbols is not used to choose
    // a scanner; the in_heredoc flag decides instead.
    if (valid_symbols[ERROR_SENTINEL]) {
        return state->in_heredoc ? scan_content(state, lexer, valid_symbols)
                                 : scan_marker(state, lexer);
    }

    // HEREDOC_NL only matches a line break while heredocs are open. This
    // avoids a grammar conflict, since a plain line break could either start
    // a heredoc body or end an instruction.
    if (valid_symbols[HEREDOC_NL] && state->heredoc_count > 0 &&
        lexer->lookahead == '\n') {
        lexer->result_symbol = HEREDOC_NL;
        lexer->advance(lexer, false);
        return true;
    }

    if (valid_symbols[HEREDOC_MARKER]) {
        return scan_marker(state, lexer);
    }

    if (valid_symbols[HEREDOC_LINE] || valid_symbols[HEREDOC_END]) {
        return scan_content(state, lexer, valid_symbols);
    }

    return false;
}

0 commit comments

Comments
 (0)