Skip to content

Commit dd7463f

Browse files
committed
Check-in basic parser implementation
0 parents  commit dd7463f

File tree

10 files changed

+1060
-0
lines changed

10 files changed

+1060
-0
lines changed

go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
module github.com/foxcpp/go-sieve

lexer/lex.go

+381
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,381 @@
1+
package lexer
2+
3+
import (
4+
"bufio"
5+
"bytes"
6+
"fmt"
7+
"io"
8+
"strconv"
9+
"strings"
10+
)
11+
12+
// Options tunes the behaviour of Lex.
type Options struct {
	// NoPosition, when set, makes the lexer reset its line and column
	// counters to zero before every token, so produced tokens carry a
	// zero position.
	NoPosition bool
}
15+
16+
func consumeCRLF(r *bufio.Reader, state *lexerState) error {
17+
b, err := r.ReadByte()
18+
if err != nil {
19+
return err
20+
}
21+
switch b {
22+
case '\r':
23+
b, err = r.ReadByte()
24+
if err != nil {
25+
return err
26+
}
27+
if b != '\n' {
28+
return fmt.Errorf("CR is not followed by LF")
29+
}
30+
fallthrough
31+
case '\n':
32+
state.Line++
33+
state.Col = 0
34+
return nil
35+
default:
36+
panic("consumeCRLF should not be called not on CR/LF")
37+
}
38+
}
39+
40+
func Lex(r io.Reader, opts *Options) ([]Token, error) {
41+
if opts == nil {
42+
opts = &Options{}
43+
}
44+
toks, err := tokenStream(bufio.NewReader(r), opts)
45+
if err != nil {
46+
if err == io.EOF {
47+
return nil, io.ErrUnexpectedEOF
48+
}
49+
return nil, err
50+
}
51+
return toks, nil
52+
}
53+
54+
type lexerState struct {
55+
Position
56+
}
57+
58+
func tokenStream(r *bufio.Reader, opts *Options) ([]Token, error) {
59+
res := []Token{}
60+
state := &lexerState{}
61+
state.Line = 1
62+
for {
63+
b, err := r.ReadByte()
64+
if err != nil {
65+
if err == io.EOF {
66+
break
67+
}
68+
return nil, err
69+
}
70+
if opts.NoPosition {
71+
state.Line = 0
72+
state.Col = 0
73+
} else {
74+
state.Col++
75+
}
76+
switch b {
77+
case 0:
78+
return nil, fmt.Errorf("go-sieve/lexer: NUL is not allowed in input stream")
79+
case '[':
80+
res = append(res, ListStart{state.Position})
81+
case ']':
82+
res = append(res, ListEnd{state.Position})
83+
case '{':
84+
res = append(res, BlockStart{state.Position})
85+
case '}':
86+
res = append(res, BlockEnd{state.Position})
87+
case '(':
88+
res = append(res, TestListStart{state.Position})
89+
case ')':
90+
res = append(res, TestListEnd{state.Position})
91+
case ',':
92+
res = append(res, Comma{state.Position})
93+
case ':':
94+
res = append(res, Colon{state.Position})
95+
case ';':
96+
res = append(res, Semicolon{state.Position})
97+
case ' ', '\t':
98+
continue
99+
case '\r', '\n':
100+
if err := r.UnreadByte(); err != nil {
101+
return nil, err
102+
}
103+
if err := consumeCRLF(r, state); err != nil {
104+
return nil, err
105+
}
106+
case '"':
107+
lineCol := state.Position
108+
str, err := quotedString(r, state)
109+
if err != nil {
110+
return nil, err
111+
}
112+
res = append(res, String{Position: lineCol, Text: str})
113+
case '#':
114+
if err := hashComment(r, state); err != nil {
115+
return nil, err
116+
}
117+
case '/':
118+
b2, err := r.ReadByte()
119+
if err != nil {
120+
return nil, err
121+
}
122+
state.Col++
123+
if b2 != '*' {
124+
return nil, fmt.Errorf("unexpected forward slash")
125+
}
126+
if err := multilineComment(r, state); err != nil {
127+
return nil, err
128+
}
129+
case 't':
130+
// "text:"
131+
lineCol := state.Position
132+
ext, err := r.Peek(4)
133+
if err != nil {
134+
return nil, err
135+
}
136+
if bytes.Equal(ext, []byte("ext:")) {
137+
if _, err := r.Discard(4); err != nil {
138+
return nil, err
139+
}
140+
state.Col += 4
141+
// we consume whitespace and then build the multiline string
142+
wsLoop:
143+
for {
144+
b, err := r.ReadByte()
145+
if err != nil {
146+
return nil, err
147+
}
148+
state.Col++
149+
switch b {
150+
case ' ', '\t':
151+
continue
152+
case '#':
153+
if err := hashComment(r, state); err != nil {
154+
return nil, err
155+
}
156+
break wsLoop
157+
case '\r', '\n':
158+
if err := r.UnreadByte(); err != nil {
159+
return nil, err
160+
}
161+
if err := consumeCRLF(r, state); err != nil {
162+
return nil, err
163+
}
164+
break wsLoop
165+
default:
166+
return nil, fmt.Errorf("unexpected character: %v", b)
167+
}
168+
}
169+
mlString, err := multilineString(r, state)
170+
if err != nil {
171+
return nil, err
172+
}
173+
res = append(res, String{Position: lineCol, Text: mlString})
174+
}
175+
default:
176+
lineCol := state.Position
177+
if err := r.UnreadByte(); err != nil {
178+
return nil, err
179+
}
180+
state.Col--
181+
182+
if (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') {
183+
str, err := identifier(r, state)
184+
if err != nil {
185+
return nil, err
186+
}
187+
res = append(res, Identifier{Position: lineCol, Text: str})
188+
} else if b >= '0' && b <= '9' {
189+
num, err := number(r, state)
190+
if err != nil {
191+
return nil, err
192+
}
193+
num.Position = lineCol
194+
res = append(res, num)
195+
} else {
196+
return nil, fmt.Errorf("unexpected character: %v", b)
197+
}
198+
}
199+
}
200+
return res, nil
201+
}
202+
203+
func identifier(r *bufio.Reader, state *lexerState) (string, error) {
204+
id := strings.Builder{}
205+
for {
206+
b, err := r.ReadByte()
207+
if err != nil {
208+
if err == io.EOF {
209+
break
210+
}
211+
return "", err
212+
}
213+
state.Col++
214+
if (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' {
215+
id.WriteByte(b)
216+
} else {
217+
if err := r.UnreadByte(); err != nil {
218+
return "", err
219+
}
220+
state.Col--
221+
break
222+
}
223+
}
224+
return id.String(), nil
225+
}
226+
227+
func number(r *bufio.Reader, state *lexerState) (Number, error) {
228+
num := strings.Builder{}
229+
q := None
230+
readLoop:
231+
for {
232+
b, err := r.ReadByte()
233+
if err != nil {
234+
if err == io.EOF {
235+
break
236+
}
237+
return Number{}, err
238+
}
239+
state.Col++
240+
switch b {
241+
case 'K', 'G', 'M':
242+
q = Quantifier(b)
243+
break readLoop
244+
}
245+
if b >= '0' && b <= '9' {
246+
num.WriteByte(b)
247+
} else {
248+
if err := r.UnreadByte(); err != nil {
249+
return Number{}, err
250+
}
251+
state.Col--
252+
break readLoop
253+
}
254+
}
255+
256+
numParsed, err := strconv.Atoi(num.String())
257+
if err != nil {
258+
return Number{}, err
259+
}
260+
return Number{Value: numParsed, Quantifier: q}, nil
261+
}
262+
263+
func hashComment(r *bufio.Reader, state *lexerState) error {
264+
for {
265+
b, err := r.ReadByte()
266+
if err != nil {
267+
if err == io.EOF {
268+
break
269+
}
270+
return err
271+
}
272+
state.Col++
273+
if b == '\r' || b == '\n' {
274+
if err := r.UnreadByte(); err != nil {
275+
return err
276+
}
277+
if err := consumeCRLF(r, state); err != nil {
278+
return err
279+
}
280+
break
281+
}
282+
}
283+
return nil
284+
}
285+
286+
func multilineComment(r *bufio.Reader, state *lexerState) error {
287+
wasStar := false
288+
for {
289+
b, err := r.ReadByte()
290+
if err != nil {
291+
return err
292+
}
293+
state.Col++
294+
if b == '\n' {
295+
state.Line++
296+
state.Col = 0
297+
}
298+
if wasStar && b == '/' {
299+
return nil
300+
}
301+
wasStar = b == '*'
302+
}
303+
}
304+
305+
func quotedString(r *bufio.Reader, state *lexerState) (string, error) {
306+
str := strings.Builder{}
307+
atBackslash := false
308+
for {
309+
b, err := r.ReadByte()
310+
if err != nil {
311+
return "", err
312+
}
313+
state.Col++
314+
switch b {
315+
case '\r', '\n':
316+
if err := r.UnreadByte(); err != nil {
317+
return "", err
318+
}
319+
if err := consumeCRLF(r, state); err != nil {
320+
return "", err
321+
}
322+
323+
str.WriteByte('\r')
324+
str.WriteByte('\n')
325+
case '\\':
326+
if !atBackslash {
327+
atBackslash = true
328+
continue
329+
}
330+
str.WriteByte(b)
331+
case '"':
332+
if !atBackslash {
333+
return str.String(), nil
334+
}
335+
str.WriteByte(b)
336+
default:
337+
str.WriteByte(b)
338+
}
339+
atBackslash = false
340+
}
341+
}
342+
343+
func multilineString(r *bufio.Reader, state *lexerState) (string, error) {
344+
atLF := false
345+
atLFHadDot := false
346+
var data strings.Builder
347+
for {
348+
b, err := r.ReadByte()
349+
if err != nil {
350+
return "", err
351+
}
352+
state.Col++
353+
// We also normalize LF into CRLF while reading multiline strings.
354+
switch b {
355+
case '.':
356+
if atLF {
357+
atLFHadDot = true
358+
} else {
359+
data.WriteByte('.')
360+
}
361+
362+
atLF = false
363+
case '\r', '\n':
364+
if err := r.UnreadByte(); err != nil {
365+
return "", err
366+
}
367+
if err := consumeCRLF(r, state); err != nil {
368+
return "", err
369+
}
370+
if atLFHadDot {
371+
return data.String(), nil
372+
}
373+
data.WriteByte('\r')
374+
data.WriteByte('\n')
375+
atLF = true
376+
default:
377+
atLF = false
378+
atLFHadDot = false
379+
}
380+
}
381+
}

0 commit comments

Comments
 (0)