bash - tokeniser.go
package bash

import (
	"io"
	"strings"

	"vimagination.zapto.org/parser"
)

var (
	keywords       = []string{"if", "then", "else", "elif", "fi", "case", "esac", "while", "for", "in", "do", "done", "time", "until", "coproc", "select", "function", "{", "}", "[[", "]]", "!"}
	dotdot         = []string{".."}
	escapedNewline = []string{"\\\n"}
	assignment     = []string{"=", "+="}
)

const (
	whitespace         = " \t"
	newline            = "\n"
	metachars          = whitespace + newline + "|&;()<>"
	heredocsBreak      = metachars + "\\\"'"
	heredocStringBreak = newline + "$"
	doubleStops        = "\\`$\""
	singleStops        = "'"
	ansiStops          = "'\\"
	word               = "\\\"'`(){}- \t\n"
	wordNoBracket      = "\\\"'`(){}- \t\n]"
	wordBreak          = " `\\\t\n$|&;<>(){"
	wordBreakNoBracket = wordBreak + "]"
	wordBreakNoBrace   = wordBreak + "}"
	braceWordBreak     = " `\\\t\n|&;<>()={},"
	hexDigit           = "0123456789ABCDEFabcdef"
	octalDigit         = "01234567"
	decimalDigit       = "0123456789"
	letters            = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz"
	identStart         = letters + "_"
	identCont          = decimalDigit + identStart
	numberChars        = identCont + "@"
)

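// Token types emitted by the tokeniser.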
const (
	TokenWhitespace parser.TokenType = iota
	TokenLineTerminator
	TokenComment
	TokenIdentifier
	TokenIdentifierAssign
	TokenKeyword
	TokenWord
	TokenNumberLiteral
	TokenString
	TokenStringStart
	TokenStringMid
	TokenStringEnd
	TokenBraceExpansion
	TokenBraceWord
	TokenPunctuator
	TokenHeredoc
	TokenHeredocEnd
	TokenOpenBacktick
	TokenCloseBacktick
)

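// bashTokeniser holds the tokeniser state: a stack of open context markers
// (quotes, brackets, heredocs, etc.), the pending heredoc delimiters, and the
// current backtick nesting level.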
type bashTokeniser struct {
	tokenDepth []byte
	heredoc    [][]string
	backticks  int
}

// SetTokeniser sets the initial tokeniser state of a parser.Tokeniser.
//
// Used if you want to manually tokenise bash code.
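//
// A minimal usage sketch, assuming NewStringTokeniser, GetToken, and
// TokenDone from vimagination.zapto.org/parser (hypothetical example input):
//
//	tk := parser.NewStringTokeniser("echo hello | cat")
//	SetTokeniser(&tk)
//
//	for {
//		token, err := tk.GetToken()
//		if err != nil || token.Type == parser.TokenDone {
//			break
//		}
//
//		fmt.Println(token.Type, token.Data)
//	}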
func SetTokeniser(t *parser.Tokeniser) *parser.Tokeniser {
	p := new(bashTokeniser)

	t.TokeniserState(p.main)

	return t
}

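// lastTokenDepth returns the marker of the innermost open context, or 0 when
// at the top level.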
func (b *bashTokeniser) lastTokenDepth() rune {
	if len(b.tokenDepth) == 0 {
		return 0
	}

	return rune(b.tokenDepth[len(b.tokenDepth)-1])
}

func (b *bashTokeniser) pushTokenDepth(c rune) {
	b.tokenDepth = append(b.tokenDepth, byte(c))
}

func (b *bashTokeniser) popTokenDepth() {
	b.tokenDepth = b.tokenDepth[:len(b.tokenDepth)-1]
}

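// main is the top-level tokeniser state; it handles EOF, whitespace, line
// terminators, and comments directly, and dispatches to the string, heredoc,
// arithmetic, and operator/word states based on the current context marker.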
func (b *bashTokeniser) main(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	td := b.lastTokenDepth()

	if t.Peek() == -1 {
		if td == 0 {
			return t.Done()
		}

		return t.ReturnError(io.ErrUnexpectedEOF)
	} else if td == 'h' {
		b.popTokenDepth()

		return b.heredocString(t)
	} else if td == '"' || td == '\'' {
		return b.string(t, false)
	} else if t.Accept(whitespace) || t.AcceptWord(escapedNewline, false) != "" {
		for t.AcceptRun(whitespace) != -1 {
			if t.AcceptWord(escapedNewline, false) == "" {
				break
			}
		}

		return t.Return(TokenWhitespace, b.main)
	} else if t.Accept(newline) {
		if td == 'H' {
			return t.Return(TokenLineTerminator, b.heredocString)
		}

		t.AcceptRun(newline)

		return t.Return(TokenLineTerminator, b.main)
	} else if t.Accept("#") {
		t.ExceptRun(newline)

		return t.Return(TokenComment, b.main)
	} else if td == '>' || td == '/' || td == ':' {
		return b.arithmeticExpansion(t)
	}

	return b.operatorOrWord(t)
}

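// string tokenises the body of a single-quoted, double-quoted, or ANSI-C
// ($'...') quoted string, splitting it around embedded expansions and
// backticks.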
func (b *bashTokeniser) string(t *parser.Tokeniser, start bool) (parser.Token, parser.TokenFunc) {
	stops := singleStops

	td := b.lastTokenDepth()
	if td == '"' {
		stops = doubleStops
	} else if td == '$' {
		stops = ansiStops
	}

	tk := TokenStringMid

	if start {
		tk = TokenStringStart
	}

	for {
		switch t.ExceptRun(stops) {
		default:
			return t.ReturnError(io.ErrUnexpectedEOF)
		case '\n':
			return t.ReturnError(ErrInvalidCharacter)
		case '`':
			return t.Return(tk, b.backtickOrIdentOrWord)
		case '$':
			return t.Return(tk, b.identifier)
		case '"', '\'':
			t.Next()
			b.popTokenDepth()

			tk = TokenStringEnd

			if start {
				tk = TokenString
			}

			return t.Return(tk, b.main)
		case '\\':
			if td == '"' && b.isBacktick(t, true) != backtickNone {
				return t.Return(tk, b.backtickOrIdentOrWord)
			}

			t.Next()
			t.Next()
		}
	}
}

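// arithmeticExpansion tokenises the operators, numbers, and groupings inside
// $((...)) arithmetic, tracking parentheses and ternary colons on the depth
// stack.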
func (b *bashTokeniser) arithmeticExpansion(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	switch c := t.Peek(); c {
	case -1:
		return t.ReturnError(io.ErrUnexpectedEOF)
	case '"', '\'':
		return b.stringStart(t)
	case '$':
		return b.identifier(t)
	case '+', '-', '&', '|':
		t.Next()

		if t.Peek() == c {
			t.Next()
		} else {
			t.Accept("=")
		}
	case '<', '>':
		t.Next()
		t.Accept("=")
	case '=', '!', '/', '%', '^':
		t.Next()
		t.Accept("=")
	case '*':
		t.Next()
		t.Accept("*=")
	case '~', ',':
		t.Next()
	case '?':
		t.Next()
		b.pushTokenDepth(':')
	case ':':
		t.Next()

		if b.lastTokenDepth() != ':' {
			return t.ReturnError(ErrInvalidCharacter)
		}

		b.popTokenDepth()
	case ')':
		t.Next()

		if td := b.lastTokenDepth(); (td != '>' || !t.Accept(")")) && td != '/' {
			return t.ReturnError(ErrInvalidCharacter)
		}

		b.popTokenDepth()
	case '(':
		t.Next()
		b.pushTokenDepth('/')
	case '0':
		return b.zero(t)
	default:
		return b.number(t)
	}

	return t.Return(TokenPunctuator, b.main)
}

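// operatorOrWord tokenises redirections, pipes, control operators, grouping
// punctuation, and assignments, deferring to the string, identifier,
// backtick, and word states where needed.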
func (b *bashTokeniser) operatorOrWord(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	switch c := t.Peek(); c {
	default:
		return b.keywordIdentOrWord(t)
	case '<':
		t.Next()

		if t.Accept("<") {
			if !t.Accept("<") { // << or <<- starts a heredoc; <<< is a here-string.
				t.Accept("-")

				return t.Return(TokenPunctuator, b.startHeredoc)
			}
		} else {
			t.Accept("&>")
		}
	case '>':
		t.Next()
		t.Accept(">&|")
	case '|':
		t.Next()
		t.Accept("&|")
	case '&':
		t.Next()

		if t.Accept(">") {
			t.Accept(">")
		} else {
			t.Accept("&")
		}
	case ';':
		t.Next()
		t.Accept(";")
		t.Accept("&")
	case '"', '\'':
		return b.stringStart(t)
	case '(':
		t.Next()
		b.pushTokenDepth(')')
	case '{':
		t.Next()

		if !strings.ContainsRune(word, t.Peek()) || t.Peek() == '-' {
			return b.braceExpansion(t)
		}

		b.pushTokenDepth('}')
	case '}', ')', ']':
		t.Next()

		if b.lastTokenDepth() != c {
			return t.ReturnError(ErrInvalidCharacter)
		}

		b.popTokenDepth()
	case '+':
		t.Next()

		if !t.Accept("=") {
			return t.ReturnError(ErrInvalidCharacter)
		}
	case '=':
		t.Next()
	case '$':
		return b.identifier(t)
	case '`', '\\':
		return b.backtickOrIdentOrWord(t)
	}

	return t.Return(TokenPunctuator, b.main)
}

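// backtickOrIdentOrWord handles a possible backtick at the current position,
// emitting open/close backtick tokens and otherwise falling back to keyword,
// identifier, or word tokenisation.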
func (b *bashTokeniser) backtickOrIdentOrWord(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	switch b.isBacktick(t, false) {
	case backtickNone:
		return b.keywordIdentOrWord(t)
	case backtickOpen:
		b.pushTokenDepth('`')

		b.backticks++

		return t.Return(TokenOpenBacktick, b.main)
	case backtickClose:
		b.popTokenDepth()

		b.backticks--

		return t.Return(TokenCloseBacktick, b.main)
	default:
		return t.ReturnError(ErrIncorrectBacktick)
	}
}

type backtick uint8

const (
	backtickNone backtick = iota
	backtickOpen
	backtickClose
	backtickInvalid
)

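// isBacktick reports whether the upcoming backslashes and backtick open or
// close a command substitution. With n backtick levels already open, opening
// another requires 2n-1 preceding backslashes and closing the current one
// requires 2(n-1)-1, with the unescaped cases handled separately.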
func (b *bashTokeniser) isBacktick(t *parser.Tokeniser, reset bool) backtick {
	if reset {
		defer t.State().Reset()
	}

	pos := t.Len()

	t.AcceptRun("\\")

	slashes := t.Len() - pos

	if slashes == 0 {
		if b.backticks == 0 && t.Accept("`") {
			return backtickOpen
		} else if b.backticks == 1 && b.lastTokenDepth() == '`' && t.Accept("`") {
			return backtickClose
		}
	} else if (b.backticks<<1)-1 == slashes && t.Accept("`") {
		return backtickOpen
	} else if ((b.backticks-1)<<1)-1 == slashes && t.Accept("`") {
		return backtickClose
	}

	if (b.backticks<<1)-1 > slashes && t.Accept("`") {
		return backtickInvalid
	}

	return backtickNone
}

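// startHeredoc reads the (possibly quoted) heredoc delimiter word that
// follows << or <<-, recording its unquoted form so that heredocString can
// later find the matching terminator.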
func (b *bashTokeniser) startHeredoc(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	if t.Peek() == -1 || t.Accept(newline) || t.Accept("#") {
		return t.ReturnError(io.ErrUnexpectedEOF)
	} else if t.Accept(whitespace) || t.AcceptWord(escapedNewline, false) != "" {
		for t.AcceptRun(whitespace) != -1 {
			if t.AcceptWord(escapedNewline, false) == "" {
				break
			}
		}

		return t.Return(TokenWhitespace, b.startHeredoc)
	}

	chars := heredocsBreak

Loop:
	for {
		switch t.ExceptRun(chars) {
		case -1:
			return t.ReturnError(io.ErrUnexpectedEOF)
		case '\\':
			t.Next()
			t.Next()
		case '\'':
			t.Next()

			if chars == heredocsBreak {
				chars = "'"
			} else {
				chars = heredocsBreak
			}
		case '"':
			t.Next()

			if chars == heredocsBreak {
				chars = "\\\""
			} else {
				chars = heredocsBreak
			}
		default:
			break Loop
		}
	}

	tk := parser.Token{
		Type: TokenWord,
		Data: t.Get(),
	}

	if b.lastTokenDepth() == 'H' {
		b.heredoc[len(b.heredoc)-1] = append(b.heredoc[len(b.heredoc)-1], unstring(tk.Data))
	} else {
		b.pushTokenDepth('H')
		b.heredoc = append(b.heredoc, []string{unstring(tk.Data)})
	}

	return tk, b.main
}

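// unstring removes quoting and backslash escapes from a heredoc delimiter,
// decoding \n and \t, so it can be compared against raw input lines.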
func unstring(str string) string {
	var sb strings.Builder

	nextEscaped := false

	for _, c := range str {
		if nextEscaped {
			switch c {
			case 'n':
				c = '\n'
			case 't':
				c = '\t'
			}

			nextEscaped = false
		} else {
			switch c {
			case '"', '\'':
				continue
			case '\\':
				nextEscaped = true

				continue
			}
		}

		sb.WriteRune(c)
	}

	return sb.String()
}

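// heredocString tokenises a heredoc body, checking the start of each line for
// the terminating delimiter and splitting the body around embedded
// expansions.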
func (b *bashTokeniser) heredocString(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	last := len(b.heredoc) - 1
	heredoc := b.heredoc[last][0]

	for {
		state := t.State()

		if t.AcceptString(heredoc, false) == len(heredoc) && (t.Peek() == '\n' || t.Peek() == -1) {
			state.Reset()

			return t.Return(TokenHeredoc, b.heredocEnd)
		}

		switch t.ExceptRun(heredocStringBreak) {
		case -1:
			return t.ReturnError(io.ErrUnexpectedEOF)
		case '$':
			state := t.State()

			t.Next()

			if t.Accept(decimalDigit) || t.Accept(identStart) || t.Accept("({") {
				state.Reset()
				b.pushTokenDepth('h')

				return t.Return(TokenHeredoc, b.identifier)
			}

			continue
		}

		t.Next()
	}
}

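// heredocEnd consumes a heredoc's terminating delimiter, popping it from the
// queue and leaving heredoc mode once every pending heredoc has been closed.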
func (b *bashTokeniser) heredocEnd(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	last := len(b.heredoc) - 1
	heredoc := b.heredoc[last][0]
	b.heredoc[last] = b.heredoc[last][1:]

	t.AcceptString(heredoc, false)

	if len(b.heredoc[last]) == 0 {
		b.heredoc = b.heredoc[:last]

		b.popTokenDepth()
	}

	return t.Return(TokenHeredocEnd, b.main)
}

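// identifier tokenises what follows a $: positional parameters, $(...),
// $((...)), and ${...} openings, $'...' and $"..." strings, and plain
// variable names.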
func (b *bashTokeniser) identifier(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	t.Next()

	if t.Accept(decimalDigit) {
		return t.Return(TokenIdentifier, b.main)
	} else if t.Accept("(") {
		if t.Accept("(") {
			b.pushTokenDepth('>')

			return t.Return(TokenPunctuator, b.main)
		}

		b.pushTokenDepth(')')

		return t.Return(TokenPunctuator, b.main)
	} else if t.Accept("{") {
		b.pushTokenDepth('}')

		return t.Return(TokenPunctuator, b.keywordIdentOrWord)
	} else if td := b.lastTokenDepth(); td != '"' && td != 'h' && t.Accept("'\"") {
		t.Reset()

		return b.stringStart(t)
	}

	var wb string

	switch b.lastTokenDepth() {
	case ']':
		wb = wordNoBracket
	default:
		wb = word
	}

	t.ExceptRun(wb)

	return t.Return(TokenIdentifier, b.main)
}

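// stringStart opens or closes a quoted string, pushing the quote character
// (or '$' for ANSI-C quoting) onto the depth stack.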
func (b *bashTokeniser) stringStart(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	if b.lastTokenDepth() == t.Peek() {
		b.popTokenDepth()
		t.Next()

		return t.Return(TokenString, b.main)
	} else if t.Accept("$") && t.Accept("'") {
		b.pushTokenDepth('$')
	} else {
		b.pushTokenDepth(t.Next())
	}

	return b.string(t, true)
}

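// zero tokenises a number literal that starts with 0: hex with an 0x/0X
// prefix, otherwise octal.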
func (b *bashTokeniser) zero(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	t.Next()

	if t.Accept("xX") {
		if !t.Accept(hexDigit) {
			return t.ReturnError(ErrInvalidNumber)
		}

		t.AcceptRun(hexDigit)
	} else {
		t.AcceptRun(octalDigit)
	}

	return t.Return(TokenNumberLiteral, b.main)
}

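// number tokenises a decimal literal, optionally in the base#digits form.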
func (b *bashTokeniser) number(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	if !t.Accept(decimalDigit) {
		return b.keywordIdentOrWord(t)
	}

	t.AcceptRun(decimalDigit)

	if t.Accept("#") {
		if !t.Accept(numberChars) {
			return t.ReturnError(ErrInvalidNumber)
		}

		t.AcceptRun(numberChars)
	}

	return t.Return(TokenNumberLiteral, b.main)
}

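// keywordIdentOrWord distinguishes keywords, identifiers (including the name
// part of plain and array assignments), and the numeric fd prefix of a
// redirection, falling back to word tokenisation.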
func (b *bashTokeniser) keywordIdentOrWord(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	if t.AcceptWord(keywords, false) != "" {
		return t.Return(TokenKeyword, b.main)
	}

	if t.Accept(identStart) {
		t.AcceptRun(identCont)

		if state := t.State(); t.AcceptWord(assignment, false) != "" {
			state.Reset()

			return t.Return(TokenIdentifierAssign, b.main)
		} else if t.Peek() == b.lastTokenDepth() {
			return t.Return(TokenWord, b.main)
		} else if t.Peek() == '[' {
			return t.Return(TokenIdentifierAssign, b.startArrayAssign)
		}
	} else if t.Accept(decimalDigit) {
		t.AcceptRun(decimalDigit)

		switch t.Peek() {
		case '<', '>':
			return t.Return(TokenNumberLiteral, b.main)
		}
	}

	return b.word(t)
}

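// word consumes a plain word, stopping at word-break characters and checking
// for embedded brace expansions, backticks, and dollar expansions.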
func (b *bashTokeniser) word(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	var wb string

	switch b.lastTokenDepth() {
	case '}':
		wb = wordBreakNoBrace
	case ']':
		wb = wordBreakNoBracket
	default:
		wb = wordBreak
	}

	if t.Len() == 0 && t.Accept(wb) {
		return t.ReturnError(ErrInvalidCharacter)
	}

	for {
		switch t.ExceptRun(wb) {
		case -1:
			if t.Len() == 0 {
				if b.lastTokenDepth() == 0 {
					return t.Done()
				}

				return t.ReturnError(io.ErrUnexpectedEOF)
			}

			fallthrough
		default:
			return t.Return(TokenWord, b.main)
		case '{':
			state := t.State()

			t.Next()

			if t.Accept(whitespace) || t.Accept(newline) || t.Peek() == -1 {
				state.Reset()
			} else {
				tk, _ := b.braceExpansion(t.SubTokeniser())

				state.Reset()

				if tk.Type == TokenBraceExpansion {
					return t.Return(TokenWord, b.main)
				}
			}

			t.Next()
		case '\\':
			if b.isBacktick(t, true) != backtickNone {
				return t.Return(TokenWord, b.backtickOrIdentOrWord)
			}

			t.Next()
			t.Next()
		case '$':
			state := t.State()

			t.Next()

			if t.Accept(decimalDigit) || t.Accept(identStart) || t.Accept("({") {
				state.Reset()

				return t.Return(TokenWord, b.main)
			}
		}
	}
}

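// startArrayAssign consumes the [ that begins an array subscript in an
// assignment.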
func (b *bashTokeniser) startArrayAssign(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	t.Accept("[")
	b.pushTokenDepth(']')

	return t.Return(TokenPunctuator, b.main)
}

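// braceExpansion tokenises the contents of a {...}, recognising alphabetic
// and numeric sequence expressions (with optional increments) and falling
// back to brace words or comma-separated expansion lists.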
func (b *bashTokeniser) braceExpansion(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	if t.Accept(letters) {
		if t.AcceptWord(dotdot, false) != "" {
			if !t.Accept(letters) {
				return b.word(t)
			}

			if t.AcceptWord(dotdot, false) != "" {
				t.Accept("-")

				if !t.Accept(decimalDigit) {
					return b.word(t)
				}

				t.AcceptRun(decimalDigit)
			}

			if !t.Accept("}") {
				return b.word(t)
			}

			return t.Return(TokenBraceExpansion, b.main)
		}

		return b.braceWord(t)
	} else if t.Accept("_") {
		return b.braceWord(t)
	} else {
		t.Accept("-")

		if t.Accept(decimalDigit) {
			switch t.AcceptRun(decimalDigit) {
			default:
				return b.word(t)
			case ',':
				return b.braceExpansionWord(t)
			case '.':
				if t.AcceptWord(dotdot, false) != "" {
					t.Accept("-")

					if !t.Accept(decimalDigit) {
						return b.word(t)
					}

					t.AcceptRun(decimalDigit)

					if t.AcceptWord(dotdot, false) != "" {
						t.Accept("-")

						if !t.Accept(decimalDigit) {
							return b.word(t)
						}

						t.AcceptRun(decimalDigit)
					}

					if !t.Accept("}") {
						return b.word(t)
					}

					return t.Return(TokenBraceExpansion, b.main)
				}
			}
		}
	}

	return b.braceExpansionWord(t)
}

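// braceWord consumes an identifier-like word inside braces, emitting a brace
// word when it is immediately closed by }.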
func (b *bashTokeniser) braceWord(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	t.AcceptRun(identCont)

	if !t.Accept("}") {
		return b.braceExpansionWord(t)
	}

	return t.Return(TokenBraceWord, b.main)
}

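// braceExpansionWord scans a comma-separated brace list, emitting a brace
// expansion token only when at least one comma has been seen before the
// closing }.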
func (b *bashTokeniser) braceExpansionWord(t *parser.Tokeniser) (parser.Token, parser.TokenFunc) {
	var hasComma bool

	for {
		switch t.ExceptRun(braceWordBreak) {
		case '}':
			if hasComma {
				t.Next()

				return t.Return(TokenBraceExpansion, b.main)
			}

			fallthrough
		default:
			return b.word(t)
		case '\\':
			t.Next()
			t.Next()
		case ',':
			t.Next()

			hasComma = true
		}
	}
}
814