parser - tokeniser.go
package parser

import (
	"errors"
	"io"
	"slices"
	"strings"
	"unicode"
	"unicode/utf8"
)

// TokenType represents the type of token being read.
//
// Negative values are reserved for this package.
type TokenType int

// Special TokenTypes: TokenDone (-1) signals the end of the token stream, and
// TokenError (-2) signals an error state.
const (
	TokenDone TokenType = -1 - iota
	TokenError
)

// Token represents data parsed from the stream.
type Token struct {
	Type TokenType
	Data string
}

// TokenFunc is the type that the worker funcs implement in order to be used by
// the tokeniser.
type TokenFunc func(*Tokeniser) (Token, TokenFunc)
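
// A TokenFunc reads from the Tokeniser, producing a single Token and the
// TokenFunc that should handle the next piece of input. A minimal sketch,
// using a hypothetical token type that is not part of this package:
//
//	const TokenNumber TokenType = iota
//
//	func number(t *Tokeniser) (Token, TokenFunc) {
//		t.AcceptRun("0123456789")
//
//		return t.Return(TokenNumber, nil)
//	}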

// State represents a position in the byte stream of the Tokeniser.
type State interface {
	// Reset returns the byte stream to the position it was in when this
	// object was created.
	//
	// Only valid until Tokeniser.Get is called.
	Reset() bool
}

type tokeniser interface {
	backup()
	get() string
	length() int
	next() rune
	reset()
	state() State
	sub() tokeniser
	slice(int, int) (string, int)
}

// Tokeniser is a state machine to generate tokens from an input.
type Tokeniser struct {
	tokeniser
	Err   error
	state TokenFunc
}

// GetToken runs the state machine and retrieves a single token and possibly an
// error.
func (t *Tokeniser) GetToken() (Token, error) {
	tk := t.get()

	if tk.Type == TokenError {
		return tk, t.Err
	}

	return tk, nil
}
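
// A typical driving loop might look like the following sketch (token types
// other than TokenDone and TokenError are application-defined):
//
//	for {
//		tk, err := t.GetToken()
//		if err != nil {
//			// handle the error
//			break
//		}
//
//		if tk.Type == TokenDone {
//			break
//		}
//
//		// process tk
//	}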

// Iter yields each token as it's returned, stopping after yielding a TokenDone
// or TokenError Token.
func (t *Tokeniser) Iter(yield func(Token) bool) {
	for {
		if tk := t.get(); !yield(tk) || tk.Type == TokenDone || tk.Type == TokenError {
			break
		}
	}
}
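
// Iter matches the iter.Seq[Token] signature, so with Go 1.23 or later it can
// be ranged over directly (sketch):
//
//	for tk := range t.Iter {
//		// process tk; a TokenDone or TokenError is yielded last
//	}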

// GetError returns any error that has been generated by the Tokeniser.
func (t *Tokeniser) GetError() error {
	return t.Err
}

// TokeniserState allows the internal state of the Tokeniser to be set.
func (t *Tokeniser) TokeniserState(tf TokenFunc) {
	t.state = tf
}

func (t *Tokeniser) get() Token {
	// Once the stream has been exhausted, keep returning TokenDone.
	if errors.Is(t.Err, io.EOF) {
		return Token{
			Type: TokenDone,
			Data: "",
		}
	}

	if t.state == nil {
		t.Err = ErrNoState
		t.state = (*Tokeniser).Error
	}

	var tk Token

	tk, t.state = t.state(t)

	// An error token produced at EOF means the input ended mid-token.
	if tk.Type == TokenError && errors.Is(t.Err, io.EOF) {
		t.Err = io.ErrUnexpectedEOF
	}

	return tk
}

// Accept returns true if the next character to be read is contained within the
// given string.
//
// Upon true, it advances the read position; otherwise the position remains the
// same.
func (t *Tokeniser) Accept(chars string) bool {
	if !strings.ContainsRune(chars, t.next()) {
		t.backup()

		return false
	}

	return true
}
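
// A sketch of accepting an optional sign before a run of digits:
//
//	t.Accept("+-")
//	t.AcceptRun("0123456789")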

// AcceptRune returns true if the next character to be read is the specified
// rune.
//
// Upon true, it advances the read position; otherwise the position remains the
// same.
func (t *Tokeniser) AcceptRune(r rune) bool {
	if t.next() == r {
		return true
	}

	t.backup()

	return false
}

// Next returns the next rune and advances the read position.
func (t *Tokeniser) Next() rune {
	return t.next()
}

// Peek returns the next rune without advancing the read position.
func (t *Tokeniser) Peek() rune {
	r := t.next()

	t.backup()

	return r
}

// Get returns a string of everything that has been read so far and resets
// the read buffer for the next round of parsing.
func (t *Tokeniser) Get() string {
	return t.tokeniser.get()
}

// Len returns the number of bytes that have been read since the last Get.
func (t *Tokeniser) Len() int {
	return t.length()
}

// AcceptRun reads from the input as long as the read character is in the
// given string.
//
// Returns the rune that stopped the run.
func (t *Tokeniser) AcceptRun(chars string) rune {
	for {
		if c := t.next(); !strings.ContainsRune(chars, c) {
			t.backup()

			return c
		}
	}
}
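
// A sketch of using the stopping rune to decide whether a number continues
// with a fractional part:
//
//	if t.AcceptRun("0123456789") == '.' {
//		t.Next()
//		t.AcceptRun("0123456789")
//	}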

// AcceptString attempts to accept each character from the given string, in
// order, returning the number of bytes accepted before a failure.
func (t *Tokeniser) AcceptString(str string, caseInsensitive bool) int {
	for n, r := range str {
		if p := t.Peek(); p < 0 || !runeComparison(p, r, caseInsensitive) {
			return n
		}

		t.Next()
	}

	return len(str)
}
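
// A sketch of matching a case-insensitive keyword:
//
//	if t.AcceptString("null", true) == 4 {
//		// matched "null", "NULL", "Null", etc.
//	}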

// runeComparison reports whether a and b are the same rune, optionally
// ignoring case via unicode.SimpleFold.
func runeComparison(a, b rune, caseInsensitive bool) bool {
	if caseInsensitive {
		al := unicode.SimpleFold(a)
		bl := unicode.SimpleFold(b)

		return a == b || al == b || a == bl || al == bl
	}

	return a == b
}

// AcceptWord attempts to parse one of the words (strings of characters)
// provided in the slice.
//
// Returns the longest word parsed, or an empty string if no word matched.
func (t *Tokeniser) AcceptWord(words []string, caseInsensitive bool) string {
	words = slices.Clone(words)

	return t.acceptWord(words, caseInsensitive)
}
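
// A sketch of parsing a boolean literal:
//
//	switch t.AcceptWord([]string{"true", "false"}, false) {
//	case "true":
//		// ...
//	case "false":
//		// ...
//	default:
//		// no word matched; the read position is unchanged
//	}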

// acceptWord advances through the input one rune at a time, keeping only the
// candidate words that still match, and recurses to prefer the longest match.
func (t *Tokeniser) acceptWord(words []string, caseInsensitive bool) string {
	s := t.State()

	var sb strings.Builder

	for len(words) > 0 {
		char := t.Next()

		sb.WriteRune(char)

		if char < 0 {
			break
		}

		var found bool

		newWords := words[:0]

		for _, word := range words {
			if len(word) > 0 {
				r, size := utf8.DecodeRuneInString(word)
				if r == utf8.RuneError && size == 1 {
					// Treat an invalid byte as a raw rune so
					// it can still be compared.
					r = rune(word[0])
				}

				if runeComparison(char, r, caseInsensitive) {
					word = word[size:]
					found = found || word == ""
					newWords = append(newWords, word)
				}
			}
		}

		words = newWords

		if found {
			// A whole word has matched; try to extend it to a
			// longer candidate before returning.
			if len(words) > 0 {
				sb.WriteString(t.acceptWord(words, caseInsensitive))
			}

			return sb.String()
		}
	}

	// Nothing matched; rewind the read position to where we started.
	s.Reset()

	return ""
}

// Except returns true if the next character to be read is not contained within
// the given string.
//
// Upon true, it advances the read position; otherwise the position remains the
// same.
func (t *Tokeniser) Except(chars string) bool {
	if r := t.next(); r == -1 || strings.ContainsRune(chars, r) {
		t.backup()

		return false
	}

	return true
}

// Reset restores the state to after the last Get call (or to the initial
// state, if Get has not been called).
func (t *Tokeniser) Reset() {
	t.reset()
}

// State returns the current Tokeniser state, allowing a later reset to that
// point.
//
// A State is only valid until the next call to Get.
func (t *Tokeniser) State() State {
	return t.tokeniser.state()
}
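
// A sketch of speculative parsing with State: read ahead, and rewind when the
// lookahead does not pan out:
//
//	s := t.State()
//
//	t.AcceptRun("0123456789")
//
//	if !t.Accept(".") {
//		s.Reset() // not a decimal; rewind and reparse differently
//	}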

// SubTokeniser creates a new Tokeniser that uses this existing tokeniser as
// its source.
//
// This allows the sub-tokeniser's Get method to be called without calling it
// on its parent.
func (t *Tokeniser) SubTokeniser() *Tokeniser {
	return &Tokeniser{
		tokeniser: t.tokeniser.sub(),
	}
}
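
// A sketch of grabbing part of a larger token with a sub-Tokeniser, calling
// Get on the sub without consuming the parent's pending read:
//
//	sub := t.SubTokeniser()
//
//	sub.AcceptRun("0123456789")
//
//	digits := sub.Get() // only what the sub-Tokeniser read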

// ExceptRun reads from the input as long as the read character is not in the
// given string.
//
// Returns the rune that stopped the run.
func (t *Tokeniser) ExceptRun(chars string) rune {
	for {
		if r := t.next(); r == -1 || strings.ContainsRune(chars, r) {
			t.backup()

			return r
		}
	}
}
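
// A sketch of consuming everything up to the end of the line or input:
//
//	if t.ExceptRun("\n") == -1 {
//		// stopped at end-of-input rather than at a newline
//	}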

// Return simplifies returning from a TokenFunc, taking a TokenType and the
// next TokenFunc, which defaults to Done when nil.
//
// The returned token is of the type specified with the data set to the output
// of t.Get().
func (t *Tokeniser) Return(typ TokenType, fn TokenFunc) (Token, TokenFunc) {
	if fn == nil {
		fn = (*Tokeniser).Done
	}

	return Token{
		Type: typ,
		Data: t.Get(),
	}, fn
}
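
// A sketch of chaining states with Return, where TokenKey and the value
// TokenFunc are hypothetical and not part of this package:
//
//	func key(t *Tokeniser) (Token, TokenFunc) {
//		if t.ExceptRun("=") == -1 {
//			return t.ReturnError(io.ErrUnexpectedEOF)
//		}
//
//		return t.Return(TokenKey, value)
//	}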

// ReturnError simplifies the handling of errors, setting the error and calling
// Tokeniser.Error().
func (t *Tokeniser) ReturnError(err error) (Token, TokenFunc) {
	t.Err = err

	return t.Error()
}

// Done is a TokenFunc that is used to indicate that there are no more tokens to
// parse.
func (t *Tokeniser) Done() (Token, TokenFunc) {
	t.Err = io.EOF

	return Token{
		Type: TokenDone,
		Data: "",
	}, (*Tokeniser).Done
}

// Error represents an error state for the parser.
//
// The error value should be set in Tokeniser.Err and then this func should be
// called.
func (t *Tokeniser) Error() (Token, TokenFunc) {
	if t.Err == nil {
		t.Err = ErrUnknownError
	}

	return Token{
		Type: TokenError,
		Data: t.Err.Error(),
	}, (*Tokeniser).Error
}

// sub is the tokeniser implementation backing a Tokeniser created by
// SubTokeniser; it tracks its own start position within the parent stream so
// that get only returns what the sub-Tokeniser has read.
type sub struct {
	tokeniser
	tState, start int
}

// get returns everything read through this sub-tokeniser since the last get.
func (s *sub) get() string {
	if s.start < 0 {
		return ""
	}

	var str string

	str, s.start = s.slice(s.tState, s.start)

	return str
}

// Errors.
var (
	ErrNoState      = errors.New("no state")
	ErrUnknownError = errors.New("unknown error")
)
392