github.com/ronaksoft/rony@v0.16.26-0.20230807065236-1743dbfe6959/internal/parser/lex.go

github.com/ronaksoft/rony@v0.16.26-0.20230807065236-1743dbfe6959/internal/parser/lex.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package parse
     6  
     7  import (
     8  	"fmt"
     9  	"strings"
    10  	"unicode"
    11  	"unicode/utf8"
    12  )
    13  
// tokenItem is one lexical unit produced by the scanner: a token kind
// together with the slice of input it covers and where that slice begins.
// For ERROR items val holds the formatted error message instead of input text.
type tokenItem struct {
	tok  token  // The type of this tokenItem.
	pos  Pos    // The starting position, in bytes, of this tokenItem in the input string.
	val  string // The value of this tokenItem.
	line int    // The line number at the start of this tokenItem.
}
    21  
    22  func (i tokenItem) String() string {
    23  	switch {
    24  	case i.tok == EOF:
    25  		return "EOF"
    26  	case i.tok == ERROR:
    27  		return i.val
    28  	case i.tok > keyword_beg && i.tok < keyword_end:
    29  		return fmt.Sprintf("%d: <%s>", i.tok, i.val)
    30  	}
    31  
    32  	return fmt.Sprintf("%d: %q", i.tok, i.val)
    33  }
    34  
// key maps the reserved words recognised inside an action to their token
// kinds. lexIdentifier looks up every scanned word here; words that are not
// present are emitted as plain identifiers.
var key = map[string]token{
	"model": MODEL,
	"tab":   TABLE,
	"view":  VIEW,
	"cnt":   COUNTER,
}
    41  
// Action delimiters and the end-of-input sentinel used by the scanner.
const (
	leftDelim  = "{{" // opens an action
	rightDelim = "}}" // closes an action
	eof        = -1   // returned by next when the input is exhausted
)
    48  
// stateFn represents the state of the scanner as a function that returns the next state.
// A nil stateFn stops the state machine driven by run.
type stateFn func(*lexer) stateFn
    51  
// lexer holds the state of the scanner. The state functions run in their own
// goroutine (started by lex) and hand finished items to the consumer through
// the items channel.
type lexer struct {
	name       string         // the name of the input; used only for error reports
	input      string         // the string being scanned
	pos        Pos            // current position in the input
	start      Pos            // start position of this tokenItem
	width      Pos            // width of last rune read from input
	items      chan tokenItem // channel of scanned items
	parenDepth int            // nesting depth of ( ) exprs
	line       int            // 1+number of newlines seen
	startLine  int            // start line of this tokenItem
}
    64  
    65  // next returns the next rune in the input.
    66  func (l *lexer) next() rune {
    67  	if int(l.pos) >= len(l.input) {
    68  		l.width = 0
    69  
    70  		return eof
    71  	}
    72  	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
    73  	l.width = Pos(w)
    74  	l.pos += l.width
    75  	if r == '\n' {
    76  		l.line++
    77  	}
    78  
    79  	return r
    80  }
    81  
    82  // peek returns but does not consume the next rune in the input.
    83  func (l *lexer) peek() rune {
    84  	r := l.next()
    85  	l.backup()
    86  
    87  	return r
    88  }
    89  
    90  // backup steps back one rune. Can only be called once per call of next.
    91  func (l *lexer) backup() {
    92  	l.pos -= l.width
    93  	// Correct newline count.
    94  	if l.width == 1 && l.input[l.pos] == '\n' {
    95  		l.line--
    96  	}
    97  }
    98  
    99  // emit passes an tokenItem back to the client.
   100  func (l *lexer) emit(t token) {
   101  	l.items <- tokenItem{t, l.start, l.input[l.start:l.pos], l.startLine}
   102  	l.start = l.pos
   103  	l.startLine = l.line
   104  }
   105  
   106  // ignore skips over the pending input before this point.
   107  func (l *lexer) ignore() {
   108  	l.line += strings.Count(l.input[l.start:l.pos], "\n")
   109  	l.start = l.pos
   110  	l.startLine = l.line
   111  }
   112  
   113  // errorf returns an error token and terminates the scan by passing
   114  // back a nil pointer that will be the next state, terminating l.nextItem.
   115  func (l *lexer) errorf(format string, args ...interface{}) stateFn {
   116  	l.items <- tokenItem{ERROR, l.start, fmt.Sprintf(format, args...), l.startLine}
   117  
   118  	return nil
   119  }
   120  
   121  // nextItem returns the next tokenItem from the input.
   122  // Called by the parser, not in the lexing goroutine.
   123  func (l *lexer) nextItem() tokenItem {
   124  	return <-l.items
   125  }
   126  
   127  // drain drains the output so the lexing goroutine will exit.
   128  // Called by the parser, not in the lexing goroutine.
   129  func (l *lexer) drain() {
   130  	for range l.items {
   131  	}
   132  }
   133  
   134  // run runs the state machine for the lexer.
   135  func (l *lexer) run() {
   136  	for state := lexText; state != nil; {
   137  		state = state(l)
   138  	}
   139  	close(l.items)
   140  }
   141  
   142  // atRightDelim reports whether the lexer is at a right delimiter, possibly preceded by a trim marker.
   143  func (l *lexer) atRightDelim() (delim bool) {
   144  	return strings.HasPrefix(l.input[l.pos:], rightDelim)
   145  }
   146  
   147  // atTerminator reports whether the input is at valid termination character to
   148  // appear after an identifier. Breaks .X.Y into two pieces. Also catches cases
   149  // like "$x+2" not being acceptable without a space, in case we decide one
   150  // day to implement arithmetic.
   151  func (l *lexer) atTerminator() bool {
   152  	r := l.peek()
   153  	if isSpace(r) || isEndOfLine(r) {
   154  		return true
   155  	}
   156  	switch r {
   157  	case eof, '.', ',', '|', ':', ')', '(':
   158  		return true
   159  	}
   160  	// Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
   161  	// succeed but should fail) but only in extremely rare cases caused by willfully
   162  	// bad choice of delimiter.
   163  	if rd, _ := utf8.DecodeRuneInString(rightDelim); rd == r {
   164  		return true
   165  	}
   166  
   167  	return false
   168  }
   169  
   170  // lex creates a new scanner for the input string.
   171  func lex(name, input string) *lexer {
   172  	l := &lexer{
   173  		name:      name,
   174  		input:     input,
   175  		items:     make(chan tokenItem),
   176  		line:      1,
   177  		startLine: 1,
   178  	}
   179  	go l.run()
   180  
   181  	return l
   182  }
   183  
   184  // lexText scans until an opening action delimiter, "{{".
   185  func lexText(l *lexer) stateFn {
   186  	l.width = 0
   187  	if x := strings.Index(l.input[l.pos:], leftDelim); x >= 0 {
   188  		l.pos += Pos(x)
   189  		if l.pos > l.start {
   190  			l.line += strings.Count(l.input[l.start:l.pos], "\n")
   191  			l.emit(TEXT)
   192  		}
   193  		l.ignore()
   194  
   195  		return lexLeftDelim
   196  	}
   197  	l.pos = Pos(len(l.input))
   198  	// Correctly reached EOF.
   199  	if l.pos > l.start {
   200  		l.line += strings.Count(l.input[l.start:l.pos], "\n")
   201  		l.emit(TEXT)
   202  		l.ignore()
   203  	}
   204  	l.emit(EOF)
   205  
   206  	return nil
   207  }
   208  
   209  // lexLeftDelim scans the left delimiter, which is known to be present
   210  func lexLeftDelim(l *lexer) stateFn {
   211  	l.pos += Pos(len(leftDelim))
   212  	l.emit(L_DELIM)
   213  	l.ignore()
   214  	l.parenDepth = 0
   215  
   216  	return lexInsideAction
   217  }
   218  
   219  // lexRightDelim scans the right delimiter, which is known to be present
   220  func lexRightDelim(l *lexer) stateFn {
   221  	l.pos += Pos(len(rightDelim))
   222  	l.emit(R_DELIM)
   223  	l.ignore()
   224  
   225  	return lexText
   226  }
   227  
   228  // lexInsideAction scans the elements inside action delimiters.
   229  func lexInsideAction(l *lexer) stateFn {
   230  	// Either number, quoted string, or identifier.
   231  	// Spaces separate arguments; runs of spaces turn into itemSpace.
   232  	// Pipe symbols separate and are emitted.
   233  	if delim := l.atRightDelim(); delim {
   234  		if l.parenDepth == 0 {
   235  			return lexRightDelim
   236  		}
   237  
   238  		return l.errorf("unclosed left paren")
   239  	}
   240  	switch r := l.next(); {
   241  	case r == eof || isEndOfLine(r):
   242  		return l.errorf("unclosed action")
   243  	case isSpace(r):
   244  		l.backup() // Put space back in case we have " -}}".
   245  
   246  		return lexSpace
   247  	case r == '@':
   248  		l.emit(AT_SIGN)
   249  
   250  		return lexIdentifier
   251  	case r == ',':
   252  		l.emit(COMMA)
   253  	case isAlphaNumeric(r):
   254  		l.backup()
   255  
   256  		return lexIdentifier
   257  	case r == '(':
   258  		l.emit(L_PAREN)
   259  		l.parenDepth++
   260  	case r == ')':
   261  		l.emit(R_PAREN)
   262  		l.parenDepth--
   263  		if l.parenDepth < 0 {
   264  			return l.errorf("unexpected right paren %#U", r)
   265  		}
   266  	default:
   267  
   268  		return l.errorf("unrecognized character in action: %#U", r)
   269  	}
   270  
   271  	return lexInsideAction
   272  }
   273  
   274  // lexSpace scans a run of space characters.
   275  // We have not consumed the first space, which is known to be present.
   276  func lexSpace(l *lexer) stateFn {
   277  	var r rune
   278  	for {
   279  		r = l.peek()
   280  		if !isSpace(r) {
   281  			break
   282  		}
   283  		l.next()
   284  	}
   285  	l.emit(SPACE)
   286  
   287  	return lexInsideAction
   288  }
   289  
   290  // lexIdentifier scans an alphanumeric.
   291  func lexIdentifier(l *lexer) stateFn {
   292  Loop:
   293  	for {
   294  		switch r := l.next(); {
   295  		case isAlphaNumeric(r):
   296  			// absorb.
   297  		default:
   298  			l.backup()
   299  			word := l.input[l.start:l.pos]
   300  			if !l.atTerminator() {
   301  				return l.errorf("bad character %#U", r)
   302  			}
   303  			switch {
   304  			case key[word] > keyword_beg && key[word] < keyword_end:
   305  				l.emit(key[word])
   306  			default:
   307  				l.emit(IDENT)
   308  			}
   309  
   310  			break Loop
   311  		}
   312  	}
   313  
   314  	return lexInsideAction
   315  }
   316  
   317  // isSpace reports whether r is a space character.
   318  func isSpace(r rune) bool {
   319  	return r == ' ' || r == '\t'
   320  }
   321  
   322  // isEndOfLine reports whether r is an end-of-line character.
   323  func isEndOfLine(r rune) bool {
   324  	return r == '\r' || r == '\n'
   325  }
   326  
   327  // isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
   328  func isAlphaNumeric(r rune) bool {
   329  	return r == '_' || r == '-' || unicode.IsLetter(r) || unicode.IsDigit(r)
   330  }