github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/internal/rsg/yacc/lex.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in licenses/BSD-golang.txt.
     4  
     5  // Portions of this file are additionally subject to the following
     6  // license and copyright.
     7  //
     8  // Copyright 2016 The Cockroach Authors.
     9  //
    10  // Use of this software is governed by the Business Source License
    11  // included in the file licenses/BSL.txt.
    12  //
    13  // As of the Change Date specified in that file, in accordance with
    14  // the Business Source License, use of this software will be governed
    15  // by the Apache License, Version 2.0, included in the file
    16  // licenses/APL.txt.
    17  
    18  // Copied from Go's text/template/parse package and modified for yacc.
    19  
    20  package yacc
    21  
    22  import (
    23  	"fmt"
    24  	"strings"
    25  	"unicode"
    26  	"unicode/utf8"
    27  )
    28  
    29  // item represents a token or text string returned from the scanner.
    30  type item struct {
    31  	typ itemType // The type of this item.
    32  	pos Pos      // The starting position, in bytes, of this item in the input string.
    33  	val string   // The value of this item.
    34  }
    35  
    36  func (i item) String() string {
    37  	switch {
    38  	case i.typ == itemEOF:
    39  		return "EOF"
    40  	case i.typ == itemError:
    41  		return i.val
    42  	case len(i.val) > 10:
    43  		return fmt.Sprintf("%.10q...", i.val)
    44  	}
    45  	return fmt.Sprintf("%q", i.val)
    46  }
    47  
    48  // itemType identifies the type of lex items.
    49  type itemType int
    50  
    51  const (
    52  	itemError itemType = iota // error occurred; value is text of error
    53  	itemEOF
    54  	itemComment
    55  	itemPct
    56  	itemDoublePct
    57  	itemIdent
    58  	itemColon
    59  	itemLiteral
    60  	itemExpr
    61  	itemPipe
    62  	itemNL
    63  )
    64  
    65  const eof = -1
    66  
    67  // stateFn represents the state of the scanner as a function that returns the next state.
    68  type stateFn func(*lexer) stateFn
    69  
    70  // lexer holds the state of the scanner.
    71  type lexer struct {
    72  	name    string    // the name of the input; used only for error reports
    73  	input   string    // the string being scanned
    74  	state   stateFn   // the next lexing function to enter
    75  	pos     Pos       // current position in the input
    76  	start   Pos       // start position of this item
    77  	width   Pos       // width of last rune read from input
    78  	lastPos Pos       // position of most recent item returned by nextItem
    79  	items   chan item // channel of scanned items
    80  }
    81  
    82  // next returns the next rune in the input.
    83  func (l *lexer) next() rune {
    84  	if int(l.pos) >= len(l.input) {
    85  		l.width = 0
    86  		return eof
    87  	}
    88  	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
    89  	l.width = Pos(w)
    90  	l.pos += l.width
    91  	return r
    92  }
    93  
    94  // peek returns but does not consume the next rune in the input.
    95  func (l *lexer) peek() rune {
    96  	r := l.next()
    97  	l.backup()
    98  	return r
    99  }
   100  
   101  // backup steps back one rune. Can only be called once per call of next.
   102  func (l *lexer) backup() {
   103  	l.pos -= l.width
   104  }
   105  
   106  // emit passes an item back to the client.
   107  func (l *lexer) emit(t itemType) {
   108  	l.items <- item{t, l.start, l.input[l.start:l.pos]}
   109  	l.start = l.pos
   110  }
   111  
   112  // ignore skips over the pending input before this point.
   113  func (l *lexer) ignore() {
   114  	l.start = l.pos
   115  }
   116  
   117  // lineNumber reports which line we're on, based on the position of
   118  // the previous item returned by nextItem. Doing it this way
   119  // means we don't have to worry about peek double counting.
   120  func (l *lexer) lineNumber() int {
   121  	return 1 + strings.Count(l.input[:l.lastPos], "\n")
   122  }
   123  
   124  // errorf returns an error token and terminates the scan by passing
   125  // back a nil pointer that will be the next state, terminating l.nextItem.
   126  func (l *lexer) errorf(format string, args ...interface{}) stateFn {
   127  	l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)}
   128  	return nil
   129  }
   130  
   131  // nextItem returns the next item from the input.
   132  func (l *lexer) nextItem() item {
   133  	i := <-l.items
   134  	l.lastPos = i.pos
   135  	return i
   136  }
   137  
   138  // lex creates a new scanner for the input string.
   139  func lex(name, input string) *lexer {
   140  	l := &lexer{
   141  		name:  name,
   142  		input: input,
   143  		items: make(chan item),
   144  	}
   145  	go l.run()
   146  	return l
   147  }
   148  
   149  // run runs the state machine for the lexer.
   150  func (l *lexer) run() {
   151  	for l.state = lexStart; l.state != nil; {
   152  		l.state = l.state(l)
   153  	}
   154  }
   155  
   156  // state functions
   157  
   158  func lexStart(l *lexer) stateFn {
   159  Loop:
   160  	for {
   161  		switch r := l.next(); {
   162  		case r == '/':
   163  			return lexComment
   164  		case r == '%':
   165  			return lexPct
   166  		case r == '\n':
   167  			l.emit(itemNL)
   168  		case r == ':':
   169  			l.emit(itemColon)
   170  		case r == '|':
   171  			l.emit(itemPipe)
   172  		case r == '{':
   173  			return lexExpr
   174  		case isSpace(r):
   175  			l.ignore()
   176  		case isIdent(r):
   177  			return lexIdent
   178  		case r == '\'':
   179  			return lexLiteral
   180  		case r == eof:
   181  			l.emit(itemEOF)
   182  			break Loop
   183  		default:
   184  			return l.errorf("invalid character: %v", string(r))
   185  		}
   186  	}
   187  	return nil
   188  }
   189  
   190  func lexLiteral(l *lexer) stateFn {
   191  	for {
   192  		switch l.next() {
   193  		case '\'':
   194  			l.emit(itemLiteral)
   195  			return lexStart
   196  		}
   197  	}
   198  }
   199  
   200  func lexExpr(l *lexer) stateFn {
   201  	ct := 1
   202  	for {
   203  		switch l.next() {
   204  		case '{':
   205  			ct++
   206  		case '}':
   207  			ct--
   208  			if ct == 0 {
   209  				l.emit(itemExpr)
   210  				return lexStart
   211  			}
   212  		}
   213  	}
   214  }
   215  
   216  func lexComment(l *lexer) stateFn {
   217  	switch r := l.next(); r {
   218  	case '/':
   219  		for {
   220  			switch l.next() {
   221  			case '\n':
   222  				l.backup()
   223  				l.emit(itemComment)
   224  				return lexStart
   225  			}
   226  		}
   227  	case '*':
   228  		for {
   229  			switch l.next() {
   230  			case '*':
   231  				if l.peek() == '/' {
   232  					l.next()
   233  					l.emit(itemComment)
   234  					return lexStart
   235  				}
   236  			}
   237  		}
   238  	default:
   239  		return l.errorf("expected comment: %c", r)
   240  	}
   241  }
   242  
   243  func lexPct(l *lexer) stateFn {
   244  	switch l.next() {
   245  	case '%':
   246  		l.emit(itemDoublePct)
   247  		return lexStart
   248  	case '{':
   249  		for {
   250  			switch l.next() {
   251  			case '%':
   252  				if l.peek() == '}' {
   253  					l.next()
   254  					l.emit(itemPct)
   255  					return lexStart
   256  				}
   257  			}
   258  		}
   259  	case 'p':
   260  		if l.next() != 'r' || l.next() != 'e' || l.next() != 'c' || l.next() != ' ' {
   261  			l.errorf("expected %%prec")
   262  		}
   263  		for {
   264  			switch r := l.next(); {
   265  			case isIdent(r):
   266  				// absorb
   267  			default:
   268  				l.backup()
   269  				l.emit(itemPct)
   270  				return lexStart
   271  			}
   272  		}
   273  	default:
   274  		ct := 0
   275  		for {
   276  			switch l.next() {
   277  			case ' ':
   278  			case '{':
   279  				ct++
   280  			case '}':
   281  				ct--
   282  				if ct == 0 {
   283  					l.emit(itemPct)
   284  					return lexStart
   285  				}
   286  			case '\n':
   287  				if ct == 0 {
   288  					l.backup()
   289  					l.emit(itemPct)
   290  					return lexStart
   291  				}
   292  			}
   293  		}
   294  	}
   295  }
   296  
   297  func lexIdent(l *lexer) stateFn {
   298  	for {
   299  		switch r := l.next(); {
   300  		case isIdent(r):
   301  			// absorb
   302  		default:
   303  			l.backup()
   304  			l.emit(itemIdent)
   305  			return lexStart
   306  		}
   307  	}
   308  }
   309  
   310  func isSpace(r rune) bool {
   311  	return r == ' ' || r == '\t'
   312  }
   313  
   314  func isIdent(r rune) bool {
   315  	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
   316  }