github.com/ronaksoft/rony@v0.16.26-0.20230807065236-1743dbfe6959/internal/parser/lex.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package parse 6 7 import ( 8 "fmt" 9 "strings" 10 "unicode" 11 "unicode/utf8" 12 ) 13 14 // tokenItem represents a token or text string returned from the scanner. 15 type tokenItem struct { 16 tok token // The type of this tokenItem. 17 pos Pos // The starting position, in bytes, of this tokenItem in the input string. 18 val string // The value of this tokenItem. 19 line int // The line number at the start of this tokenItem. 20 } 21 22 func (i tokenItem) String() string { 23 switch { 24 case i.tok == EOF: 25 return "EOF" 26 case i.tok == ERROR: 27 return i.val 28 case i.tok > keyword_beg && i.tok < keyword_end: 29 return fmt.Sprintf("%d: <%s>", i.tok, i.val) 30 } 31 32 return fmt.Sprintf("%d: %q", i.tok, i.val) 33 } 34 35 var key = map[string]token{ 36 "model": MODEL, 37 "tab": TABLE, 38 "view": VIEW, 39 "cnt": COUNTER, 40 } 41 42 // state functions 43 const ( 44 leftDelim = "{{" 45 rightDelim = "}}" 46 eof = -1 47 ) 48 49 // stateFn represents the state of the scanner as a function that returns the next state. 50 type stateFn func(*lexer) stateFn 51 52 // lexer holds the state of the scanner. 53 type lexer struct { 54 name string // the name of the input; used only for error reports 55 input string // the string being scanned 56 pos Pos // current position in the input 57 start Pos // start position of this tokenItem 58 width Pos // width of last rune read from input 59 items chan tokenItem // channel of scanned items 60 parenDepth int // nesting depth of ( ) exprs 61 line int // 1+number of newlines seen 62 startLine int // start line of this tokenItem 63 } 64 65 // next returns the next rune in the input. 66 func (l *lexer) next() rune { 67 if int(l.pos) >= len(l.input) { 68 l.width = 0 69 70 return eof 71 } 72 r, w := utf8.DecodeRuneInString(l.input[l.pos:]) 73 l.width = Pos(w) 74 l.pos += l.width 75 if r == '\n' { 76 l.line++ 77 } 78 79 return r 80 } 81 82 // peek returns but does not consume the next rune in the input. 83 func (l *lexer) peek() rune { 84 r := l.next() 85 l.backup() 86 87 return r 88 } 89 90 // backup steps back one rune. Can only be called once per call of next. 91 func (l *lexer) backup() { 92 l.pos -= l.width 93 // Correct newline count. 94 if l.width == 1 && l.input[l.pos] == '\n' { 95 l.line-- 96 } 97 } 98 99 // emit passes an tokenItem back to the client. 100 func (l *lexer) emit(t token) { 101 l.items <- tokenItem{t, l.start, l.input[l.start:l.pos], l.startLine} 102 l.start = l.pos 103 l.startLine = l.line 104 } 105 106 // ignore skips over the pending input before this point. 107 func (l *lexer) ignore() { 108 l.line += strings.Count(l.input[l.start:l.pos], "\n") 109 l.start = l.pos 110 l.startLine = l.line 111 } 112 113 // errorf returns an error token and terminates the scan by passing 114 // back a nil pointer that will be the next state, terminating l.nextItem. 115 func (l *lexer) errorf(format string, args ...interface{}) stateFn { 116 l.items <- tokenItem{ERROR, l.start, fmt.Sprintf(format, args...), l.startLine} 117 118 return nil 119 } 120 121 // nextItem returns the next tokenItem from the input. 122 // Called by the parser, not in the lexing goroutine. 123 func (l *lexer) nextItem() tokenItem { 124 return <-l.items 125 } 126 127 // drain drains the output so the lexing goroutine will exit. 128 // Called by the parser, not in the lexing goroutine. 129 func (l *lexer) drain() { 130 for range l.items { 131 } 132 } 133 134 // run runs the state machine for the lexer. 135 func (l *lexer) run() { 136 for state := lexText; state != nil; { 137 state = state(l) 138 } 139 close(l.items) 140 } 141 142 // atRightDelim reports whether the lexer is at a right delimiter, possibly preceded by a trim marker. 143 func (l *lexer) atRightDelim() (delim bool) { 144 return strings.HasPrefix(l.input[l.pos:], rightDelim) 145 } 146 147 // atTerminator reports whether the input is at valid termination character to 148 // appear after an identifier. Breaks .X.Y into two pieces. Also catches cases 149 // like "$x+2" not being acceptable without a space, in case we decide one 150 // day to implement arithmetic. 151 func (l *lexer) atTerminator() bool { 152 r := l.peek() 153 if isSpace(r) || isEndOfLine(r) { 154 return true 155 } 156 switch r { 157 case eof, '.', ',', '|', ':', ')', '(': 158 return true 159 } 160 // Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will 161 // succeed but should fail) but only in extremely rare cases caused by willfully 162 // bad choice of delimiter. 163 if rd, _ := utf8.DecodeRuneInString(rightDelim); rd == r { 164 return true 165 } 166 167 return false 168 } 169 170 // lex creates a new scanner for the input string. 171 func lex(name, input string) *lexer { 172 l := &lexer{ 173 name: name, 174 input: input, 175 items: make(chan tokenItem), 176 line: 1, 177 startLine: 1, 178 } 179 go l.run() 180 181 return l 182 } 183 184 // lexText scans until an opening action delimiter, "{{". 185 func lexText(l *lexer) stateFn { 186 l.width = 0 187 if x := strings.Index(l.input[l.pos:], leftDelim); x >= 0 { 188 l.pos += Pos(x) 189 if l.pos > l.start { 190 l.line += strings.Count(l.input[l.start:l.pos], "\n") 191 l.emit(TEXT) 192 } 193 l.ignore() 194 195 return lexLeftDelim 196 } 197 l.pos = Pos(len(l.input)) 198 // Correctly reached EOF. 199 if l.pos > l.start { 200 l.line += strings.Count(l.input[l.start:l.pos], "\n") 201 l.emit(TEXT) 202 l.ignore() 203 } 204 l.emit(EOF) 205 206 return nil 207 } 208 209 // lexLeftDelim scans the left delimiter, which is known to be present 210 func lexLeftDelim(l *lexer) stateFn { 211 l.pos += Pos(len(leftDelim)) 212 l.emit(L_DELIM) 213 l.ignore() 214 l.parenDepth = 0 215 216 return lexInsideAction 217 } 218 219 // lexRightDelim scans the right delimiter, which is known to be present 220 func lexRightDelim(l *lexer) stateFn { 221 l.pos += Pos(len(rightDelim)) 222 l.emit(R_DELIM) 223 l.ignore() 224 225 return lexText 226 } 227 228 // lexInsideAction scans the elements inside action delimiters. 229 func lexInsideAction(l *lexer) stateFn { 230 // Either number, quoted string, or identifier. 231 // Spaces separate arguments; runs of spaces turn into itemSpace. 232 // Pipe symbols separate and are emitted. 233 if delim := l.atRightDelim(); delim { 234 if l.parenDepth == 0 { 235 return lexRightDelim 236 } 237 238 return l.errorf("unclosed left paren") 239 } 240 switch r := l.next(); { 241 case r == eof || isEndOfLine(r): 242 return l.errorf("unclosed action") 243 case isSpace(r): 244 l.backup() // Put space back in case we have " -}}". 245 246 return lexSpace 247 case r == '@': 248 l.emit(AT_SIGN) 249 250 return lexIdentifier 251 case r == ',': 252 l.emit(COMMA) 253 case isAlphaNumeric(r): 254 l.backup() 255 256 return lexIdentifier 257 case r == '(': 258 l.emit(L_PAREN) 259 l.parenDepth++ 260 case r == ')': 261 l.emit(R_PAREN) 262 l.parenDepth-- 263 if l.parenDepth < 0 { 264 return l.errorf("unexpected right paren %#U", r) 265 } 266 default: 267 268 return l.errorf("unrecognized character in action: %#U", r) 269 } 270 271 return lexInsideAction 272 } 273 274 // lexSpace scans a run of space characters. 275 // We have not consumed the first space, which is known to be present. 276 func lexSpace(l *lexer) stateFn { 277 var r rune 278 for { 279 r = l.peek() 280 if !isSpace(r) { 281 break 282 } 283 l.next() 284 } 285 l.emit(SPACE) 286 287 return lexInsideAction 288 } 289 290 // lexIdentifier scans an alphanumeric. 291 func lexIdentifier(l *lexer) stateFn { 292 Loop: 293 for { 294 switch r := l.next(); { 295 case isAlphaNumeric(r): 296 // absorb. 297 default: 298 l.backup() 299 word := l.input[l.start:l.pos] 300 if !l.atTerminator() { 301 return l.errorf("bad character %#U", r) 302 } 303 switch { 304 case key[word] > keyword_beg && key[word] < keyword_end: 305 l.emit(key[word]) 306 default: 307 l.emit(IDENT) 308 } 309 310 break Loop 311 } 312 } 313 314 return lexInsideAction 315 } 316 317 // isSpace reports whether r is a space character. 318 func isSpace(r rune) bool { 319 return r == ' ' || r == '\t' 320 } 321 322 // isEndOfLine reports whether r is an end-of-line character. 323 func isEndOfLine(r rune) bool { 324 return r == '\r' || r == '\n' 325 } 326 327 // isAlphaNumeric reports whether r is an alphabetic, digit, or underscore. 328 func isAlphaNumeric(r rune) bool { 329 return r == '_' || r == '-' || unicode.IsLetter(r) || unicode.IsDigit(r) 330 }