// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

//go:generate stringer -type=Token scanner.go

// Token is the kind of lexical token returned by the scanner (string,
// parentheses, comment, etc).
type Token int

const (
	// ILLEGAL is the invalid token that indicates the scanner has encountered
	// an invalid lexical pattern.
	ILLEGAL Token = iota
	// ERROR indicates that the scanner encountered an error while reading from
	// the input. The text of the error can be accessed via the Literal method.
	ERROR
	// EOF indicates the scanner has reached the end of the input.
	EOF
	// IDENT is an identifier that starts with a Unicode letter or underscore
	// and continues with Unicode letters, Unicode digits, or underscores.
	IDENT
	// STRING is a literal string. It is either delimited by double quotes and
	// cannot extend past the end of a line: " [^"\n]* ", or delimited by
	// backticks, in which case it may span multiple lines: ` [^`]* `
	STRING
	// NUMBER is a numeric literal composed of Unicode numeric digits:
	//   UnicodeDigit+
	NUMBER
	// WHITESPACE is any sequence of Unicode whitespace characters.
	WHITESPACE
	// COMMENT is a code comment that extends to end of line: # .* EOL
	COMMENT
	// LPAREN is the open parentheses rune: (
	LPAREN
	// RPAREN is the close parentheses rune: )
	RPAREN
	// LBRACKET is the open square bracket rune: [
	LBRACKET
	// RBRACKET is the close square bracket rune: ]
	RBRACKET
	// LBRACE is the open curly brace rune: {
	LBRACE
	// RBRACE is the close curly brace rune: }
	RBRACE
	// DOLLAR is the dollar sign rune: $
	DOLLAR
	// COLON is the colon rune: :
	COLON
	// ASTERISK is the asterisk rune: *
	ASTERISK
	// EQUALS is the equals sign rune: =
	EQUALS
	// ARROW is an equals sign followed by a greater than sign: =>
	ARROW
	// AMPERSAND is the ampersand rune: &
	AMPERSAND
	// COMMA is the comma rune: ,
	COMMA
	// CARET is the caret rune: ^
	CARET
	// ELLIPSES is three periods in succession: ...
	ELLIPSES
	// PIPE is the vertical line rune: |
	PIPE
)

const (
	// errRune is returned by read once an I/O error other than io.EOF has
	// been encountered.
	errRune = rune(-1)
	// eofRune is returned by read at the end of the input. Note that a NUL
	// rune occurring in the input has the same value, so the scanner cannot
	// tell it apart from the end of the input.
	eofRune = rune(0)
)

// Scanner breaks a sequence of characters into a sequence of lexical tokens
// that are consumed by the parser in order to construct an Optgen AST. Each
// token is associated with a literal that is the string representation of that
// token. For many tokens, its literal is a constant. But for other tokens,
// like string and identifier tokens, the literal is the custom text that was
// scanned from the input. Scanning stops unrecoverably at EOF or at the first
// I/O error.
type Scanner struct {
	// r is the buffered source of runes.
	r *bufio.Reader
	// tok and lit describe the most recently scanned token.
	tok Token
	lit string
	// err is the first I/O error seen; once set, read only returns errRune.
	err error

	// lineLoc tracks the current line and position within the input being
	// scanned. prev holds the value pos had before the most recent read so
	// that a single unread can restore it; unread sets it to -1.
	lineLoc struct {
		line int
		pos  int
		prev int
	}
}

// NewScanner constructs a new scanner that will tokenize the given input.
func NewScanner(r io.Reader) *Scanner {
	return &Scanner{r: bufio.NewReader(r)}
}

// Token returns the last token that was scanned.
func (s *Scanner) Token() Token {
	return s.tok
}

// Literal returns the literal associated with the last token that was scanned.
func (s *Scanner) Literal() string {
	return s.lit
}

// LineLoc returns the current 0-based line number and column position of the
// scanner in the input. The position counts the runes consumed so far on the
// current line.
func (s *Scanner) LineLoc() (line, pos int) {
	return s.lineLoc.line, s.lineLoc.pos
}

// Scan reads the next token from the input and returns it. The Token, Literal,
// and LineLoc methods are also initialized with information about the token
// that was read.
func (s *Scanner) Scan() Token {
	ch := s.read()

	// Tokens that consist of a run of runes are handed to a dedicated
	// sub-scanner, which expects to re-read the first rune itself.
	switch {
	case unicode.IsSpace(ch):
		s.unread()
		return s.scanWhitespace()

	case unicode.IsLetter(ch) || ch == '_':
		s.unread()
		return s.scanIdentifier()

	case unicode.IsDigit(ch):
		s.unread()
		return s.scanNumericLiteral()
	}

	// Otherwise the token is determined by the individual character.
	switch ch {
	case errRune:
		s.tok, s.lit = ERROR, s.err.Error()

	case eofRune:
		s.tok, s.lit = EOF, ""

	case '(':
		s.tok, s.lit = LPAREN, "("

	case ')':
		s.tok, s.lit = RPAREN, ")"

	case '[':
		s.tok, s.lit = LBRACKET, "["

	case ']':
		s.tok, s.lit = RBRACKET, "]"

	case '{':
		s.tok, s.lit = LBRACE, "{"

	case '}':
		s.tok, s.lit = RBRACE, "}"

	case '$':
		s.tok, s.lit = DOLLAR, "$"

	case ':':
		s.tok, s.lit = COLON, ":"

	case '*':
		s.tok, s.lit = ASTERISK, "*"

	case ',':
		s.tok, s.lit = COMMA, ","

	case '^':
		s.tok, s.lit = CARET, "^"

	case '|':
		s.tok, s.lit = PIPE, "|"

	case '&':
		s.tok, s.lit = AMPERSAND, "&"

	case '=':
		if s.read() == '>' {
			s.tok, s.lit = ARROW, "=>"
			break
		}

		// Not an arrow; give the lookahead rune back. The token fields are
		// assigned afterwards because unread resets them.
		s.unread()
		s.tok, s.lit = EQUALS, "="

	case '.':
		// Only three consecutive periods form a token.
		dots := 1
		for dots < 3 && s.read() == '.' {
			dots++
		}
		if dots == 3 {
			s.tok, s.lit = ELLIPSES, "..."
			break
		}

		// The run was cut short. Push back the rune that ended it so that it
		// is scanned by the next call rather than silently dropped. unread is
		// a no-op if the run ended at EOF or on an I/O error.
		s.unread()
		s.tok, s.lit = ILLEGAL, "."

	case '"':
		s.unread()
		return s.scanStringLiteral('"', false /* multiLine */)

	case '`':
		s.unread()
		return s.scanStringLiteral('`', true /* multiLine */)

	case '#':
		s.unread()
		return s.scanComment()

	default:
		s.tok, s.lit = ILLEGAL, string(ch)
	}

	return s.tok
}

// read reads the next rune from the buffered reader and advances the line
// location. It returns eofRune once the input is exhausted and errRune if any
// other I/O error occurs; after such an error every later call returns errRune
// as well.
func (s *Scanner) read() rune {
	// Once the scanner gets in the error state, it stays there.
	if s.err != nil {
		return errRune
	}

	ch, _, err := s.r.ReadRune()
	if err == io.EOF {
		return eofRune
	}

	if err != nil {
		s.err = err
		return errRune
	}

	// Remember the old column so that unread can restore it.
	s.lineLoc.prev = s.lineLoc.pos
	if ch == '\n' {
		s.lineLoc.line++
		s.lineLoc.pos = 0
	} else {
		s.lineLoc.pos++
	}

	return ch
}

// unread places the previously read rune back on the reader and rewinds the
// line location accordingly. It also resets the current token to ILLEGAL with
// an empty literal, so callers must assign the token after calling it. It does
// nothing if the scanner is in the error state or if the last read did not
// return a rune.
func (s *Scanner) unread() {
	// Once the scanner gets in the error state, it stays there.
	if s.err != nil {
		return
	}

	err := s.r.UnreadRune()
	if err != nil {
		// Last read wasn't a rune (probably an eof), so no-op.
		return
	}

	s.tok = ILLEGAL
	s.lit = ""

	if s.lineLoc.prev == -1 {
		panic("unread cannot be called twice in succession")
	}

	// A column of zero means the rune being unread was a newline, so step
	// back to the previous line.
	if s.lineLoc.pos == 0 {
		s.lineLoc.line--
	}

	s.lineLoc.pos = s.lineLoc.prev
	s.lineLoc.prev = -1
}

// scanWhile consumes the current rune unconditionally, followed by every
// contiguous rune for which accept returns true, and records the consumed text
// as a token of kind tok. The first rejected rune is left on the reader.
func (s *Scanner) scanWhile(tok Token, accept func(rune) bool) Token {
	// Create a buffer and read the current character into it.
	var buf bytes.Buffer
	buf.WriteRune(s.read())

	for {
		ch := s.read()
		if ch == eofRune {
			break
		}

		if !accept(ch) {
			s.unread()
			break
		}

		buf.WriteRune(ch)
	}

	// Assign the token last, since unread resets it.
	s.tok = tok
	s.lit = buf.String()
	return s.tok
}

// scanWhitespace consumes the current rune and all contiguous whitespace.
func (s *Scanner) scanWhitespace() Token {
	return s.scanWhile(WHITESPACE, unicode.IsSpace)
}

// scanIdentifier consumes the current rune and all contiguous identifier runes
// (Unicode letters, Unicode digits, and underscores).
func (s *Scanner) scanIdentifier() Token {
	return s.scanWhile(IDENT, func(ch rune) bool {
		return unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '_'
	})
}

// scanStringLiteral consumes the opening quote and every rune up to and
// including the matching endChar, producing a STRING token whose literal
// includes both quotes. If an error, EOF, or (when multiLine is false) a
// newline is reached first, the token is ILLEGAL and its literal is the text
// consumed so far.
func (s *Scanner) scanStringLiteral(endChar rune, multiLine bool) Token {
	// Create a buffer and read the current character into it.
	var buf bytes.Buffer
	buf.WriteRune(s.read())

	// Read characters until the closing quote is found, or until either error,
	// newline, or EOF is read.
	for {
		ch := s.read()
		if ch == errRune || ch == eofRune || (!multiLine && ch == '\n') {
			s.unread()
			s.tok = ILLEGAL
			break
		}

		buf.WriteRune(ch)

		if ch == endChar {
			s.tok = STRING
			break
		}
	}

	s.lit = buf.String()
	return s.tok
}

// scanNumericLiteral consumes the current rune and all contiguous Unicode
// digit runes.
func (s *Scanner) scanNumericLiteral() Token {
	return s.scanWhile(NUMBER, unicode.IsDigit)
}

// scanComment consumes the current rune and all characters until newline. The
// newline itself is left on the reader.
func (s *Scanner) scanComment() Token {
	// Create a buffer and read the current character into it.
	var buf bytes.Buffer
	buf.WriteRune(s.read())

	// Read every subsequent character into the buffer until either error,
	// newline, or EOF is read.
	for {
		ch := s.read()
		if ch == errRune || ch == eofRune || ch == '\n' {
			s.unread()
			break
		}

		buf.WriteRune(ch)
	}

	s.tok = COMMENT
	s.lit = buf.String()
	return COMMENT
}