// Source: github.com/joomcode/cue@v0.4.4-0.20221111115225-539fe3512047/cue/scanner/scanner.go

// Copyright 2018 The CUE Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package scanner implements a scanner for CUE source text. It takes a []byte
// as source which can then be tokenized through repeated calls to the Scan
// method.
package scanner // import "github.com/joomcode/cue/cue/scanner"

import (
	"bytes"
	"fmt"
	"path/filepath"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/joomcode/cue/cue/token"
)

// An ErrorHandler is a generic error handler used throughout CUE packages.
//
// The position points to the beginning of the offending value. The message
// is passed as a format string in msg together with its still unformatted
// operands in args; it is up to the handler to combine the two.
type ErrorHandler func(pos token.Pos, msg string, args []interface{})

// A Scanner holds the Scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
40 type Scanner struct { 41 // immutable state 42 file *token.File // source file handle 43 dir string // directory portion of file.Name() 44 src []byte // source 45 errh ErrorHandler // error reporting; or nil 46 mode Mode // scanning mode 47 48 // scanning state 49 ch rune // current character 50 offset int // character offset 51 rdOffset int // reading offset (position after current character) 52 lineOffset int // current line offset 53 linesSinceLast int 54 spacesSinceLast int 55 insertEOL bool // insert a comma before next newline 56 57 quoteStack []quoteInfo 58 59 // public state - ok to modify 60 ErrorCount int // number of errors encountered 61 } 62 63 type quoteInfo struct { 64 char rune 65 numChar int 66 numHash int 67 } 68 69 const bom = 0xFEFF // byte order mark, only permitted as very first character 70 71 // Read the next Unicode char into s.ch. 72 // s.ch < 0 means end-of-file. 73 func (s *Scanner) next() { 74 if s.rdOffset < len(s.src) { 75 s.offset = s.rdOffset 76 if s.ch == '\n' { 77 s.lineOffset = s.offset 78 s.file.AddLine(s.offset) 79 } 80 r, w := rune(s.src[s.rdOffset]), 1 81 switch { 82 case r == 0: 83 s.errf(s.offset, "illegal character NUL") 84 case r >= utf8.RuneSelf: 85 // not ASCII 86 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 87 if r == utf8.RuneError && w == 1 { 88 s.errf(s.offset, "illegal UTF-8 encoding") 89 } else if r == bom && s.offset > 0 { 90 s.errf(s.offset, "illegal byte order mark") 91 } 92 } 93 s.rdOffset += w 94 s.ch = r 95 } else { 96 s.offset = len(s.src) 97 if s.ch == '\n' { 98 s.lineOffset = s.offset 99 s.file.AddLine(s.offset) 100 } 101 s.ch = -1 // eof 102 } 103 } 104 105 // A Mode value is a set of flags (or 0). 106 // They control scanner behavior. 107 type Mode uint 108 109 // These constants are options to the Init function. 
110 const ( 111 ScanComments Mode = 1 << iota // return comments as COMMENT tokens 112 dontInsertCommas // do not automatically insert commas - for testing only 113 ) 114 115 // Init prepares the scanner s to tokenize the text src by setting the 116 // scanner at the beginning of src. The scanner uses the file set file 117 // for position information and it adds line information for each line. 118 // It is ok to re-use the same file when re-scanning the same file as 119 // line information which is already present is ignored. Init causes a 120 // panic if the file size does not match the src size. 121 // 122 // Calls to Scan will invoke the error handler err if they encounter a 123 // syntax error and err is not nil. Also, for each error encountered, 124 // the Scanner field ErrorCount is incremented by one. The mode parameter 125 // determines how comments are handled. 126 // 127 // Note that Init may call err if there is an error in the first character 128 // of the file. 129 func (s *Scanner) Init(file *token.File, src []byte, eh ErrorHandler, mode Mode) { 130 // Explicitly initialize all fields since a scanner may be reused. 
131 if file.Size() != len(src) { 132 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 133 } 134 s.file = file 135 s.dir, _ = filepath.Split(file.Name()) 136 s.src = src 137 s.errh = eh 138 s.mode = mode 139 140 s.ch = ' ' 141 s.offset = 0 142 s.rdOffset = 0 143 s.lineOffset = 0 144 s.insertEOL = false 145 s.ErrorCount = 0 146 147 s.next() 148 if s.ch == bom { 149 s.next() // ignore BOM at file beginning 150 } 151 } 152 153 func (s *Scanner) errf(offs int, msg string, args ...interface{}) { 154 if s.errh != nil { 155 s.errh(s.file.Pos(offs, 0), msg, args) 156 } 157 s.ErrorCount++ 158 } 159 160 var prefix = []byte("//line ") 161 162 func (s *Scanner) interpretLineComment(text []byte) { 163 if bytes.HasPrefix(text, prefix) { 164 // get filename and line number, if any 165 if i := bytes.LastIndex(text, []byte{':'}); i > 0 { 166 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 { 167 // valid //line filename:line comment 168 filename := string(bytes.TrimSpace(text[len(prefix):i])) 169 if filename != "" { 170 filename = filepath.Clean(filename) 171 if !filepath.IsAbs(filename) { 172 // make filename relative to current directory 173 filename = filepath.Join(s.dir, filename) 174 } 175 } 176 // update scanner position 177 s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line 178 } 179 } 180 } 181 } 182 183 func (s *Scanner) scanComment() string { 184 // initial '/' already consumed; s.ch == '/' || s.ch == '*' 185 offs := s.offset - 1 // position of initial '/' 186 hasCR := false 187 188 if s.ch == '/' { 189 //-style comment 190 s.next() 191 for s.ch != '\n' && s.ch >= 0 { 192 if s.ch == '\r' { 193 hasCR = true 194 } 195 s.next() 196 } 197 if offs == s.lineOffset { 198 // comment starts at the beginning of the current line 199 s.interpretLineComment(s.src[offs:s.offset]) 200 } 201 goto exit 202 } 203 204 s.errf(offs, "comment not terminated") 205 206 exit: 
207 lit := s.src[offs:s.offset] 208 if hasCR { 209 // TODO: preserve /r/n 210 lit = stripCR(lit) 211 } 212 213 return string(lit) 214 } 215 216 func (s *Scanner) findLineEnd() bool { 217 // initial '/' already consumed 218 219 defer func(offs int) { 220 // reset scanner state to where it was upon calling findLineEnd 221 s.ch = '/' 222 s.offset = offs 223 s.rdOffset = offs + 1 224 s.next() // consume initial '/' again 225 }(s.offset - 1) 226 227 // read ahead until a newline, EOF, or non-comment token is found 228 for s.ch == '/' || s.ch == '*' { 229 if s.ch == '/' { 230 //-style comment always contains a newline 231 return true 232 } 233 /*-style comment: look for newline */ 234 s.next() 235 for s.ch >= 0 { 236 ch := s.ch 237 if ch == '\n' { 238 return true 239 } 240 s.next() 241 if ch == '*' && s.ch == '/' { 242 s.next() 243 break 244 } 245 } 246 s.skipWhitespace(0) // s.insertSemi is set 247 if s.ch < 0 || s.ch == '\n' { 248 return true 249 } 250 if s.ch != '/' { 251 // non-comment token 252 return false 253 } 254 s.next() // consume '/' 255 } 256 257 return false 258 } 259 260 func isLetter(ch rune) bool { 261 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) 262 } 263 264 func isDigit(ch rune) bool { 265 // TODO(mpvl): Is this correct? 
266 return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch) 267 } 268 269 func (s *Scanner) scanFieldIdentifier() string { 270 offs := s.offset 271 if s.ch == '_' { 272 s.next() 273 } 274 if s.ch == '#' { 275 s.next() 276 // TODO: remove this block to allow #<num> 277 if isDigit(s.ch) { 278 return string(s.src[offs:s.offset]) 279 } 280 } 281 for isLetter(s.ch) || isDigit(s.ch) || s.ch == '_' || s.ch == '$' { 282 s.next() 283 } 284 return string(s.src[offs:s.offset]) 285 } 286 287 func (s *Scanner) scanIdentifier() string { 288 offs := s.offset 289 for isLetter(s.ch) || isDigit(s.ch) || s.ch == '_' || s.ch == '$' { 290 s.next() 291 } 292 return string(s.src[offs:s.offset]) 293 } 294 295 func isExtendedIdent(r rune) bool { 296 return strings.IndexRune("-_#$%. ", r) >= 0 297 } 298 299 func digitVal(ch rune) int { 300 switch { 301 case '0' <= ch && ch <= '9': 302 return int(ch - '0') 303 case ch == '_': 304 return 0 305 case 'a' <= ch && ch <= 'f': 306 return int(ch - 'a' + 10) 307 case 'A' <= ch && ch <= 'F': 308 return int(ch - 'A' + 10) 309 } 310 return 16 // larger than any legal digit val 311 } 312 313 func (s *Scanner) scanMantissa(base int) { 314 var last rune 315 for digitVal(s.ch) < base { 316 if last == '_' && s.ch == '_' { 317 s.errf(s.offset, "illegal '_' in number") 318 } 319 last = s.ch 320 s.next() 321 } 322 if last == '_' { 323 s.errf(s.offset-1, "illegal '_' in number") 324 } 325 } 326 327 func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) { 328 // digitVal(s.ch) < 10 329 offs := s.offset 330 tok := token.INT 331 332 if seenDecimalPoint { 333 offs-- 334 tok = token.FLOAT 335 s.scanMantissa(10) 336 goto exponent 337 } 338 339 if s.ch == '0' { 340 // int or float 341 offs := s.offset 342 s.next() 343 if s.ch == 'x' || s.ch == 'X' { 344 // hexadecimal int 345 s.next() 346 s.scanMantissa(16) 347 if s.offset-offs <= 2 { 348 // only scanned "0x" or "0X" 349 s.errf(offs, "illegal hexadecimal number") 350 } 351 } else 
if s.ch == 'b' { 352 // binary int 353 s.next() 354 s.scanMantissa(2) 355 if s.offset-offs <= 2 { 356 // only scanned "0b" 357 s.errf(offs, "illegal binary number") 358 } 359 } else if s.ch == 'o' { 360 // octal int 361 s.next() 362 s.scanMantissa(8) 363 if s.offset-offs <= 2 { 364 // only scanned "0o" 365 s.errf(offs, "illegal octal number") 366 } 367 } else { 368 // 0 or float 369 seenDigits := false 370 if s.ch >= '0' && s.ch <= '9' { 371 seenDigits = true 372 s.scanMantissa(10) 373 } 374 if s.ch == '.' || s.ch == 'e' || s.ch == 'E' { 375 goto fraction 376 } 377 if seenDigits { 378 // integer other than 0 may not start with 0 379 s.errf(offs, "illegal integer number") 380 } 381 } 382 goto exit 383 } 384 385 // decimal int or float 386 s.scanMantissa(10) 387 388 // TODO: allow 3h4s, etc. 389 // switch s.ch { 390 // case 'h', 'm', 's', "ยต"[0], 'u', 'n': 391 // } 392 393 fraction: 394 if s.ch == '.' { 395 if p := s.offset + 1; p < len(s.src) && s.src[p] == '.' { 396 // interpret dot as part of a range. 397 goto exit 398 } 399 tok = token.FLOAT 400 s.next() 401 s.scanMantissa(10) 402 } 403 404 exponent: 405 switch s.ch { 406 case 'K', 'M', 'G', 'T', 'P': 407 tok = token.INT // TODO: Or should we allow this to be a float? 408 s.next() 409 if s.ch == 'i' { 410 s.next() 411 } 412 goto exit 413 } 414 415 if s.ch == 'e' || s.ch == 'E' { 416 tok = token.FLOAT 417 s.next() 418 if s.ch == '-' || s.ch == '+' { 419 s.next() 420 } 421 s.scanMantissa(10) 422 } 423 424 exit: 425 return tok, string(s.src[offs:s.offset]) 426 } 427 428 // scanEscape parses an escape sequence where rune is the accepted 429 // escaped quote. In case of a syntax error, it stops at the offending 430 // character (without consuming it) and returns false. Otherwise 431 // it returns true. 432 // 433 // Must be compliant with https://tools.ietf.org/html/rfc4627. 
434 func (s *Scanner) scanEscape(quote quoteInfo) (ok, interpolation bool) { 435 for i := 0; i < quote.numHash; i++ { 436 if s.ch != '#' { 437 return true, false 438 } 439 s.next() 440 } 441 442 offs := s.offset 443 444 var n int 445 var base, max uint32 446 switch s.ch { 447 case '(': 448 return true, true 449 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '/', quote.char: 450 s.next() 451 return true, false 452 case '0', '1', '2', '3', '4', '5', '6', '7': 453 n, base, max = 3, 8, 255 454 case 'x': 455 s.next() 456 n, base, max = 2, 16, 255 457 case 'u': 458 s.next() 459 n, base, max = 4, 16, unicode.MaxRune 460 case 'U': 461 s.next() 462 n, base, max = 8, 16, unicode.MaxRune 463 default: 464 msg := "unknown escape sequence" 465 if s.ch < 0 { 466 msg = "escape sequence not terminated" 467 } 468 s.errf(offs, msg) 469 return false, false 470 } 471 472 var x uint32 473 for n > 0 { 474 d := uint32(digitVal(s.ch)) 475 if d >= base { 476 if s.ch < 0 { 477 s.errf(s.offset, "escape sequence not terminated") 478 } else { 479 s.errf(s.offset, "illegal character %#U in escape sequence", s.ch) 480 } 481 return false, false 482 } 483 x = x*base + d 484 s.next() 485 n-- 486 } 487 488 // TODO: this is valid JSON, so remove, but normalize and report an error 489 // if for unmatched surrogate pairs . 
490 if x > max { 491 s.errf(offs, "escape sequence is invalid Unicode code point") 492 return false, false 493 } 494 495 return true, false 496 } 497 498 func (s *Scanner) scanString(offs int, quote quoteInfo) (token.Token, string) { 499 // ", """, ', or ''' opening already consumed 500 501 tok := token.STRING 502 503 hasCR := false 504 extra := 0 505 for { 506 ch := s.ch 507 if (quote.numChar != 3 && ch == '\n') || ch < 0 { 508 s.errf(offs, "string literal not terminated") 509 lit := s.src[offs:s.offset] 510 if hasCR { 511 lit = stripCR(lit) 512 } 513 return tok, string(lit) 514 } 515 516 s.next() 517 ch, ok := s.consumeStringClose(ch, quote) 518 if ok { 519 break 520 } 521 if ch == '\r' && quote.numChar == 3 { 522 hasCR = true 523 } 524 if ch == '\\' { 525 if _, interpolation := s.scanEscape(quote); interpolation { 526 tok = token.INTERPOLATION 527 extra = 1 528 s.quoteStack = append(s.quoteStack, quote) 529 break 530 } 531 } 532 } 533 lit := s.src[offs : s.offset+extra] 534 if hasCR { 535 lit = stripCR(lit) 536 } 537 return tok, string(lit) 538 } 539 540 func (s *Scanner) consumeQuotes(quote rune, max int) (next rune, n int) { 541 for ; n < max; n++ { 542 if s.ch != quote { 543 return s.ch, n 544 } 545 s.next() 546 } 547 return s.ch, n 548 } 549 550 func (s *Scanner) consumeStringClose(ch rune, quote quoteInfo) (next rune, atEnd bool) { 551 if quote.char != ch { 552 return ch, false 553 } 554 numChar := quote.numChar 555 n := numChar + quote.numHash 556 want := quote.char 557 for i := 1; i < n; i++ { 558 if i == numChar { 559 want = '#' 560 } 561 if want != s.ch { 562 return ch, false 563 } 564 ch = s.ch 565 s.next() 566 } 567 return s.ch, true 568 } 569 570 func (s *Scanner) checkHashCount(offs int, quote quoteInfo) { 571 for i := 0; i < quote.numHash; i++ { 572 if s.ch != '#' { 573 s.errf(offs, "string literal not terminated") 574 return 575 } 576 s.next() 577 } 578 } 579 580 func stripCR(b []byte) []byte { 581 c := make([]byte, len(b)) 582 i := 0 583 for _, 
ch := range b { 584 if ch != '\r' { 585 c[i] = ch 586 i++ 587 } 588 } 589 return c[:i] 590 } 591 592 // scanAttribute scans aa full attribute of the form @foo(str). An attribute 593 // is a lexical entry and as such whitespace is treated as normal characters 594 // within the attribute. 595 func (s *Scanner) scanAttribute() (tok token.Token, lit string) { 596 offs := s.offset - 1 // @ already consumed 597 598 s.scanIdentifier() 599 600 if _, tok, _ := s.Scan(); tok == token.LPAREN { 601 s.scanAttributeTokens(token.RPAREN) 602 } else { 603 s.errf(s.offset, "invalid attribute: expected '('") 604 } 605 return token.ATTRIBUTE, string(s.src[offs:s.offset]) 606 } 607 608 func (s *Scanner) scanAttributeTokens(close token.Token) { 609 for { 610 switch _, tok, _ := s.Scan(); tok { 611 case close: 612 return 613 case token.EOF: 614 s.errf(s.offset, "attribute missing '%s'", close) 615 return 616 617 case token.INTERPOLATION: 618 s.errf(s.offset, "interpolation not allowed in attribute") 619 s.popInterpolation() 620 s.recoverParen(1) 621 case token.LPAREN: 622 s.scanAttributeTokens(token.RPAREN) 623 case token.LBRACE: 624 s.scanAttributeTokens(token.RBRACE) 625 case token.LBRACK: 626 s.scanAttributeTokens(token.RBRACK) 627 case token.RPAREN, token.RBRACK, token.RBRACE: 628 s.errf(s.offset, "unexpected '%s'", tok) 629 } 630 } 631 } 632 633 // recoverParen is an approximate recovery mechanism to recover from invalid 634 // attributes. 
635 func (s *Scanner) recoverParen(open int) { 636 for { 637 switch s.ch { 638 case '\n', -1: 639 return 640 case '(': 641 open++ 642 case ')': 643 if open--; open == 0 { 644 return 645 } 646 } 647 s.next() 648 } 649 } 650 651 func (s *Scanner) skipWhitespace(inc int) { 652 for { 653 switch s.ch { 654 case ' ', '\t': 655 s.spacesSinceLast += inc 656 case '\n': 657 s.linesSinceLast += inc 658 if s.insertEOL { 659 return 660 } 661 case '\r': 662 default: 663 return 664 } 665 s.next() 666 } 667 } 668 669 // Helper functions for scanning multi-byte tokens such as >> += >>= . 670 // Different routines recognize different length tok_i based on matches 671 // of ch_i. If a token ends in '=', the result is tok1 or tok3 672 // respectively. Otherwise, the result is tok0 if there was no other 673 // matching character, or tok2 if the matching character was ch2. 674 675 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token { 676 if s.ch == '=' { 677 s.next() 678 return tok1 679 } 680 return tok0 681 } 682 683 func (s *Scanner) popInterpolation() quoteInfo { 684 quote := s.quoteStack[len(s.quoteStack)-1] 685 s.quoteStack = s.quoteStack[:len(s.quoteStack)-1] 686 return quote 687 } 688 689 // ResumeInterpolation resumes scanning of a string interpolation. 690 func (s *Scanner) ResumeInterpolation() string { 691 quote := s.popInterpolation() 692 _, str := s.scanString(s.offset-1, quote) 693 return str 694 } 695 696 // Scan scans the next token and returns the token position, the token, 697 // and its literal string if applicable. The source end is indicated by 698 // EOF. 699 // 700 // If the returned token is a literal (IDENT, INT, FLOAT, 701 // IMAG, CHAR, STRING) or COMMENT, the literal string 702 // has the corresponding value. 703 // 704 // If the returned token is a keyword, the literal string is the keyword. 
//
// If the returned token is Comma, the corresponding
// literal string is "," if the comma was present in the source,
// and "\n" if the comma was inserted because of a newline or
// at EOF.
//
// If the returned token is ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace(1)

	// Classify the whitespace that was skipped since the last token; it is
	// recorded as the relative position of this token.
	var rel token.RelPos
	switch {
	case s.linesSinceLast > 1:
		rel = token.NewSection
	case s.linesSinceLast == 1:
		rel = token.Newline
	case s.spacesSinceLast > 0:
		rel = token.Blank
	default:
		rel = token.NoSpace
	}
	// current token start
	offset := s.offset
	pos = s.file.Pos(offset, rel)

	// determine token value
	//
	// insertEOL records whether a newline following this token must be
	// turned into a comma; it is stored in s.insertEOL at the end.
	insertEOL := false
	var quote quoteInfo
	switch ch := s.ch; {
	case '0' <= ch && ch <= '9':
		insertEOL = true
		tok, lit = s.scanNumber(false)
	case isLetter(ch), ch == '$', ch == '#':
		lit = s.scanFieldIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			insertEOL = true
			break
		}
		if ch != '#' || (s.ch != '\'' && s.ch != '"' && s.ch != '#') {
			tok = token.IDENT
			insertEOL = true
			break
		}
		// A lone '#' directly followed by a quote or another '#' starts a
		// string literal delimited with hashes: count the first hash and
		// continue below with the character that follows it.
		quote.numHash = 1
		ch = s.ch
		fallthrough
	default:
		s.next() // always make progress
		switch ch {
		case -1:
			if s.insertEOL {
				s.insertEOL = false // EOF consumed
				return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
			}
			tok = token.EOF
		case '_':
			if s.ch == '|' {
				// Unconditionally require this to be followed by another
				// underscore to avoid needing an extra lookahead.
				// Note that `_|x` is always equal to _.
				s.next()
				if s.ch != '_' {
					s.errf(s.file.Offset(pos), "illegal token '_|'; expected '_'")
					insertEOL = s.insertEOL // preserve insertEOL info
					tok = token.ILLEGAL
					lit = "_|"
					break
				}
				s.next()
				tok = token.BOTTOM
				lit = "_|_"
			} else {
				tok = token.IDENT
				lit = "_" + s.scanFieldIdentifier()
			}
			insertEOL = true

		case '\n':
			// we only reach here if s.insertEOL was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertEOL = false // newline consumed
			p := s.file.Pos(offset, token.Elided)
			s.skipWhitespace(1)
			// Don't elide comma before a ',' or ':' to ensure JSON
			// conformance. Note that cue fmt should immediately undo those.
			if s.ch == ',' || s.ch == ':' {
				return s.Scan()
			}
			return p, token.COMMA, "\n"

		case '#':
			// Count this hash and any further hashes of the opening
			// delimiter.
			for quote.numHash++; s.ch == '#'; quote.numHash++ {
				s.next()
			}
			ch = s.ch
			if ch != '\'' && ch != '"' {
				// NOTE(review): no error is reported and tok keeps its zero
				// value here - confirm that this is intended.
				break
			}
			s.next()
			fallthrough
		case '"', '\'':
			insertEOL = true
			quote.char = ch
			quote.numChar = 1
			// start of the literal: the first quote and the hashes before it
			offs := s.offset - 1 - quote.numHash
			// n is the number of additional quote characters that directly
			// follow the first one (at most 2).
			switch _, n := s.consumeQuotes(ch, 2); n {
			case 0:
				// single quote character: regular string
				quote.numChar = 1
				tok, lit = s.scanString(offs, quote)
			case 1:
				// two quote characters: empty string
				s.checkHashCount(offs, quote)
				tok, lit = token.STRING, string(s.src[offs:s.offset])
			case 2:
				// three quote characters: multiline string, which must
				// continue on the next line
				quote.numChar = 3
				switch s.ch {
				case '\n':
					s.next()
					tok, lit = s.scanString(offs, quote)
				case '\r':
					s.next()
					if s.ch == '\n' {
						s.next()
						tok, lit = s.scanString(offs, quote)
						break
					}
					fallthrough
				default:
					s.errf(offs, "expected newline after multiline quote %s",
						s.src[offs:s.offset])
					tok, lit = token.STRING, string(s.src[offs:s.offset])
				}
			}
		case '@':
			insertEOL = true
			tok, lit = s.scanAttribute()
		case ':':
			if s.ch == ':' {
				s.next()
				tok = token.ISA
			} else {
				tok = token.COLON
			}
		case ';':
			tok = token.SEMICOLON
			insertEOL = true
		case '?':
			tok = token.OPTION
			insertEOL = true
		case '.':
			if '0' <= s.ch && s.ch <= '9' {
				insertEOL = true
				tok, lit = s.scanNumber(true)
			} else if s.ch == '.' {
				s.next()
				if s.ch == '.' {
					s.next()
					tok = token.ELLIPSIS
					insertEOL = true
				} else {
					// NOTE(review): tok and lit keep their zero values on
					// this path - presumably token.ILLEGAL; confirm.
					s.errf(s.file.Offset(pos), "illegal token '..'; expected '.'")
				}
			} else {
				tok = token.PERIOD
			}
		case ',':
			tok = token.COMMA
			lit = ","
		case '(':
			tok = token.LPAREN
		case ')':
			insertEOL = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertEOL = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertEOL = true
			tok = token.RBRACE
		case '+':
			tok = token.ADD // Consider ++ for list concatenate.
		case '-':
			tok = token.SUB
		case '*':
			tok = token.MUL
		case '/':
			if s.ch == '/' {
				// comment
				if s.insertEOL && s.findLineEnd() {
					// A comma must be inserted before the comment: return
					// it now and leave the comment for the next call.
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertEOL = false // newline consumed
					return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertEOL = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				tok = token.QUO
			}
			// We no longer use %, but seems like a useful token to use for
			// something else at some point.
			// case '%':
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch2(token.LSS, token.LEQ)
			}
		case '>':
			tok = s.switch2(token.GTR, token.GEQ)
		case '=':
			if s.ch == '~' {
				s.next()
				tok = token.MAT
			} else {
				tok = s.switch2(token.BIND, token.EQL)
			}
		case '!':
			if s.ch == '~' {
				s.next()
				tok = token.NMAT
			} else {
				tok = s.switch2(token.NOT, token.NEQ)
			}
		case '&':
			switch s.ch {
			case '&':
				s.next()
				tok = token.LAND
			default:
				tok = token.AND
			}
		case '|':
			if s.ch == '|' {
				s.next()
				tok = token.LOR
			} else {
				tok = token.OR
			}
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.errf(s.file.Offset(pos), "illegal character %#U", ch)
			}
			insertEOL = s.insertEOL // preserve insertEOL info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	// With dontInsertCommas set, s.insertEOL is never updated here, so no
	// commas are inserted at newlines.
	if s.mode&dontInsertCommas == 0 {
		s.insertEOL = insertEOL
	}

	// The whitespace counters describe the gap before the next token only.
	s.linesSinceLast = 0
	s.spacesSinceLast = 0
	return
}