// github.com/hirochachacha/plua@v0.0.0-20170217012138-c82f520cc725/compiler/scanner/scanner.go
//
// Original: src/go/scanner/scanner.go
//
// Copyright 2009 The Go Authors. All rights reserved.
// Portions Copyright 2016 Hiroshi Ioka. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//    * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//    * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//    * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 32 package scanner 33 34 import ( 35 "bytes" 36 "errors" 37 "fmt" 38 "io" 39 "unicode" 40 41 "github.com/hirochachacha/plua/compiler/token" 42 "github.com/hirochachacha/plua/position" 43 ) 44 45 const ( 46 maxConsecutiveEmptyReads = 100 47 bom1 = 0xFE 48 bom = "\xFE\xFF" 49 utf8bom1 = 0xEF 50 utf8bom = "\xEF\xBB\xBF" 51 ) 52 53 var ( 54 errInvalidLongStringDelimiter = errors.New("invalid long string delimiter") 55 errIllegalHexadecimalNumber = errors.New("illegal hexadecimal number") 56 errIllegalNumber = errors.New("illegal number") 57 errInvalidEscapeSequence = errors.New("escape sequence is invalid Unicode code point") 58 errUnknownEscapeSequence = errors.New("unknown escape sequence") 59 errMissingBracketInEscapeSequence = errors.New("missing bracket in escape sequence") 60 errIllegalCharacterInEscapeSequence = errors.New("illegal character in escape sequence") 61 errUnterminatedString = errors.New("unterminated string literal") 62 errUnterminatedLongString = errors.New("unterminated long string literal") 63 ) 64 65 type Mode uint 66 67 const ( 68 ScanComments = 1 << iota 69 ) 70 71 type ScanState struct { 72 sourceName string 73 shebang string 74 75 mode Mode 76 77 r io.Reader 78 79 buf []byte 80 start int 81 end int 82 _mark int 83 filled bool 84 85 clip bytes.Buffer 86 87 ch int 88 89 offset int 90 lineOffset int 91 line int 92 93 err error 94 } 95 96 type bailout struct{} 97 98 func Scan(r io.Reader, srcname string, mode Mode) *ScanState { 99 s := &ScanState{ 100 r: r, 101 sourceName: srcname, 102 buf: make([]byte, 4096), 103 mode: mode, 104 _mark: -1, 105 lineOffset: -1, 106 line: 1, 107 } 108 109 return s 110 } 111 112 func (s *ScanState) Reset(r io.Reader, srcname string, mode Mode) { 113 s.sourceName = srcname 114 s.shebang = "" 115 116 s.mode = mode 117 s.r = r 118 119 s.start = 0 120 s.end = 0 121 s._mark = -1 122 s.filled = false 123 124 s.clip.Reset() 125 126 s.offset = 0 127 s.lineOffset = -1 128 s.line = 1 129 130 s.err = nil 131 } 132 133 func 
(s *ScanState) SourceName() string { 134 return s.sourceName 135 } 136 137 func (s *ScanState) Shebang() string { 138 return s.shebang 139 } 140 141 func (s *ScanState) Token() (tok token.Token, err error) { 142 var typ token.Type 143 var pos position.Position 144 var lit string 145 146 defer func() { 147 if r := recover(); r != nil { 148 _ = r.(bailout) 149 150 err = s.err 151 tok = token.Token{Type: typ, Pos: pos, Lit: lit} 152 s.err = nil 153 s._mark = -1 154 } 155 }() 156 157 if s.offset == 0 { 158 s.init() 159 160 if s.ch == bom1 || s.ch == utf8bom1 { 161 s.skipBom() 162 } 163 164 if s.ch == '#' { 165 s.shebang = s.scanSheBang() 166 } 167 } 168 169 scanAgain: 170 s.skipSpace() 171 172 pos = s.pos() 173 174 switch ch := s.ch; { 175 case isLetter(ch): 176 lit = s.scanIdentifier() 177 if len(lit) > 1 { 178 // keywords are longer than one letter - avoid lookup otherwise 179 typ = token.Lookup(lit) 180 } else { 181 typ = token.NAME 182 } 183 case isDigit(ch): 184 typ, lit = s.scanNumber(false) 185 default: 186 switch ch { 187 case -1: 188 typ = token.EOF 189 case '"', '\'': 190 typ = token.STRING 191 lit = s.scanString(ch) 192 case ':': 193 s.next() 194 195 if s.ch == ':' { 196 s.next() 197 typ = token.LABEL 198 } else { 199 typ = token.COLON 200 } 201 case '.': 202 switch p := s.peek(2); p { 203 case "..": 204 s.next() 205 s.next() 206 if s.ch == '.' 
{ 207 s.next() 208 typ = token.ELLIPSIS 209 } else { 210 typ = token.CONCAT 211 } 212 default: 213 if len(p) == 2 && '0' <= p[1] && p[1] <= '9' { 214 typ, lit = s.scanNumber(true) 215 } else { 216 s.next() 217 typ = token.PERIOD 218 } 219 } 220 case ',': 221 s.next() 222 223 typ = token.COMMA 224 case ';': 225 s.next() 226 227 typ = token.SEMICOLON 228 case '(': 229 s.next() 230 231 typ = token.LPAREN 232 case ')': 233 s.next() 234 235 typ = token.RPAREN 236 case '{': 237 s.next() 238 239 typ = token.LBRACE 240 case '}': 241 s.next() 242 243 typ = token.RBRACE 244 case '[': 245 switch s.peek(2) { 246 case "[[": 247 typ = token.STRING 248 lit = s.scanLongString(true) 249 case "[=": 250 typ = token.STRING 251 lit = s.scanLongString(false) 252 default: 253 s.next() 254 255 typ = token.LBRACK 256 } 257 case ']': 258 s.next() 259 260 typ = token.RBRACK 261 case '+': 262 s.next() 263 264 typ = token.ADD 265 case '-': 266 if s.peek(2) == "--" { 267 typ = token.COMMENT 268 269 lit = s.scanComment() 270 271 if s.mode&ScanComments == 0 { 272 goto scanAgain 273 } 274 } else { 275 s.next() 276 277 typ = token.SUB 278 } 279 case '*': 280 s.next() 281 282 typ = token.MUL 283 case '%': 284 s.next() 285 286 typ = token.MOD 287 case '^': 288 s.next() 289 290 typ = token.POW 291 case '/': 292 s.next() 293 294 if s.ch == '/' { 295 s.next() 296 typ = token.IDIV 297 } else { 298 typ = token.DIV 299 } 300 case '&': 301 s.next() 302 303 typ = token.BAND 304 case '|': 305 s.next() 306 307 typ = token.BOR 308 case '~': 309 s.next() 310 311 if s.ch == '=' { 312 s.next() 313 typ = token.NE 314 } else { 315 typ = token.BXOR 316 } 317 case '<': 318 s.next() 319 320 switch s.ch { 321 case '<': 322 s.next() 323 typ = token.SHL 324 case '=': 325 s.next() 326 typ = token.LE 327 default: 328 typ = token.LT 329 } 330 case '>': 331 s.next() 332 333 switch s.ch { 334 case '>': 335 s.next() 336 typ = token.SHR 337 case '=': 338 s.next() 339 typ = token.GE 340 default: 341 typ = token.GT 342 } 343 case 
'=': 344 s.next() 345 346 if s.ch == '=' { 347 s.next() 348 typ = token.EQ 349 } else { 350 typ = token.ASSIGN 351 } 352 case '#': 353 s.next() 354 355 typ = token.LEN 356 default: 357 s.next() 358 s.error(pos, fmt.Errorf("illegal character %c", ch)) 359 typ = token.ILLEGAL 360 lit = string(ch) 361 } 362 } 363 364 tok = token.Token{Type: typ, Pos: pos, Lit: lit} 365 366 return 367 } 368 369 func (s *ScanState) skipBom() { 370 switch { 371 case s.ch == bom1 && s.peek(2) == bom: 372 s.next() 373 s.next() 374 case s.ch == utf8bom1 && s.peek(3) == utf8bom: 375 s.next() 376 s.next() 377 s.next() 378 } 379 } 380 381 func trimRightCR(s string) string { 382 if len(s) > 0 && s[len(s)-1] == '\r' { 383 s = s[:len(s)-1] 384 } 385 return s 386 } 387 388 func (s *ScanState) scanSheBang() (shebang string) { 389 s.mark() 390 391 s.next() 392 for s.ch != '\n' { 393 if s.ch == -1 { 394 return trimRightCR(s.capture()) 395 } 396 s.next() 397 } 398 399 shebang = trimRightCR(s.capture()) 400 401 s.next() 402 403 return 404 } 405 406 func (s *ScanState) scanComment() (lit string) { 407 var err error 408 409 s.mark() 410 411 s.next() // skip '-' 412 s.next() // skip '-' 413 414 if s.ch == '[' { 415 s.next() 416 switch s.ch { 417 case '[': 418 err = s.skipLongString(true, true) 419 if err != nil { 420 s.error(s.pos(), err) 421 } 422 423 lit = s.capture() 424 425 return 426 case '=': 427 err = s.skipLongString(true, false) 428 if err != nil { 429 s.error(s.pos(), err) 430 } 431 432 lit = s.capture() 433 434 return 435 } 436 } 437 438 for s.ch != '\n' && s.ch >= 0 { 439 s.next() 440 } 441 442 lit = trimRightCR(s.capture()) 443 444 return 445 } 446 447 func (s *ScanState) scanIdentifier() (lit string) { 448 s.mark() 449 450 s.next() 451 452 for isLetter(s.ch) || isDigit(s.ch) { 453 s.next() 454 } 455 456 return s.capture() 457 } 458 459 func (s *ScanState) skipMantissa(base int) { 460 for digitVal(s.ch) < base { 461 s.next() 462 } 463 } 464 465 func (s *ScanState) scanNumber(seenDecimalPoint 
bool) (tok token.Type, lit string) { 466 s.mark() 467 468 tok = token.INT 469 470 base := 10 471 472 ioff := s.offset 473 ipos := s.pos() 474 475 if seenDecimalPoint { 476 s.next() // skip . 477 tok = token.FLOAT 478 479 if s.ch == '.' { 480 s.error(s.pos(), errIllegalNumber) 481 } 482 483 s.skipMantissa(base) 484 485 goto exponent 486 } 487 488 if s.ch == '0' { 489 // int or float 490 s.next() 491 492 // hexadecimal int or float 493 if s.ch == 'x' || s.ch == 'X' { 494 s.next() 495 496 base = 16 497 } 498 } 499 500 s.skipMantissa(base) 501 502 if s.ch == '.' { 503 tok = token.FLOAT 504 s.next() 505 506 if s.ch == '.' { 507 s.error(s.pos(), errIllegalNumber) 508 } 509 510 s.skipMantissa(base) 511 } 512 513 exponent: 514 if base == 16 { 515 if s.offset-ioff <= 2 { 516 // only scanned "0x" or "0X" 517 s.error(ipos, errIllegalHexadecimalNumber) 518 } 519 520 if s.ch == 'p' || s.ch == 'P' { 521 tok = token.FLOAT 522 s.next() 523 524 if s.ch == '-' || s.ch == '+' { 525 s.next() 526 } 527 528 poff := s.offset 529 530 s.skipMantissa(10) 531 532 if s.offset-poff == 0 { 533 // only scanned "p" 534 s.error(s.pos(), errIllegalHexadecimalNumber) 535 } 536 } 537 } else { 538 if s.ch == 'e' || s.ch == 'E' { 539 tok = token.FLOAT 540 s.next() 541 542 if s.ch == '-' || s.ch == '+' { 543 s.next() 544 } 545 546 poff := s.offset 547 548 s.skipMantissa(base) 549 550 if s.offset-poff == 0 { 551 // only scanned "e" 552 s.error(s.pos(), errIllegalNumber) 553 } 554 } 555 } 556 557 lit = s.capture() 558 559 return 560 } 561 562 func (s *ScanState) scanString(quote int) (lit string) { 563 s.mark() 564 565 s.next() 566 567 for s.ch != quote { 568 if s.ch == '\n' || s.ch == '\r' || s.ch < 0 { 569 lit = s.capture() 570 571 s.error(s.pos(), errUnterminatedString) 572 573 return 574 } 575 576 if s.ch == '\\' { 577 s.skipEscape(quote) 578 } else { 579 s.next() 580 } 581 } 582 583 s.next() 584 585 lit = s.capture() 586 587 return 588 } 589 590 func (s *ScanState) skipEscape(quote int) { 591 
s.next() 592 593 pos := s.pos() 594 595 var pred func(int) bool 596 var i, base, max uint32 597 598 switch s.ch { 599 case '\r': 600 s.next() 601 if s.ch == '\n' { // CRLN 602 s.next() 603 } 604 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\n', '\'', '"': 605 s.next() 606 return 607 case 'z': 608 s.next() 609 s.skipSpace() 610 return 611 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 612 i, base, max = 3, 10, 255 613 pred = isDigit 614 case 'x': 615 s.next() 616 i, base, max = 2, 16, 255 617 pred = isXdigit 618 case 'u': 619 s.next() 620 621 if s.ch != '{' { 622 s.error(pos, errMissingBracketInEscapeSequence) 623 624 return 625 } 626 627 s.next() 628 629 i, base, max = 8, 16, unicode.MaxRune 630 pred = isXdigit 631 default: 632 ch := s.ch 633 s.next() // always make progress 634 s.error(pos, fmt.Errorf("unknown escape sequence %c", ch)) 635 636 return 637 } 638 639 var x uint32 640 641 j := i 642 for ; j > 0 && s.ch != quote && pred(s.ch); j-- { 643 d := uint32(digitVal(s.ch)) 644 if d >= base { 645 // if not unicode 646 if max != unicode.MaxRune { 647 s.error(pos, fmt.Errorf("illegal character %c in escape sequence", s.ch)) 648 } 649 650 break 651 } 652 653 // check overflow 654 if x > (unicode.MaxRune-d)/base { 655 s.error(pos, fmt.Errorf("escape sequence is invalid Unicode code point %c", s.ch)) 656 657 return 658 } 659 660 x = x*base + d 661 662 s.next() 663 } 664 665 // hex 666 if i == 2 { 667 if j > 0 { 668 s.error(pos, errUnknownEscapeSequence) 669 670 return 671 } 672 } 673 674 // unicode 675 if max == unicode.MaxRune { 676 if s.ch != '}' { 677 s.error(pos, errMissingBracketInEscapeSequence) 678 679 return 680 } 681 682 s.next() 683 684 if 0xD800 <= x && x < 0xE000 { 685 s.error(pos, fmt.Errorf("escape sequence is invalid Unicode code point %c", s.ch)) 686 } 687 688 return 689 } 690 691 if x > max { 692 s.error(pos, errInvalidEscapeSequence) 693 } 694 } 695 696 func (s *ScanState) scanLongString(simple bool) (lit string) { 697 var err error 698 
699 s.mark() 700 701 s.next() 702 703 err = s.skipLongString(false, simple) 704 if err != nil { 705 s.error(s.pos(), err) 706 } 707 708 lit = s.capture() 709 710 return 711 } 712 713 func (s *ScanState) skipLongString(comment bool, simple bool) (err error) { 714 s.next() 715 716 if simple { 717 for { 718 for s.ch != ']' { 719 if s.ch < 0 { 720 err = errUnterminatedLongString 721 722 return 723 } 724 s.next() 725 } 726 727 s.next() 728 729 if s.ch == ']' { 730 s.next() 731 break 732 } 733 } 734 735 return 736 } 737 738 depth := 1 739 740 for s.ch == '=' { 741 depth++ 742 s.next() 743 } 744 745 if s.ch != '[' { 746 if comment { 747 for s.ch != '\n' && s.ch != '\r' && s.ch >= 0 { 748 s.next() 749 } 750 return 751 } 752 753 err = errInvalidLongStringDelimiter 754 755 return 756 } 757 758 s.next() 759 760 for { 761 _depth := depth 762 for s.ch != ']' { 763 if s.ch < 0 { 764 err = errUnterminatedLongString 765 766 return 767 } 768 s.next() 769 } 770 771 s.next() 772 773 for s.ch == '=' { 774 _depth-- 775 s.next() 776 } 777 778 if _depth != 0 { 779 continue 780 } 781 782 if s.ch == ']' { 783 s.next() 784 break 785 } 786 } 787 788 return 789 } 790 791 func (s *ScanState) skipSpace() { 792 for isSpace(s.ch) { 793 s.next() 794 } 795 } 796 797 func (s *ScanState) error(pos position.Position, err error) { 798 pos.SourceName = s.sourceName 799 800 s.err = &Error{ 801 Pos: pos, 802 Err: err, 803 } 804 805 panic(bailout{}) 806 } 807 808 func (s *ScanState) pos() position.Position { 809 return position.Position{ 810 Line: s.line, 811 Column: s.offset - s.lineOffset, 812 } 813 } 814 815 func (s *ScanState) mark() { 816 if s._mark != -1 { 817 panic("mark twice") 818 } 819 820 s._mark = s.start 821 } 822 823 func (s *ScanState) capture() string { 824 if s._mark == -1 { 825 panic("no mark") 826 } 827 828 buf := s.buf[s._mark:s.start] 829 830 s._mark = -1 831 832 if s.clip.Len() > 0 { 833 s.clip.Write(buf) 834 buf = s.clip.Bytes() 835 s.clip.Reset() 836 } 837 838 return string(buf) 839 
} 840 841 func (s *ScanState) init() { 842 s.fill() 843 844 if s.start == s.end { 845 s.ch = -1 846 s.start = 0 847 s.end = 0 848 849 return 850 } 851 852 s.ch = int(s.buf[s.start]) 853 } 854 855 func (s *ScanState) next() { 856 if s.ch == -1 { 857 return 858 } 859 860 if s.ch == '\n' { 861 s.lineOffset = s.offset 862 s.line++ 863 } 864 865 s.start++ 866 s.offset++ 867 868 if s.start == s.end { 869 s.fill() 870 if s.start == s.end { 871 s.ch = -1 872 s.start = 0 873 s.end = 0 874 875 return 876 } 877 } 878 879 s.ch = int(s.buf[s.start]) 880 } 881 882 func (s *ScanState) peek(n int) string { 883 if n > s.end-s.start { 884 s.fill() 885 if n > s.end-s.start { 886 return string(s.buf[s.start:s.end]) 887 } 888 } 889 890 return string(s.buf[s.start : s.start+n]) 891 } 892 893 func (s *ScanState) fill() { 894 if s.filled { 895 return 896 } 897 898 if s.start > 0 { 899 if s._mark != -1 { 900 s.clip.Write(s.buf[s._mark:s.start]) 901 902 s._mark = 0 903 } 904 905 copy(s.buf, s.buf[s.start:s.end]) 906 s.end -= s.start 907 s.start = 0 908 } 909 910 for i := maxConsecutiveEmptyReads; i > 0; i-- { 911 n, err := s.r.Read(s.buf[s.end:]) 912 if err == io.EOF { 913 s.filled = true 914 915 return 916 } 917 if n < 0 { 918 panic("reader returned negative count from Read") 919 } 920 s.end += n 921 if err != nil { 922 s.error(position.NoPos, err) 923 return 924 } 925 926 if n > 0 { 927 return 928 } 929 } 930 s.error(position.NoPos, io.ErrNoProgress) 931 } 932 933 func digitVal(ch int) int { 934 switch { 935 case uint(ch)-'0' < 10: 936 return int(ch - '0') 937 case uint(ch)-'a' < 6: 938 return int(ch - 'a' + 10) 939 case uint(ch)-'A' < 6: 940 return int(ch - 'A' + 10) 941 } 942 943 return 16 // larger than any legal digit val 944 } 945 946 func isSpace(ch int) bool { 947 return ch == ' ' || uint(ch)-'\t' < 5 948 } 949 950 func isLetter(ch int) bool { 951 return ch == '_' || (uint(ch)|32)-'a' < 26 952 } 953 954 func isDigit(ch int) bool { 955 return uint(ch)-'0' < 10 956 } 957 958 func 
isXdigit(ch int) bool { 959 return uint(ch)-'0' < 10 || (uint(ch)|32)-'a' < 6 960 }