github.com/syumai/protoreflect@v1.7.1-0.20200810020253-2ac7e3b3a321/desc/protoparse/lexer.go (about) 1 package protoparse 2 3 import ( 4 "bufio" 5 "bytes" 6 "errors" 7 "fmt" 8 "io" 9 "strconv" 10 "strings" 11 "unicode/utf8" 12 ) 13 14 type runeReader struct { 15 rr *bufio.Reader 16 unread []rune 17 err error 18 } 19 20 func (rr *runeReader) readRune() (r rune, size int, err error) { 21 if rr.err != nil { 22 return 0, 0, rr.err 23 } 24 if len(rr.unread) > 0 { 25 r := rr.unread[len(rr.unread)-1] 26 rr.unread = rr.unread[:len(rr.unread)-1] 27 return r, utf8.RuneLen(r), nil 28 } 29 r, sz, err := rr.rr.ReadRune() 30 if err != nil { 31 rr.err = err 32 } 33 return r, sz, err 34 } 35 36 func (rr *runeReader) unreadRune(r rune) { 37 rr.unread = append(rr.unread, r) 38 } 39 40 func lexError(l protoLexer, pos *SourcePos, err string) { 41 pl := l.(*protoLex) 42 _ = pl.errs.handleErrorWithPos(pos, err) 43 } 44 45 type protoLex struct { 46 filename string 47 input *runeReader 48 errs *errorHandler 49 res *fileNode 50 51 lineNo int 52 colNo int 53 offset int 54 55 prevSym terminalNode 56 57 prevLineNo int 58 prevColNo int 59 prevOffset int 60 comments []comment 61 } 62 63 var utf8Bom = []byte{0xEF, 0xBB, 0xBF} 64 65 func newLexer(in io.Reader, filename string, errs *errorHandler) *protoLex { 66 br := bufio.NewReader(in) 67 68 // if file has UTF8 byte order marker preface, consume it 69 marker, err := br.Peek(3) 70 if err == nil && bytes.Equal(marker, utf8Bom) { 71 _, _ = br.Discard(3) 72 } 73 74 return &protoLex{ 75 input: &runeReader{rr: br}, 76 filename: filename, 77 errs: errs, 78 } 79 } 80 81 var keywords = map[string]int{ 82 "syntax": _SYNTAX, 83 "import": _IMPORT, 84 "weak": _WEAK, 85 "public": _PUBLIC, 86 "package": _PACKAGE, 87 "option": _OPTION, 88 "true": _TRUE, 89 "false": _FALSE, 90 "inf": _INF, 91 "nan": _NAN, 92 "repeated": _REPEATED, 93 "optional": _OPTIONAL, 94 "required": _REQUIRED, 95 "double": _DOUBLE, 96 "float": _FLOAT, 97 "int32": _INT32, 98 "int64": _INT64, 99 "uint32": _UINT32, 100 "uint64": _UINT64, 101 "sint32": _SINT32, 102 "sint64": _SINT64, 103 "fixed32": _FIXED32, 104 "fixed64": _FIXED64, 105 "sfixed32": _SFIXED32, 106 "sfixed64": _SFIXED64, 107 "bool": _BOOL, 108 "string": _STRING, 109 "bytes": _BYTES, 110 "group": _GROUP, 111 "oneof": _ONEOF, 112 "map": _MAP, 113 "extensions": _EXTENSIONS, 114 "to": _TO, 115 "max": _MAX, 116 "reserved": _RESERVED, 117 "enum": _ENUM, 118 "message": _MESSAGE, 119 "extend": _EXTEND, 120 "service": _SERVICE, 121 "rpc": _RPC, 122 "stream": _STREAM, 123 "returns": _RETURNS, 124 } 125 126 func (l *protoLex) cur() SourcePos { 127 return SourcePos{ 128 Filename: l.filename, 129 Offset: l.offset, 130 Line: l.lineNo + 1, 131 Col: l.colNo + 1, 132 } 133 } 134 135 func (l *protoLex) adjustPos(consumedChars ...rune) { 136 for _, c := range consumedChars { 137 switch c { 138 case '\n': 139 // new line, back to first column 140 l.colNo = 0 141 l.lineNo++ 142 case '\r': 143 // no adjustment 144 case '\t': 145 // advance to next tab stop 146 mod := l.colNo % 8 147 l.colNo += 8 - mod 148 default: 149 l.colNo++ 150 } 151 } 152 } 153 154 func (l *protoLex) prev() *SourcePos { 155 if l.prevSym == nil { 156 return &SourcePos{ 157 Filename: l.filename, 158 Offset: 0, 159 Line: 1, 160 Col: 1, 161 } 162 } 163 return l.prevSym.start() 164 } 165 166 func (l *protoLex) Lex(lval *protoSymType) int { 167 if l.errs.err != nil { 168 // if error reporter already returned non-nil error, 169 // we can skip the rest of the input 170 return 0 171 } 172 173 l.prevLineNo = l.lineNo 174 l.prevColNo = l.colNo 175 l.prevOffset = l.offset 176 l.comments = nil 177 178 for { 179 c, n, err := l.input.readRune() 180 if err == io.EOF { 181 // we're not actually returning a rune, but this will associate 182 // accumulated comments as a trailing comment on last symbol 183 // (if appropriate) 184 l.setRune(lval) 185 return 0 186 } else if err != nil { 187 // we don't call setError because we don't want it wrapped 188 // with a source position because it's I/O, not syntax 189 lval.err = err 190 _ = l.errs.handleError(err) 191 return _ERROR 192 } 193 194 l.prevLineNo = l.lineNo 195 l.prevColNo = l.colNo 196 l.prevOffset = l.offset 197 198 l.offset += n 199 l.adjustPos(c) 200 if strings.ContainsRune("\n\r\t ", c) { 201 continue 202 } 203 204 if c == '.' { 205 // decimal literals could start with a dot 206 cn, _, err := l.input.readRune() 207 if err != nil { 208 l.setDot(lval) 209 return int(c) 210 } 211 if cn >= '0' && cn <= '9' { 212 l.adjustPos(cn) 213 token := []rune{c, cn} 214 token = l.readNumber(token, false, true) 215 f, err := strconv.ParseFloat(string(token), 64) 216 if err != nil { 217 l.setError(lval, err) 218 return _ERROR 219 } 220 l.setFloat(lval, f) 221 return _FLOAT_LIT 222 } 223 l.input.unreadRune(cn) 224 l.setDot(lval) 225 return int(c) 226 } 227 228 if c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') { 229 // identifier 230 token := []rune{c} 231 token = l.readIdentifier(token) 232 str := string(token) 233 if t, ok := keywords[str]; ok { 234 l.setIdent(lval, str) 235 return t 236 } 237 l.setIdent(lval, str) 238 return _NAME 239 } 240 241 if c >= '0' && c <= '9' { 242 // integer or float literal 243 if c == '0' { 244 cn, _, err := l.input.readRune() 245 if err != nil { 246 l.setInt(lval, 0) 247 return _INT_LIT 248 } 249 if cn == 'x' || cn == 'X' { 250 cnn, _, err := l.input.readRune() 251 if err != nil { 252 l.input.unreadRune(cn) 253 l.setInt(lval, 0) 254 return _INT_LIT 255 } 256 if (cnn >= '0' && cnn <= '9') || (cnn >= 'a' && cnn <= 'f') || (cnn >= 'A' && cnn <= 'F') { 257 // hexadecimal! 258 l.adjustPos(cn, cnn) 259 token := []rune{cnn} 260 token = l.readHexNumber(token) 261 ui, err := strconv.ParseUint(string(token), 16, 64) 262 if err != nil { 263 l.setError(lval, err) 264 return _ERROR 265 } 266 l.setInt(lval, ui) 267 return _INT_LIT 268 } 269 l.input.unreadRune(cnn) 270 l.input.unreadRune(cn) 271 l.setInt(lval, 0) 272 return _INT_LIT 273 } else { 274 l.input.unreadRune(cn) 275 } 276 } 277 token := []rune{c} 278 token = l.readNumber(token, true, true) 279 numstr := string(token) 280 if strings.Contains(numstr, ".") || strings.Contains(numstr, "e") || strings.Contains(numstr, "E") { 281 // floating point! 282 f, err := strconv.ParseFloat(numstr, 64) 283 if err != nil { 284 l.setError(lval, err) 285 return _ERROR 286 } 287 l.setFloat(lval, f) 288 return _FLOAT_LIT 289 } 290 // integer! (decimal or octal) 291 ui, err := strconv.ParseUint(numstr, 0, 64) 292 if err != nil { 293 if numErr, ok := err.(*strconv.NumError); ok && numErr.Err == strconv.ErrRange { 294 // if it's too big to be an int, parse it as a float 295 var f float64 296 f, err = strconv.ParseFloat(numstr, 64) 297 if err == nil { 298 l.setFloat(lval, f) 299 return _FLOAT_LIT 300 } 301 } 302 l.setError(lval, err) 303 return _ERROR 304 } 305 l.setInt(lval, ui) 306 return _INT_LIT 307 } 308 309 if c == '\'' || c == '"' { 310 // string literal 311 str, err := l.readStringLiteral(c) 312 if err != nil { 313 l.setError(lval, err) 314 return _ERROR 315 } 316 l.setString(lval, str) 317 return _STRING_LIT 318 } 319 320 if c == '/' { 321 // comment 322 cn, _, err := l.input.readRune() 323 if err != nil { 324 l.setRune(lval) 325 return int(c) 326 } 327 if cn == '/' { 328 l.adjustPos(cn) 329 hitNewline, txt := l.skipToEndOfLineComment() 330 commentPos := l.posRange() 331 commentPos.end.Col++ 332 if hitNewline { 333 // we don't do this inside of skipToEndOfLineComment 334 // because we want to know the length of previous 335 // line for calculation above 336 l.adjustPos('\n') 337 } 338 l.comments = append(l.comments, comment{posRange: commentPos, text: txt}) 339 continue 340 } 341 if cn == '*' { 342 l.adjustPos(cn) 343 if txt, ok := l.skipToEndOfBlockComment(); !ok { 344 l.setError(lval, errors.New("block comment never terminates, unexpected EOF")) 345 return _ERROR 346 } else { 347 l.comments = append(l.comments, comment{posRange: l.posRange(), text: txt}) 348 } 349 continue 350 } 351 l.input.unreadRune(cn) 352 } 353 354 l.setRune(lval) 355 return int(c) 356 } 357 } 358 359 func (l *protoLex) posRange() posRange { 360 return posRange{ 361 start: SourcePos{ 362 Filename: l.filename, 363 Offset: l.prevOffset, 364 Line: l.prevLineNo + 1, 365 Col: l.prevColNo + 1, 366 }, 367 end: l.cur(), 368 } 369 } 370 371 func (l *protoLex) newBasicNode() basicNode { 372 return basicNode{ 373 posRange: l.posRange(), 374 leading: l.comments, 375 } 376 } 377 378 func (l *protoLex) setPrev(n terminalNode, isDot bool) { 379 nStart := n.start().Line 380 if _, ok := n.(*basicNode); ok { 381 // This is really gross, but there are many cases where we don't want 382 // to attribute comments to punctuation (like commas, equals, semicolons) 383 // and would instead prefer to attribute comments to a more meaningful 384 // element in the AST. 385 // 386 // So if it's a simple node OTHER THAN PERIOD (since that is not just 387 // punctuation but typically part of a qualified identifier), don't 388 // attribute comments to it. We do that with this TOTAL HACK: adjusting 389 // the start line makes leading comments appear detached so logic below 390 // will naturally associated trailing comment to previous symbol 391 if !isDot { 392 nStart += 2 393 } 394 } 395 if l.prevSym != nil && len(n.leadingComments()) > 0 && l.prevSym.end().Line < nStart { 396 // we may need to re-attribute the first comment to 397 // instead be previous node's trailing comment 398 prevEnd := l.prevSym.end().Line 399 comments := n.leadingComments() 400 c := comments[0] 401 commentStart := c.start.Line 402 if commentStart == prevEnd { 403 // comment is on same line as previous symbol 404 n.popLeadingComment() 405 l.prevSym.pushTrailingComment(c) 406 } else if commentStart == prevEnd+1 { 407 // comment is right after previous symbol; see if it is detached 408 // and if so re-attribute 409 singleLineStyle := strings.HasPrefix(c.text, "//") 410 line := c.end.Line 411 groupEnd := -1 412 for i := 1; i < len(comments); i++ { 413 c := comments[i] 414 newGroup := false 415 if !singleLineStyle || c.start.Line > line+1 { 416 // we've found a gap between comments, which means the 417 // previous comments were detached 418 newGroup = true 419 } else { 420 line = c.end.Line 421 singleLineStyle = strings.HasPrefix(comments[i].text, "//") 422 if !singleLineStyle { 423 // we've found a switch from // comments to /* 424 // consider that a new group which means the 425 // previous comments were detached 426 newGroup = true 427 } 428 } 429 if newGroup { 430 groupEnd = i 431 break 432 } 433 } 434 435 if groupEnd == -1 { 436 // just one group of comments; we'll mark it as a trailing 437 // comment if it immediately follows previous symbol and is 438 // detached from current symbol 439 c1 := comments[0] 440 c2 := comments[len(comments)-1] 441 if c1.start.Line <= prevEnd+1 && c2.end.Line < nStart-1 { 442 groupEnd = len(comments) 443 } 444 } 445 446 for i := 0; i < groupEnd; i++ { 447 l.prevSym.pushTrailingComment(n.popLeadingComment()) 448 } 449 } 450 } 451 452 l.prevSym = n 453 } 454 455 func (l *protoLex) setString(lval *protoSymType, val string) { 456 lval.s = &stringLiteralNode{basicNode: l.newBasicNode(), val: val} 457 l.setPrev(lval.s, false) 458 } 459 460 func (l *protoLex) setIdent(lval *protoSymType, val string) { 461 lval.id = &identNode{basicNode: l.newBasicNode(), val: val} 462 l.setPrev(lval.id, false) 463 } 464 465 func (l *protoLex) setInt(lval *protoSymType, val uint64) { 466 lval.i = &intLiteralNode{basicNode: l.newBasicNode(), val: val} 467 l.setPrev(lval.i, false) 468 } 469 470 func (l *protoLex) setFloat(lval *protoSymType, val float64) { 471 lval.f = &floatLiteralNode{basicNode: l.newBasicNode(), val: val} 472 l.setPrev(lval.f, false) 473 } 474 475 func (l *protoLex) setRune(lval *protoSymType) { 476 b := l.newBasicNode() 477 lval.b = &b 478 l.setPrev(lval.b, false) 479 } 480 481 func (l *protoLex) setDot(lval *protoSymType) { 482 b := l.newBasicNode() 483 lval.b = &b 484 l.setPrev(lval.b, true) 485 } 486 487 func (l *protoLex) setError(lval *protoSymType, err error) { 488 lval.err = l.addSourceError(err) 489 } 490 491 func (l *protoLex) readNumber(sofar []rune, allowDot bool, allowExp bool) []rune { 492 token := sofar 493 for { 494 c, _, err := l.input.readRune() 495 if err != nil { 496 break 497 } 498 if c == '.' { 499 if !allowDot { 500 l.input.unreadRune(c) 501 break 502 } 503 allowDot = false 504 } else if c == 'e' || c == 'E' { 505 if !allowExp { 506 l.input.unreadRune(c) 507 break 508 } 509 allowExp = false 510 cn, _, err := l.input.readRune() 511 if err != nil { 512 l.input.unreadRune(c) 513 break 514 } 515 if cn == '-' || cn == '+' { 516 cnn, _, err := l.input.readRune() 517 if err != nil { 518 l.input.unreadRune(cn) 519 l.input.unreadRune(c) 520 break 521 } 522 if cnn < '0' || cnn > '9' { 523 l.input.unreadRune(cnn) 524 l.input.unreadRune(cn) 525 l.input.unreadRune(c) 526 break 527 } 528 l.adjustPos(c) 529 token = append(token, c) 530 c, cn = cn, cnn 531 } else if cn < '0' || cn > '9' { 532 l.input.unreadRune(cn) 533 l.input.unreadRune(c) 534 break 535 } 536 l.adjustPos(c) 537 token = append(token, c) 538 c = cn 539 } else if c < '0' || c > '9' { 540 l.input.unreadRune(c) 541 break 542 } 543 l.adjustPos(c) 544 token = append(token, c) 545 } 546 return token 547 } 548 549 func (l *protoLex) readHexNumber(sofar []rune) []rune { 550 token := sofar 551 for { 552 c, _, err := l.input.readRune() 553 if err != nil { 554 break 555 } 556 if (c < 'a' || c > 'f') && (c < 'A' || c > 'F') && (c < '0' || c > '9') { 557 l.input.unreadRune(c) 558 break 559 } 560 l.adjustPos(c) 561 token = append(token, c) 562 } 563 return token 564 } 565 566 func (l *protoLex) readIdentifier(sofar []rune) []rune { 567 token := sofar 568 for { 569 c, _, err := l.input.readRune() 570 if err != nil { 571 break 572 } 573 if c != '_' && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && (c < '0' || c > '9') { 574 l.input.unreadRune(c) 575 break 576 } 577 l.adjustPos(c) 578 token = append(token, c) 579 } 580 return token 581 } 582 583 func (l *protoLex) readStringLiteral(quote rune) (string, error) { 584 var buf bytes.Buffer 585 for { 586 c, _, err := l.input.readRune() 587 if err != nil { 588 if err == io.EOF { 589 err = io.ErrUnexpectedEOF 590 } 591 return "", err 592 } 593 if c == '\n' { 594 return "", errors.New("encountered end-of-line before end of string literal") 595 } 596 l.adjustPos(c) 597 if c == quote { 598 break 599 } 600 if c == 0 { 601 return "", errors.New("null character ('\\0') not allowed in string literal") 602 } 603 if c == '\\' { 604 // escape sequence 605 c, _, err = l.input.readRune() 606 if err != nil { 607 return "", err 608 } 609 l.adjustPos(c) 610 if c == 'x' || c == 'X' { 611 // hex escape 612 c, _, err := l.input.readRune() 613 if err != nil { 614 return "", err 615 } 616 l.adjustPos(c) 617 c2, _, err := l.input.readRune() 618 if err != nil { 619 return "", err 620 } 621 var hex string 622 if (c2 < '0' || c2 > '9') && (c2 < 'a' || c2 > 'f') && (c2 < 'A' || c2 > 'F') { 623 l.input.unreadRune(c2) 624 hex = string(c) 625 } else { 626 l.adjustPos(c2) 627 hex = string([]rune{c, c2}) 628 } 629 i, err := strconv.ParseInt(hex, 16, 32) 630 if err != nil { 631 return "", fmt.Errorf("invalid hex escape: \\x%q", hex) 632 } 633 buf.WriteByte(byte(i)) 634 635 } else if c >= '0' && c <= '7' { 636 // octal escape 637 c2, _, err := l.input.readRune() 638 if err != nil { 639 return "", err 640 } 641 var octal string 642 if c2 < '0' || c2 > '7' { 643 l.input.unreadRune(c2) 644 octal = string(c) 645 } else { 646 l.adjustPos(c2) 647 c3, _, err := l.input.readRune() 648 if err != nil { 649 return "", err 650 } 651 if c3 < '0' || c3 > '7' { 652 l.input.unreadRune(c3) 653 octal = string([]rune{c, c2}) 654 } else { 655 l.adjustPos(c3) 656 octal = string([]rune{c, c2, c3}) 657 } 658 } 659 i, err := strconv.ParseInt(octal, 8, 32) 660 if err != nil { 661 return "", fmt.Errorf("invalid octal escape: \\%q", octal) 662 } 663 if i > 0xff { 664 return "", fmt.Errorf("octal escape is out range, must be between 0 and 377: \\%q", octal) 665 } 666 buf.WriteByte(byte(i)) 667 668 } else if c == 'u' { 669 // short unicode escape 670 u := make([]rune, 4) 671 for i := range u { 672 c, _, err := l.input.readRune() 673 if err != nil { 674 return "", err 675 } 676 l.adjustPos(c) 677 u[i] = c 678 } 679 i, err := strconv.ParseInt(string(u), 16, 32) 680 if err != nil { 681 return "", fmt.Errorf("invalid unicode escape: \\u%q", string(u)) 682 } 683 buf.WriteRune(rune(i)) 684 685 } else if c == 'U' { 686 // long unicode escape 687 u := make([]rune, 8) 688 for i := range u { 689 c, _, err := l.input.readRune() 690 if err != nil { 691 return "", err 692 } 693 l.adjustPos(c) 694 u[i] = c 695 } 696 i, err := strconv.ParseInt(string(u), 16, 32) 697 if err != nil { 698 return "", fmt.Errorf("invalid unicode escape: \\U%q", string(u)) 699 } 700 if i > 0x10ffff || i < 0 { 701 return "", fmt.Errorf("unicode escape is out of range, must be between 0 and 0x10ffff: \\U%q", string(u)) 702 } 703 buf.WriteRune(rune(i)) 704 705 } else if c == 'a' { 706 buf.WriteByte('\a') 707 } else if c == 'b' { 708 buf.WriteByte('\b') 709 } else if c == 'f' { 710 buf.WriteByte('\f') 711 } else if c == 'n' { 712 buf.WriteByte('\n') 713 } else if c == 'r' { 714 buf.WriteByte('\r') 715 } else if c == 't' { 716 buf.WriteByte('\t') 717 } else if c == 'v' { 718 buf.WriteByte('\v') 719 } else if c == '\\' { 720 buf.WriteByte('\\') 721 } else if c == '\'' { 722 buf.WriteByte('\'') 723 } else if c == '"' { 724 buf.WriteByte('"') 725 } else if c == '?' { 726 buf.WriteByte('?') 727 } else { 728 return "", fmt.Errorf("invalid escape sequence: %q", "\\"+string(c)) 729 } 730 } else { 731 buf.WriteRune(c) 732 } 733 } 734 return buf.String(), nil 735 } 736 737 func (l *protoLex) skipToEndOfLineComment() (bool, string) { 738 txt := []rune{'/', '/'} 739 for { 740 c, _, err := l.input.readRune() 741 if err != nil { 742 return false, string(txt) 743 } 744 if c == '\n' { 745 return true, string(append(txt, '\n')) 746 } 747 l.adjustPos(c) 748 txt = append(txt, c) 749 } 750 } 751 752 func (l *protoLex) skipToEndOfBlockComment() (string, bool) { 753 txt := []rune{'/', '*'} 754 for { 755 c, _, err := l.input.readRune() 756 if err != nil { 757 return "", false 758 } 759 l.adjustPos(c) 760 txt = append(txt, c) 761 if c == '*' { 762 c, _, err := l.input.readRune() 763 if err != nil { 764 return "", false 765 } 766 if c == '/' { 767 l.adjustPos(c) 768 txt = append(txt, c) 769 return string(txt), true 770 } 771 l.input.unreadRune(c) 772 } 773 } 774 } 775 776 func (l *protoLex) addSourceError(err error) ErrorWithPos { 777 ewp, ok := err.(ErrorWithPos) 778 if !ok { 779 ewp = ErrorWithSourcePos{Pos: l.prev(), Underlying: err} 780 } 781 _ = l.errs.handleError(ewp) 782 return ewp 783 } 784 785 func (l *protoLex) Error(s string) { 786 _ = l.addSourceError(errors.New(s)) 787 }