github.com/bakjos/protoreflect@v1.9.2/desc/protoparse/lexer.go (about) 1 package protoparse 2 3 import ( 4 "bufio" 5 "bytes" 6 "errors" 7 "fmt" 8 "io" 9 "strconv" 10 "strings" 11 "unicode/utf8" 12 13 "github.com/bakjos/protoreflect/desc/protoparse/ast" 14 ) 15 16 type runeReader struct { 17 rr *bufio.Reader 18 marked []rune 19 unread []rune 20 err error 21 } 22 23 func (rr *runeReader) readRune() (r rune, size int, err error) { 24 if rr.err != nil { 25 return 0, 0, rr.err 26 } 27 if len(rr.unread) > 0 { 28 r := rr.unread[len(rr.unread)-1] 29 rr.unread = rr.unread[:len(rr.unread)-1] 30 if rr.marked != nil { 31 rr.marked = append(rr.marked, r) 32 } 33 return r, utf8.RuneLen(r), nil 34 } 35 r, sz, err := rr.rr.ReadRune() 36 if err != nil { 37 rr.err = err 38 } else if rr.marked != nil { 39 rr.marked = append(rr.marked, r) 40 } 41 return r, sz, err 42 } 43 44 func (rr *runeReader) unreadRune(r rune) { 45 if rr.marked != nil { 46 if rr.marked[len(rr.marked)-1] != r { 47 panic("unread rune is not the same as last marked rune!") 48 } 49 rr.marked = rr.marked[:len(rr.marked)-1] 50 } 51 rr.unread = append(rr.unread, r) 52 } 53 54 func (rr *runeReader) startMark(initial rune) { 55 rr.marked = []rune{initial} 56 } 57 58 func (rr *runeReader) endMark() string { 59 m := string(rr.marked) 60 rr.marked = rr.marked[:0] 61 return m 62 } 63 64 func lexError(l protoLexer, pos *SourcePos, err string) { 65 pl := l.(*protoLex) 66 _ = pl.errs.handleErrorWithPos(pos, err) 67 } 68 69 type protoLex struct { 70 filename string 71 input *runeReader 72 errs *errorHandler 73 res *ast.FileNode 74 75 lineNo int 76 colNo int 77 offset int 78 79 prevSym ast.TerminalNode 80 eof ast.TerminalNode 81 82 prevLineNo int 83 prevColNo int 84 prevOffset int 85 comments []ast.Comment 86 ws []rune 87 } 88 89 var utf8Bom = []byte{0xEF, 0xBB, 0xBF} 90 91 func newLexer(in io.Reader, filename string, errs *errorHandler) *protoLex { 92 br := bufio.NewReader(in) 93 94 // if file has UTF8 byte order marker preface, consume it 95 marker, err := br.Peek(3) 96 if err == nil && bytes.Equal(marker, utf8Bom) { 97 _, _ = br.Discard(3) 98 } 99 100 return &protoLex{ 101 input: &runeReader{rr: br}, 102 filename: filename, 103 errs: errs, 104 } 105 } 106 107 var keywords = map[string]int{ 108 "syntax": _SYNTAX, 109 "import": _IMPORT, 110 "weak": _WEAK, 111 "public": _PUBLIC, 112 "package": _PACKAGE, 113 "option": _OPTION, 114 "true": _TRUE, 115 "false": _FALSE, 116 "inf": _INF, 117 "nan": _NAN, 118 "repeated": _REPEATED, 119 "optional": _OPTIONAL, 120 "required": _REQUIRED, 121 "double": _DOUBLE, 122 "float": _FLOAT, 123 "int32": _INT32, 124 "int64": _INT64, 125 "uint32": _UINT32, 126 "uint64": _UINT64, 127 "sint32": _SINT32, 128 "sint64": _SINT64, 129 "fixed32": _FIXED32, 130 "fixed64": _FIXED64, 131 "sfixed32": _SFIXED32, 132 "sfixed64": _SFIXED64, 133 "bool": _BOOL, 134 "string": _STRING, 135 "bytes": _BYTES, 136 "group": _GROUP, 137 "oneof": _ONEOF, 138 "map": _MAP, 139 "extensions": _EXTENSIONS, 140 "to": _TO, 141 "max": _MAX, 142 "reserved": _RESERVED, 143 "enum": _ENUM, 144 "message": _MESSAGE, 145 "extend": _EXTEND, 146 "service": _SERVICE, 147 "rpc": _RPC, 148 "stream": _STREAM, 149 "returns": _RETURNS, 150 } 151 152 func (l *protoLex) cur() SourcePos { 153 return SourcePos{ 154 Filename: l.filename, 155 Offset: l.offset, 156 Line: l.lineNo + 1, 157 Col: l.colNo + 1, 158 } 159 } 160 161 func (l *protoLex) adjustPos(consumedChars ...rune) { 162 for _, c := range consumedChars { 163 switch c { 164 case '\n': 165 // new line, back to first column 166 l.colNo = 0 167 l.lineNo++ 168 case '\r': 169 // no adjustment 170 case '\t': 171 // advance to next tab stop 172 mod := l.colNo % 8 173 l.colNo += 8 - mod 174 default: 175 l.colNo++ 176 } 177 } 178 } 179 180 func (l *protoLex) prev() *SourcePos { 181 if l.prevSym == nil { 182 return &SourcePos{ 183 Filename: l.filename, 184 Offset: 0, 185 Line: 1, 186 Col: 1, 187 } 188 } 189 return l.prevSym.Start() 190 } 191 192 func (l *protoLex) Lex(lval *protoSymType) int { 193 if l.errs.err != nil { 194 // if error reporter already returned non-nil error, 195 // we can skip the rest of the input 196 return 0 197 } 198 199 l.prevLineNo = l.lineNo 200 l.prevColNo = l.colNo 201 l.prevOffset = l.offset 202 l.comments = nil 203 l.ws = nil 204 l.input.endMark() // reset, just in case 205 206 for { 207 c, n, err := l.input.readRune() 208 if err == io.EOF { 209 // we're not actually returning a rune, but this will associate 210 // accumulated comments as a trailing comment on last symbol 211 // (if appropriate) 212 l.setRune(lval, 0) 213 l.eof = lval.b 214 return 0 215 } else if err != nil { 216 // we don't call setError because we don't want it wrapped 217 // with a source position because it's I/O, not syntax 218 lval.err = err 219 _ = l.errs.handleError(err) 220 return _ERROR 221 } 222 223 l.prevLineNo = l.lineNo 224 l.prevColNo = l.colNo 225 l.prevOffset = l.offset 226 227 l.offset += n 228 l.adjustPos(c) 229 if strings.ContainsRune("\n\r\t ", c) { 230 l.ws = append(l.ws, c) 231 continue 232 } 233 234 l.input.startMark(c) 235 if c == '.' { 236 // decimal literals could start with a dot 237 cn, _, err := l.input.readRune() 238 if err != nil { 239 l.setRune(lval, c) 240 return int(c) 241 } 242 if cn >= '0' && cn <= '9' { 243 l.adjustPos(cn) 244 token := []rune{c, cn} 245 token = l.readNumber(token, false, true) 246 f, err := strconv.ParseFloat(string(token), 64) 247 if err != nil { 248 l.setError(lval, err) 249 return _ERROR 250 } 251 l.setFloat(lval, f) 252 return _FLOAT_LIT 253 } 254 l.input.unreadRune(cn) 255 l.setRune(lval, c) 256 return int(c) 257 } 258 259 if c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') { 260 // identifier 261 token := []rune{c} 262 token = l.readIdentifier(token) 263 str := string(token) 264 if t, ok := keywords[str]; ok { 265 l.setIdent(lval, str) 266 return t 267 } 268 l.setIdent(lval, str) 269 return _NAME 270 } 271 272 if c >= '0' && c <= '9' { 273 // integer or float literal 274 if c == '0' { 275 cn, _, err := l.input.readRune() 276 if err != nil { 277 l.setInt(lval, 0) 278 return _INT_LIT 279 } 280 if cn == 'x' || cn == 'X' { 281 cnn, _, err := l.input.readRune() 282 if err != nil { 283 l.input.unreadRune(cn) 284 l.setInt(lval, 0) 285 return _INT_LIT 286 } 287 if (cnn >= '0' && cnn <= '9') || (cnn >= 'a' && cnn <= 'f') || (cnn >= 'A' && cnn <= 'F') { 288 // hexadecimal! 289 l.adjustPos(cn, cnn) 290 token := []rune{cnn} 291 token = l.readHexNumber(token) 292 ui, err := strconv.ParseUint(string(token), 16, 64) 293 if err != nil { 294 l.setError(lval, err) 295 return _ERROR 296 } 297 l.setInt(lval, ui) 298 return _INT_LIT 299 } 300 l.input.unreadRune(cnn) 301 l.input.unreadRune(cn) 302 l.setInt(lval, 0) 303 return _INT_LIT 304 } else { 305 l.input.unreadRune(cn) 306 } 307 } 308 token := []rune{c} 309 token = l.readNumber(token, true, true) 310 numstr := string(token) 311 if strings.Contains(numstr, ".") || strings.Contains(numstr, "e") || strings.Contains(numstr, "E") { 312 // floating point! 313 f, err := strconv.ParseFloat(numstr, 64) 314 if err != nil { 315 l.setError(lval, err) 316 return _ERROR 317 } 318 l.setFloat(lval, f) 319 return _FLOAT_LIT 320 } 321 // integer! (decimal or octal) 322 ui, err := strconv.ParseUint(numstr, 0, 64) 323 if err != nil { 324 if numErr, ok := err.(*strconv.NumError); ok && numErr.Err == strconv.ErrRange { 325 // if it's too big to be an int, parse it as a float 326 var f float64 327 f, err = strconv.ParseFloat(numstr, 64) 328 if err == nil { 329 l.setFloat(lval, f) 330 return _FLOAT_LIT 331 } 332 } 333 l.setError(lval, err) 334 return _ERROR 335 } 336 l.setInt(lval, ui) 337 return _INT_LIT 338 } 339 340 if c == '\'' || c == '"' { 341 // string literal 342 str, err := l.readStringLiteral(c) 343 if err != nil { 344 l.setError(lval, err) 345 return _ERROR 346 } 347 l.setString(lval, str) 348 return _STRING_LIT 349 } 350 351 if c == '/' { 352 // comment 353 cn, _, err := l.input.readRune() 354 if err != nil { 355 l.setRune(lval, '/') 356 return int(c) 357 } 358 if cn == '/' { 359 l.adjustPos(cn) 360 hitNewline := l.skipToEndOfLineComment() 361 comment := l.newComment() 362 comment.PosRange.End.Col++ 363 if hitNewline { 364 // we don't do this inside of skipToEndOfLineComment 365 // because we want to know the length of previous 366 // line for calculation above 367 l.adjustPos('\n') 368 } 369 l.comments = append(l.comments, comment) 370 continue 371 } 372 if cn == '*' { 373 l.adjustPos(cn) 374 if ok := l.skipToEndOfBlockComment(); !ok { 375 l.setError(lval, errors.New("block comment never terminates, unexpected EOF")) 376 return _ERROR 377 } else { 378 l.comments = append(l.comments, l.newComment()) 379 } 380 continue 381 } 382 l.input.unreadRune(cn) 383 } 384 385 l.setRune(lval, c) 386 return int(c) 387 } 388 } 389 390 func (l *protoLex) posRange() ast.PosRange { 391 return ast.PosRange{ 392 Start: SourcePos{ 393 Filename: l.filename, 394 Offset: l.prevOffset, 395 Line: l.prevLineNo + 1, 396 Col: l.prevColNo + 1, 397 }, 398 End: l.cur(), 399 } 400 } 401 402 func (l *protoLex) newComment() ast.Comment { 403 ws := string(l.ws) 404 l.ws = l.ws[:0] 405 return ast.Comment{ 406 PosRange: l.posRange(), 407 LeadingWhitespace: ws, 408 Text: l.input.endMark(), 409 } 410 } 411 412 func (l *protoLex) newTokenInfo() ast.TokenInfo { 413 ws := string(l.ws) 414 l.ws = nil 415 return ast.TokenInfo{ 416 PosRange: l.posRange(), 417 LeadingComments: l.comments, 418 LeadingWhitespace: ws, 419 RawText: l.input.endMark(), 420 } 421 } 422 423 func (l *protoLex) setPrev(n ast.TerminalNode, isDot bool) { 424 nStart := n.Start().Line 425 if _, ok := n.(*ast.RuneNode); ok { 426 // This is really gross, but there are many cases where we don't want 427 // to attribute comments to punctuation (like commas, equals, semicolons) 428 // and would instead prefer to attribute comments to a more meaningful 429 // element in the AST. 430 // 431 // So if it's a simple node OTHER THAN PERIOD (since that is not just 432 // punctuation but typically part of a qualified identifier), don't 433 // attribute comments to it. We do that with this TOTAL HACK: adjusting 434 // the start line makes leading comments appear detached so logic below 435 // will naturally associated trailing comment to previous symbol 436 if !isDot { 437 nStart += 2 438 } 439 } 440 if l.prevSym != nil && len(n.LeadingComments()) > 0 && l.prevSym.End().Line < nStart { 441 // we may need to re-attribute the first comment to 442 // instead be previous node's trailing comment 443 prevEnd := l.prevSym.End().Line 444 comments := n.LeadingComments() 445 c := comments[0] 446 commentStart := c.Start.Line 447 if commentStart == prevEnd { 448 // comment is on same line as previous symbol 449 n.PopLeadingComment() 450 l.prevSym.PushTrailingComment(c) 451 } else if commentStart == prevEnd+1 { 452 // comment is right after previous symbol; see if it is detached 453 // and if so re-attribute 454 singleLineStyle := strings.HasPrefix(c.Text, "//") 455 line := c.End.Line 456 groupEnd := -1 457 for i := 1; i < len(comments); i++ { 458 c := comments[i] 459 newGroup := false 460 if !singleLineStyle || c.Start.Line > line+1 { 461 // we've found a gap between comments, which means the 462 // previous comments were detached 463 newGroup = true 464 } else { 465 line = c.End.Line 466 singleLineStyle = strings.HasPrefix(comments[i].Text, "//") 467 if !singleLineStyle { 468 // we've found a switch from // comments to /* 469 // consider that a new group which means the 470 // previous comments were detached 471 newGroup = true 472 } 473 } 474 if newGroup { 475 groupEnd = i 476 break 477 } 478 } 479 480 if groupEnd == -1 { 481 // just one group of comments; we'll mark it as a trailing 482 // comment if it immediately follows previous symbol and is 483 // detached from current symbol 484 c1 := comments[0] 485 c2 := comments[len(comments)-1] 486 if c1.Start.Line <= prevEnd+1 && c2.End.Line < nStart-1 { 487 groupEnd = len(comments) 488 } 489 } 490 491 for i := 0; i < groupEnd; i++ { 492 l.prevSym.PushTrailingComment(n.PopLeadingComment()) 493 } 494 } 495 } 496 497 l.prevSym = n 498 } 499 500 func (l *protoLex) setString(lval *protoSymType, val string) { 501 lval.s = ast.NewStringLiteralNode(val, l.newTokenInfo()) 502 l.setPrev(lval.s, false) 503 } 504 505 func (l *protoLex) setIdent(lval *protoSymType, val string) { 506 lval.id = ast.NewIdentNode(val, l.newTokenInfo()) 507 l.setPrev(lval.id, false) 508 } 509 510 func (l *protoLex) setInt(lval *protoSymType, val uint64) { 511 lval.i = ast.NewUintLiteralNode(val, l.newTokenInfo()) 512 l.setPrev(lval.i, false) 513 } 514 515 func (l *protoLex) setFloat(lval *protoSymType, val float64) { 516 lval.f = ast.NewFloatLiteralNode(val, l.newTokenInfo()) 517 l.setPrev(lval.f, false) 518 } 519 520 func (l *protoLex) setRune(lval *protoSymType, val rune) { 521 lval.b = ast.NewRuneNode(val, l.newTokenInfo()) 522 l.setPrev(lval.b, val == '.') 523 } 524 525 func (l *protoLex) setError(lval *protoSymType, err error) { 526 lval.err = l.addSourceError(err) 527 } 528 529 func (l *protoLex) readNumber(sofar []rune, allowDot bool, allowExp bool) []rune { 530 token := sofar 531 for { 532 c, _, err := l.input.readRune() 533 if err != nil { 534 break 535 } 536 if c == '.' { 537 if !allowDot { 538 l.input.unreadRune(c) 539 break 540 } 541 allowDot = false 542 } else if c == 'e' || c == 'E' { 543 if !allowExp { 544 l.input.unreadRune(c) 545 break 546 } 547 allowExp = false 548 cn, _, err := l.input.readRune() 549 if err != nil { 550 l.input.unreadRune(c) 551 break 552 } 553 if cn == '-' || cn == '+' { 554 cnn, _, err := l.input.readRune() 555 if err != nil { 556 l.input.unreadRune(cn) 557 l.input.unreadRune(c) 558 break 559 } 560 if cnn < '0' || cnn > '9' { 561 l.input.unreadRune(cnn) 562 l.input.unreadRune(cn) 563 l.input.unreadRune(c) 564 break 565 } 566 l.adjustPos(c) 567 token = append(token, c) 568 c, cn = cn, cnn 569 } else if cn < '0' || cn > '9' { 570 l.input.unreadRune(cn) 571 l.input.unreadRune(c) 572 break 573 } 574 l.adjustPos(c) 575 token = append(token, c) 576 c = cn 577 } else if c < '0' || c > '9' { 578 l.input.unreadRune(c) 579 break 580 } 581 l.adjustPos(c) 582 token = append(token, c) 583 } 584 return token 585 } 586 587 func (l *protoLex) readHexNumber(sofar []rune) []rune { 588 token := sofar 589 for { 590 c, _, err := l.input.readRune() 591 if err != nil { 592 break 593 } 594 if (c < 'a' || c > 'f') && (c < 'A' || c > 'F') && (c < '0' || c > '9') { 595 l.input.unreadRune(c) 596 break 597 } 598 l.adjustPos(c) 599 token = append(token, c) 600 } 601 return token 602 } 603 604 func (l *protoLex) readIdentifier(sofar []rune) []rune { 605 token := sofar 606 for { 607 c, _, err := l.input.readRune() 608 if err != nil { 609 break 610 } 611 if c != '_' && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && (c < '0' || c > '9') { 612 l.input.unreadRune(c) 613 break 614 } 615 l.adjustPos(c) 616 token = append(token, c) 617 } 618 return token 619 } 620 621 func (l *protoLex) readStringLiteral(quote rune) (string, error) { 622 var buf bytes.Buffer 623 for { 624 c, _, err := l.input.readRune() 625 if err != nil { 626 if err == io.EOF { 627 err = io.ErrUnexpectedEOF 628 } 629 return "", err 630 } 631 if c == '\n' { 632 return "", errors.New("encountered end-of-line before end of string literal") 633 } 634 l.adjustPos(c) 635 if c == quote { 636 break 637 } 638 if c == 0 { 639 return "", errors.New("null character ('\\0') not allowed in string literal") 640 } 641 if c == '\\' { 642 // escape sequence 643 c, _, err = l.input.readRune() 644 if err != nil { 645 return "", err 646 } 647 l.adjustPos(c) 648 if c == 'x' || c == 'X' { 649 // hex escape 650 c, _, err := l.input.readRune() 651 if err != nil { 652 return "", err 653 } 654 l.adjustPos(c) 655 c2, _, err := l.input.readRune() 656 if err != nil { 657 return "", err 658 } 659 var hex string 660 if (c2 < '0' || c2 > '9') && (c2 < 'a' || c2 > 'f') && (c2 < 'A' || c2 > 'F') { 661 l.input.unreadRune(c2) 662 hex = string(c) 663 } else { 664 l.adjustPos(c2) 665 hex = string([]rune{c, c2}) 666 } 667 i, err := strconv.ParseInt(hex, 16, 32) 668 if err != nil { 669 return "", fmt.Errorf("invalid hex escape: \\x%q", hex) 670 } 671 buf.WriteByte(byte(i)) 672 673 } else if c >= '0' && c <= '7' { 674 // octal escape 675 c2, _, err := l.input.readRune() 676 if err != nil { 677 return "", err 678 } 679 var octal string 680 if c2 < '0' || c2 > '7' { 681 l.input.unreadRune(c2) 682 octal = string(c) 683 } else { 684 l.adjustPos(c2) 685 c3, _, err := l.input.readRune() 686 if err != nil { 687 return "", err 688 } 689 if c3 < '0' || c3 > '7' { 690 l.input.unreadRune(c3) 691 octal = string([]rune{c, c2}) 692 } else { 693 l.adjustPos(c3) 694 octal = string([]rune{c, c2, c3}) 695 } 696 } 697 i, err := strconv.ParseInt(octal, 8, 32) 698 if err != nil { 699 return "", fmt.Errorf("invalid octal escape: \\%q", octal) 700 } 701 if i > 0xff { 702 return "", fmt.Errorf("octal escape is out range, must be between 0 and 377: \\%q", octal) 703 } 704 buf.WriteByte(byte(i)) 705 706 } else if c == 'u' { 707 // short unicode escape 708 u := make([]rune, 4) 709 for i := range u { 710 c, _, err := l.input.readRune() 711 if err != nil { 712 return "", err 713 } 714 l.adjustPos(c) 715 u[i] = c 716 } 717 i, err := strconv.ParseInt(string(u), 16, 32) 718 if err != nil { 719 return "", fmt.Errorf("invalid unicode escape: \\u%q", string(u)) 720 } 721 buf.WriteRune(rune(i)) 722 723 } else if c == 'U' { 724 // long unicode escape 725 u := make([]rune, 8) 726 for i := range u { 727 c, _, err := l.input.readRune() 728 if err != nil { 729 return "", err 730 } 731 l.adjustPos(c) 732 u[i] = c 733 } 734 i, err := strconv.ParseInt(string(u), 16, 32) 735 if err != nil { 736 return "", fmt.Errorf("invalid unicode escape: \\U%q", string(u)) 737 } 738 if i > 0x10ffff || i < 0 { 739 return "", fmt.Errorf("unicode escape is out of range, must be between 0 and 0x10ffff: \\U%q", string(u)) 740 } 741 buf.WriteRune(rune(i)) 742 743 } else if c == 'a' { 744 buf.WriteByte('\a') 745 } else if c == 'b' { 746 buf.WriteByte('\b') 747 } else if c == 'f' { 748 buf.WriteByte('\f') 749 } else if c == 'n' { 750 buf.WriteByte('\n') 751 } else if c == 'r' { 752 buf.WriteByte('\r') 753 } else if c == 't' { 754 buf.WriteByte('\t') 755 } else if c == 'v' { 756 buf.WriteByte('\v') 757 } else if c == '\\' { 758 buf.WriteByte('\\') 759 } else if c == '\'' { 760 buf.WriteByte('\'') 761 } else if c == '"' { 762 buf.WriteByte('"') 763 } else if c == '?' { 764 buf.WriteByte('?') 765 } else { 766 return "", fmt.Errorf("invalid escape sequence: %q", "\\"+string(c)) 767 } 768 } else { 769 buf.WriteRune(c) 770 } 771 } 772 return buf.String(), nil 773 } 774 775 func (l *protoLex) skipToEndOfLineComment() bool { 776 for { 777 c, _, err := l.input.readRune() 778 if err != nil { 779 return false 780 } 781 if c == '\n' { 782 return true 783 } 784 l.adjustPos(c) 785 } 786 } 787 788 func (l *protoLex) skipToEndOfBlockComment() bool { 789 for { 790 c, _, err := l.input.readRune() 791 if err != nil { 792 return false 793 } 794 l.adjustPos(c) 795 if c == '*' { 796 c, _, err := l.input.readRune() 797 if err != nil { 798 return false 799 } 800 if c == '/' { 801 l.adjustPos(c) 802 return true 803 } 804 l.input.unreadRune(c) 805 } 806 } 807 } 808 809 func (l *protoLex) addSourceError(err error) ErrorWithPos { 810 ewp, ok := err.(ErrorWithPos) 811 if !ok { 812 ewp = ErrorWithSourcePos{Pos: l.prev(), Underlying: err} 813 } 814 _ = l.errs.handleError(ewp) 815 return ewp 816 } 817 818 func (l *protoLex) Error(s string) { 819 _ = l.addSourceError(errors.New(s)) 820 }