github.com/Khushbukela/protoreflect@v1.0.1/desc/protoparse/lexer.go (about) 1 package protoparse 2 3 import ( 4 "bufio" 5 "bytes" 6 "errors" 7 "fmt" 8 "io" 9 "math" 10 "strconv" 11 "strings" 12 "unicode/utf8" 13 14 "github.com/jhump/protoreflect/desc/protoparse/ast" 15 ) 16 17 type runeReader struct { 18 rr *bufio.Reader 19 marked []rune 20 unread []rune 21 err error 22 } 23 24 func (rr *runeReader) readRune() (r rune, size int, err error) { 25 if rr.err != nil { 26 return 0, 0, rr.err 27 } 28 if len(rr.unread) > 0 { 29 r := rr.unread[len(rr.unread)-1] 30 rr.unread = rr.unread[:len(rr.unread)-1] 31 if rr.marked != nil { 32 rr.marked = append(rr.marked, r) 33 } 34 return r, utf8.RuneLen(r), nil 35 } 36 r, sz, err := rr.rr.ReadRune() 37 if err != nil { 38 rr.err = err 39 } else if rr.marked != nil { 40 rr.marked = append(rr.marked, r) 41 } 42 return r, sz, err 43 } 44 45 func (rr *runeReader) unreadRune(r rune) { 46 if rr.marked != nil { 47 if rr.marked[len(rr.marked)-1] != r { 48 panic("unread rune is not the same as last marked rune!") 49 } 50 rr.marked = rr.marked[:len(rr.marked)-1] 51 } 52 rr.unread = append(rr.unread, r) 53 } 54 55 func (rr *runeReader) startMark(initial rune) { 56 rr.marked = []rune{initial} 57 } 58 59 func (rr *runeReader) endMark() string { 60 m := string(rr.marked) 61 rr.marked = rr.marked[:0] 62 return m 63 } 64 65 type protoLex struct { 66 filename string 67 input *runeReader 68 errs *errorHandler 69 res *ast.FileNode 70 71 lineNo int 72 colNo int 73 offset int 74 75 prevSym ast.TerminalNode 76 eof ast.TerminalNode 77 78 prevLineNo int 79 prevColNo int 80 prevOffset int 81 comments []ast.Comment 82 ws []rune 83 } 84 85 var utf8Bom = []byte{0xEF, 0xBB, 0xBF} 86 87 func newLexer(in io.Reader, filename string, errs *errorHandler) *protoLex { 88 br := bufio.NewReader(in) 89 90 // if file has UTF8 byte order marker preface, consume it 91 marker, err := br.Peek(3) 92 if err == nil && bytes.Equal(marker, utf8Bom) { 93 _, _ = br.Discard(3) 94 } 95 96 return &protoLex{ 97 input: &runeReader{rr: br}, 98 filename: filename, 99 errs: errs, 100 } 101 } 102 103 var keywords = map[string]int{ 104 "syntax": _SYNTAX, 105 "import": _IMPORT, 106 "weak": _WEAK, 107 "public": _PUBLIC, 108 "package": _PACKAGE, 109 "option": _OPTION, 110 "true": _TRUE, 111 "false": _FALSE, 112 "inf": _INF, 113 "nan": _NAN, 114 "repeated": _REPEATED, 115 "optional": _OPTIONAL, 116 "required": _REQUIRED, 117 "double": _DOUBLE, 118 "float": _FLOAT, 119 "int32": _INT32, 120 "int64": _INT64, 121 "uint32": _UINT32, 122 "uint64": _UINT64, 123 "sint32": _SINT32, 124 "sint64": _SINT64, 125 "fixed32": _FIXED32, 126 "fixed64": _FIXED64, 127 "sfixed32": _SFIXED32, 128 "sfixed64": _SFIXED64, 129 "bool": _BOOL, 130 "string": _STRING, 131 "bytes": _BYTES, 132 "group": _GROUP, 133 "oneof": _ONEOF, 134 "map": _MAP, 135 "extensions": _EXTENSIONS, 136 "to": _TO, 137 "max": _MAX, 138 "reserved": _RESERVED, 139 "enum": _ENUM, 140 "message": _MESSAGE, 141 "extend": _EXTEND, 142 "service": _SERVICE, 143 "rpc": _RPC, 144 "stream": _STREAM, 145 "returns": _RETURNS, 146 } 147 148 func (l *protoLex) cur() SourcePos { 149 return SourcePos{ 150 Filename: l.filename, 151 Offset: l.offset, 152 Line: l.lineNo + 1, 153 Col: l.colNo + 1, 154 } 155 } 156 157 func (l *protoLex) adjustPos(consumedChars ...rune) { 158 for _, c := range consumedChars { 159 switch c { 160 case '\n': 161 // new line, back to first column 162 l.colNo = 0 163 l.lineNo++ 164 case '\r': 165 // no adjustment 166 case '\t': 167 // advance to next tab stop 168 mod := l.colNo % 8 169 l.colNo += 8 - mod 170 default: 171 l.colNo++ 172 } 173 } 174 } 175 176 func (l *protoLex) prev() *SourcePos { 177 if l.prevSym == nil { 178 return &SourcePos{ 179 Filename: l.filename, 180 Offset: 0, 181 Line: 1, 182 Col: 1, 183 } 184 } 185 return l.prevSym.Start() 186 } 187 188 func (l *protoLex) Lex(lval *protoSymType) int { 189 if l.errs.err != nil { 190 // if error reporter already returned non-nil error, 191 // we can skip the rest of the input 192 return 0 193 } 194 195 l.prevLineNo = l.lineNo 196 l.prevColNo = l.colNo 197 l.prevOffset = l.offset 198 l.comments = nil 199 l.ws = nil 200 l.input.endMark() // reset, just in case 201 202 for { 203 c, n, err := l.input.readRune() 204 if err == io.EOF { 205 // we're not actually returning a rune, but this will associate 206 // accumulated comments as a trailing comment on last symbol 207 // (if appropriate) 208 l.setRune(lval, 0) 209 l.eof = lval.b 210 return 0 211 } else if err != nil { 212 // we don't call setError because we don't want it wrapped 213 // with a source position because it's I/O, not syntax 214 lval.err = err 215 _ = l.errs.handleError(err) 216 return _ERROR 217 } 218 219 l.prevLineNo = l.lineNo 220 l.prevColNo = l.colNo 221 l.prevOffset = l.offset 222 223 l.offset += n 224 l.adjustPos(c) 225 if strings.ContainsRune("\n\r\t\f\v ", c) { 226 l.ws = append(l.ws, c) 227 continue 228 } 229 230 l.input.startMark(c) 231 if c == '.' { 232 // decimal literals could start with a dot 233 cn, _, err := l.input.readRune() 234 if err != nil { 235 l.setRune(lval, c) 236 return int(c) 237 } 238 if cn >= '0' && cn <= '9' { 239 l.adjustPos(cn) 240 token := l.readNumber(c, cn) 241 f, err := parseFloat(token) 242 if err != nil { 243 l.setError(lval, numError(err, "float", token)) 244 return _ERROR 245 } 246 l.setFloat(lval, f) 247 return _FLOAT_LIT 248 } 249 l.input.unreadRune(cn) 250 l.setRune(lval, c) 251 return int(c) 252 } 253 254 if c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') { 255 // identifier 256 token := []rune{c} 257 token = l.readIdentifier(token) 258 str := string(token) 259 if t, ok := keywords[str]; ok { 260 l.setIdent(lval, str) 261 return t 262 } 263 l.setIdent(lval, str) 264 return _NAME 265 } 266 267 if c >= '0' && c <= '9' { 268 // integer or float literal 269 token := l.readNumber(c) 270 if strings.HasPrefix(token, "0x") || strings.HasPrefix(token, "0X") { 271 // hexadecimal 272 ui, err := strconv.ParseUint(token[2:], 16, 64) 273 if err != nil { 274 l.setError(lval, numError(err, "hexadecimal integer", token[2:])) 275 return _ERROR 276 } 277 l.setInt(lval, ui) 278 return _INT_LIT 279 } 280 if strings.Contains(token, ".") || strings.Contains(token, "e") || strings.Contains(token, "E") { 281 // floating point! 282 f, err := parseFloat(token) 283 if err != nil { 284 l.setError(lval, numError(err, "float", token)) 285 return _ERROR 286 } 287 l.setFloat(lval, f) 288 return _FLOAT_LIT 289 } 290 // integer! (decimal or octal) 291 base := 10 292 if token[0] == '0' { 293 base = 8 294 } 295 ui, err := strconv.ParseUint(token, base, 64) 296 if err != nil { 297 kind := "integer" 298 if base == 8 { 299 kind = "octal integer" 300 } 301 if numErr, ok := err.(*strconv.NumError); ok && numErr.Err == strconv.ErrRange { 302 // if it's too big to be an int, parse it as a float 303 var f float64 304 kind = "float" 305 f, err = parseFloat(token) 306 if err == nil { 307 l.setFloat(lval, f) 308 return _FLOAT_LIT 309 } 310 } 311 l.setError(lval, numError(err, kind, token)) 312 return _ERROR 313 } 314 l.setInt(lval, ui) 315 return _INT_LIT 316 } 317 318 if c == '\'' || c == '"' { 319 // string literal 320 str, err := l.readStringLiteral(c) 321 if err != nil { 322 l.setError(lval, err) 323 return _ERROR 324 } 325 l.setString(lval, str) 326 return _STRING_LIT 327 } 328 329 if c == '/' { 330 // comment 331 cn, _, err := l.input.readRune() 332 if err != nil { 333 l.setRune(lval, '/') 334 return int(c) 335 } 336 if cn == '/' { 337 l.adjustPos(cn) 338 hitNewline, hasErr := l.skipToEndOfLineComment(lval) 339 if hasErr { 340 return _ERROR 341 } 342 comment := l.newComment() 343 comment.PosRange.End.Col++ 344 if hitNewline { 345 // we don't do this inside of skipToEndOfLineComment 346 // because we want to know the length of previous 347 // line for calculation above 348 l.adjustPos('\n') 349 } 350 l.comments = append(l.comments, comment) 351 continue 352 } 353 if cn == '*' { 354 l.adjustPos(cn) 355 ok, hasErr := l.skipToEndOfBlockComment(lval) 356 if hasErr { 357 return _ERROR 358 } 359 if !ok { 360 l.setError(lval, errors.New("block comment never terminates, unexpected EOF")) 361 return _ERROR 362 } 363 l.comments = append(l.comments, l.newComment()) 364 continue 365 } 366 l.input.unreadRune(cn) 367 } 368 369 if c < 32 || c == 127 { 370 l.setError(lval, errors.New("invalid control character")) 371 return _ERROR 372 } 373 if !strings.ContainsRune(";,.:=-+(){}[]<>", c) { 374 l.setError(lval, errors.New("invalid character")) 375 return _ERROR 376 } 377 l.setRune(lval, c) 378 return int(c) 379 } 380 } 381 382 func parseFloat(token string) (float64, error) { 383 // strconv.ParseFloat allows _ to separate digits, but protobuf does not 384 if strings.ContainsRune(token, '_') { 385 return 0, &strconv.NumError{ 386 Func: "parseFloat", 387 Num: token, 388 Err: strconv.ErrSyntax, 389 } 390 } 391 f, err := strconv.ParseFloat(token, 64) 392 if err == nil { 393 return f, nil 394 } 395 if numErr, ok := err.(*strconv.NumError); ok && numErr.Err == strconv.ErrRange && math.IsInf(f, 1) { 396 // protoc doesn't complain about float overflow and instead just uses "infinity" 397 // so we mirror that behavior by just returning infinity and ignoring the error 398 return f, nil 399 } 400 return f, err 401 } 402 403 func (l *protoLex) posRange() ast.PosRange { 404 return ast.PosRange{ 405 Start: SourcePos{ 406 Filename: l.filename, 407 Offset: l.prevOffset, 408 Line: l.prevLineNo + 1, 409 Col: l.prevColNo + 1, 410 }, 411 End: l.cur(), 412 } 413 } 414 415 func (l *protoLex) newComment() ast.Comment { 416 ws := string(l.ws) 417 l.ws = l.ws[:0] 418 return ast.Comment{ 419 PosRange: l.posRange(), 420 LeadingWhitespace: ws, 421 Text: l.input.endMark(), 422 } 423 } 424 425 func (l *protoLex) newTokenInfo() ast.TokenInfo { 426 ws := string(l.ws) 427 l.ws = nil 428 return ast.TokenInfo{ 429 PosRange: l.posRange(), 430 LeadingComments: l.comments, 431 LeadingWhitespace: ws, 432 RawText: l.input.endMark(), 433 } 434 } 435 436 func (l *protoLex) setPrev(n ast.TerminalNode, isDot bool) { 437 nStart := n.Start().Line 438 if _, ok := n.(*ast.RuneNode); ok { 439 // This is really gross, but there are many cases where we don't want 440 // to attribute comments to punctuation (like commas, equals, semicolons) 441 // and would instead prefer to attribute comments to a more meaningful 442 // element in the AST. 443 // 444 // So if it's a simple node OTHER THAN PERIOD (since that is not just 445 // punctuation but typically part of a qualified identifier), don't 446 // attribute comments to it. We do that with this TOTAL HACK: adjusting 447 // the start line makes leading comments appear detached so logic below 448 // will naturally associated trailing comment to previous symbol 449 if !isDot { 450 nStart += 2 451 } 452 } 453 if l.prevSym != nil && len(n.LeadingComments()) > 0 && l.prevSym.End().Line < nStart { 454 // we may need to re-attribute the first comment to 455 // instead be previous node's trailing comment 456 prevEnd := l.prevSym.End().Line 457 comments := n.LeadingComments() 458 c := comments[0] 459 commentStart := c.Start.Line 460 if commentStart == prevEnd { 461 // comment is on same line as previous symbol 462 n.PopLeadingComment() 463 l.prevSym.PushTrailingComment(c) 464 } else if commentStart == prevEnd+1 { 465 // comment is right after previous symbol; see if it is detached 466 // and if so re-attribute 467 singleLineStyle := strings.HasPrefix(c.Text, "//") 468 line := c.End.Line 469 groupEnd := -1 470 for i := 1; i < len(comments); i++ { 471 c := comments[i] 472 newGroup := false 473 if !singleLineStyle || c.Start.Line > line+1 { 474 // we've found a gap between comments, which means the 475 // previous comments were detached 476 newGroup = true 477 } else { 478 line = c.End.Line 479 singleLineStyle = strings.HasPrefix(comments[i].Text, "//") 480 if !singleLineStyle { 481 // we've found a switch from // comments to /* 482 // consider that a new group which means the 483 // previous comments were detached 484 newGroup = true 485 } 486 } 487 if newGroup { 488 groupEnd = i 489 break 490 } 491 } 492 493 if groupEnd == -1 { 494 // just one group of comments; we'll mark it as a trailing 495 // comment if it immediately follows previous symbol and is 496 // detached from current symbol 497 c1 := comments[0] 498 c2 := comments[len(comments)-1] 499 if c1.Start.Line <= prevEnd+1 && c2.End.Line < nStart-1 { 500 groupEnd = len(comments) 501 } 502 } 503 504 for i := 0; i < groupEnd; i++ { 505 l.prevSym.PushTrailingComment(n.PopLeadingComment()) 506 } 507 } 508 } 509 510 l.prevSym = n 511 } 512 513 func (l *protoLex) setString(lval *protoSymType, val string) { 514 lval.s = ast.NewStringLiteralNode(val, l.newTokenInfo()) 515 l.setPrev(lval.s, false) 516 } 517 518 func (l *protoLex) setIdent(lval *protoSymType, val string) { 519 lval.id = ast.NewIdentNode(val, l.newTokenInfo()) 520 l.setPrev(lval.id, false) 521 } 522 523 func (l *protoLex) setInt(lval *protoSymType, val uint64) { 524 lval.i = ast.NewUintLiteralNode(val, l.newTokenInfo()) 525 l.setPrev(lval.i, false) 526 } 527 528 func (l *protoLex) setFloat(lval *protoSymType, val float64) { 529 lval.f = ast.NewFloatLiteralNode(val, l.newTokenInfo()) 530 l.setPrev(lval.f, false) 531 } 532 533 func (l *protoLex) setRune(lval *protoSymType, val rune) { 534 lval.b = ast.NewRuneNode(val, l.newTokenInfo()) 535 l.setPrev(lval.b, val == '.') 536 } 537 538 func (l *protoLex) setError(lval *protoSymType, err error) { 539 lval.err = l.addSourceError(err) 540 } 541 542 func (l *protoLex) readNumber(sofar ...rune) string { 543 token := sofar 544 allowExpSign := false 545 for { 546 c, _, err := l.input.readRune() 547 if err != nil { 548 break 549 } 550 if (c == '-' || c == '+') && !allowExpSign { 551 l.input.unreadRune(c) 552 break 553 } 554 allowExpSign = false 555 if c != '.' && c != '_' && (c < '0' || c > '9') && 556 (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && 557 c != '-' && c != '+' { 558 // no more chars in the number token 559 l.input.unreadRune(c) 560 break 561 } 562 if c == 'e' || c == 'E' { 563 // scientific notation char can be followed by 564 // an exponent sign 565 allowExpSign = true 566 } 567 l.adjustPos(c) 568 token = append(token, c) 569 } 570 return string(token) 571 } 572 573 func numError(err error, kind, s string) error { 574 ne, ok := err.(*strconv.NumError) 575 if !ok { 576 return err 577 } 578 if ne.Err == strconv.ErrRange { 579 return fmt.Errorf("value out of range for %s: %s", kind, s) 580 } 581 // syntax error 582 return fmt.Errorf("invalid syntax in %s value: %s", kind, s) 583 } 584 585 func (l *protoLex) readIdentifier(sofar []rune) []rune { 586 token := sofar 587 for { 588 c, _, err := l.input.readRune() 589 if err != nil { 590 break 591 } 592 if c != '_' && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && (c < '0' || c > '9') { 593 l.input.unreadRune(c) 594 break 595 } 596 l.adjustPos(c) 597 token = append(token, c) 598 } 599 return token 600 } 601 602 func (l *protoLex) readStringLiteral(quote rune) (string, error) { 603 var buf bytes.Buffer 604 for { 605 c, _, err := l.input.readRune() 606 if err != nil { 607 if err == io.EOF { 608 err = io.ErrUnexpectedEOF 609 } 610 return "", err 611 } 612 if c == '\n' { 613 return "", errors.New("encountered end-of-line before end of string literal") 614 } 615 l.adjustPos(c) 616 if c == quote { 617 break 618 } 619 if c == 0 { 620 return "", errors.New("null character ('\\0') not allowed in string literal") 621 } 622 if c == '\\' { 623 // escape sequence 624 c, _, err = l.input.readRune() 625 if err != nil { 626 return "", err 627 } 628 l.adjustPos(c) 629 if c == 'x' || c == 'X' { 630 // hex escape 631 c, _, err := l.input.readRune() 632 if err != nil { 633 return "", err 634 } 635 l.adjustPos(c) 636 c2, _, err := l.input.readRune() 637 if err != nil { 638 return "", err 639 } 640 var hex string 641 if (c2 < '0' || c2 > '9') && (c2 < 'a' || c2 > 'f') && (c2 < 'A' || c2 > 'F') { 642 l.input.unreadRune(c2) 643 hex = string(c) 644 } else { 645 l.adjustPos(c2) 646 hex = string([]rune{c, c2}) 647 } 648 i, err := strconv.ParseInt(hex, 16, 32) 649 if err != nil { 650 return "", fmt.Errorf("invalid hex escape: \\x%q", hex) 651 } 652 buf.WriteByte(byte(i)) 653 654 } else if c >= '0' && c <= '7' { 655 // octal escape 656 c2, _, err := l.input.readRune() 657 if err != nil { 658 return "", err 659 } 660 var octal string 661 if c2 < '0' || c2 > '7' { 662 l.input.unreadRune(c2) 663 octal = string(c) 664 } else { 665 l.adjustPos(c2) 666 c3, _, err := l.input.readRune() 667 if err != nil { 668 return "", err 669 } 670 if c3 < '0' || c3 > '7' { 671 l.input.unreadRune(c3) 672 octal = string([]rune{c, c2}) 673 } else { 674 l.adjustPos(c3) 675 octal = string([]rune{c, c2, c3}) 676 } 677 } 678 i, err := strconv.ParseInt(octal, 8, 32) 679 if err != nil { 680 return "", fmt.Errorf("invalid octal escape: \\%q", octal) 681 } 682 if i > 0xff { 683 return "", fmt.Errorf("octal escape is out range, must be between 0 and 377: \\%q", octal) 684 } 685 buf.WriteByte(byte(i)) 686 687 } else if c == 'u' { 688 // short unicode escape 689 u := make([]rune, 4) 690 for i := range u { 691 c, _, err := l.input.readRune() 692 if err != nil { 693 return "", err 694 } 695 l.adjustPos(c) 696 u[i] = c 697 } 698 i, err := strconv.ParseInt(string(u), 16, 32) 699 if err != nil { 700 return "", fmt.Errorf("invalid unicode escape: \\u%q", string(u)) 701 } 702 buf.WriteRune(rune(i)) 703 704 } else if c == 'U' { 705 // long unicode escape 706 u := make([]rune, 8) 707 for i := range u { 708 c, _, err := l.input.readRune() 709 if err != nil { 710 return "", err 711 } 712 l.adjustPos(c) 713 u[i] = c 714 } 715 i, err := strconv.ParseInt(string(u), 16, 32) 716 if err != nil { 717 return "", fmt.Errorf("invalid unicode escape: \\U%q", string(u)) 718 } 719 if i > 0x10ffff || i < 0 { 720 return "", fmt.Errorf("unicode escape is out of range, must be between 0 and 0x10ffff: \\U%q", string(u)) 721 } 722 buf.WriteRune(rune(i)) 723 724 } else if c == 'a' { 725 buf.WriteByte('\a') 726 } else if c == 'b' { 727 buf.WriteByte('\b') 728 } else if c == 'f' { 729 buf.WriteByte('\f') 730 } else if c == 'n' { 731 buf.WriteByte('\n') 732 } else if c == 'r' { 733 buf.WriteByte('\r') 734 } else if c == 't' { 735 buf.WriteByte('\t') 736 } else if c == 'v' { 737 buf.WriteByte('\v') 738 } else if c == '\\' { 739 buf.WriteByte('\\') 740 } else if c == '\'' { 741 buf.WriteByte('\'') 742 } else if c == '"' { 743 buf.WriteByte('"') 744 } else if c == '?' { 745 buf.WriteByte('?') 746 } else { 747 return "", fmt.Errorf("invalid escape sequence: %q", "\\"+string(c)) 748 } 749 } else { 750 buf.WriteRune(c) 751 } 752 } 753 return buf.String(), nil 754 } 755 756 func (l *protoLex) skipToEndOfLineComment(lval *protoSymType) (ok, hasErr bool) { 757 for { 758 c, _, err := l.input.readRune() 759 if err != nil { 760 return false, false 761 } 762 switch c { 763 case '\n': 764 return true, false 765 case 0: 766 l.setError(lval, errors.New("invalid control character")) 767 return false, true 768 } 769 l.adjustPos(c) 770 } 771 } 772 773 func (l *protoLex) skipToEndOfBlockComment(lval *protoSymType) (ok, hasErr bool) { 774 for { 775 c, _, err := l.input.readRune() 776 if err != nil { 777 return false, false 778 } 779 if c == 0 { 780 l.setError(lval, errors.New("invalid control character")) 781 return false, true 782 } 783 l.adjustPos(c) 784 if c == '*' { 785 c, _, err := l.input.readRune() 786 if err != nil { 787 return false, false 788 } 789 if c == '/' { 790 l.adjustPos(c) 791 return true, false 792 } 793 l.input.unreadRune(c) 794 } 795 } 796 } 797 798 func (l *protoLex) addSourceError(err error) ErrorWithPos { 799 ewp, ok := err.(ErrorWithPos) 800 if !ok { 801 ewp = ErrorWithSourcePos{Pos: l.prev(), Underlying: err} 802 } 803 _ = l.errs.handleError(ewp) 804 return ewp 805 } 806 807 func (l *protoLex) Error(s string) { 808 _ = l.addSourceError(errors.New(s)) 809 }