github.com/jhump/protocompile@v0.0.0-20221021153901-4f6f732835e8/parser/lexer.go (about) 1 package parser 2 3 import ( 4 "bufio" 5 "bytes" 6 "errors" 7 "fmt" 8 "io" 9 "io/ioutil" 10 "math" 11 "strconv" 12 "strings" 13 "unicode/utf8" 14 15 "github.com/jhump/protocompile/ast" 16 "github.com/jhump/protocompile/reporter" 17 ) 18 19 type runeReader struct { 20 data []byte 21 pos int 22 err error 23 mark int 24 } 25 26 func (rr *runeReader) readRune() (r rune, size int, err error) { 27 if rr.err != nil { 28 return 0, 0, rr.err 29 } 30 if rr.pos == len(rr.data) { 31 rr.err = io.EOF 32 return 0, 0, rr.err 33 } 34 r, sz := utf8.DecodeRune(rr.data[rr.pos:]) 35 if r == utf8.RuneError { 36 rr.err = fmt.Errorf("invalid UTF8 at offset %d: %x", rr.pos, rr.data[rr.pos]) 37 return 0, 0, rr.err 38 } 39 rr.pos = rr.pos + sz 40 return r, sz, nil 41 } 42 43 func (rr *runeReader) offset() int { 44 return rr.pos 45 } 46 47 func (rr *runeReader) unreadRune(sz int) { 48 newPos := rr.pos - sz 49 if newPos < rr.mark { 50 panic("unread past mark") 51 } 52 rr.pos = newPos 53 } 54 55 func (rr *runeReader) setMark() { 56 rr.mark = rr.pos 57 } 58 59 func (rr *runeReader) getMark() string { 60 return string(rr.data[rr.mark:rr.pos]) 61 } 62 63 type protoLex struct { 64 input *runeReader 65 info *ast.FileInfo 66 handler *reporter.Handler 67 res *ast.FileNode 68 69 prevSym ast.TerminalNode 70 prevOffset int 71 eof ast.Token 72 73 comments []ast.Token 74 } 75 76 var utf8Bom = []byte{0xEF, 0xBB, 0xBF} 77 78 func newLexer(in io.Reader, filename string, handler *reporter.Handler) (*protoLex, error) { 79 br := bufio.NewReader(in) 80 81 // if file has UTF8 byte order marker preface, consume it 82 marker, err := br.Peek(3) 83 if err == nil && bytes.Equal(marker, utf8Bom) { 84 _, _ = br.Discard(3) 85 } 86 87 contents, err := ioutil.ReadAll(br) 88 if err != nil { 89 return nil, err 90 } 91 return &protoLex{ 92 input: &runeReader{data: contents}, 93 info: ast.NewFileInfo(filename, contents), 94 handler: handler, 95 }, nil 96 } 97 98 var keywords = map[string]int{ 99 "syntax": _SYNTAX, 100 "import": _IMPORT, 101 "weak": _WEAK, 102 "public": _PUBLIC, 103 "package": _PACKAGE, 104 "option": _OPTION, 105 "true": _TRUE, 106 "false": _FALSE, 107 "inf": _INF, 108 "nan": _NAN, 109 "repeated": _REPEATED, 110 "optional": _OPTIONAL, 111 "required": _REQUIRED, 112 "double": _DOUBLE, 113 "float": _FLOAT, 114 "int32": _INT32, 115 "int64": _INT64, 116 "uint32": _UINT32, 117 "uint64": _UINT64, 118 "sint32": _SINT32, 119 "sint64": _SINT64, 120 "fixed32": _FIXED32, 121 "fixed64": _FIXED64, 122 "sfixed32": _SFIXED32, 123 "sfixed64": _SFIXED64, 124 "bool": _BOOL, 125 "string": _STRING, 126 "bytes": _BYTES, 127 "group": _GROUP, 128 "oneof": _ONEOF, 129 "map": _MAP, 130 "extensions": _EXTENSIONS, 131 "to": _TO, 132 "max": _MAX, 133 "reserved": _RESERVED, 134 "enum": _ENUM, 135 "message": _MESSAGE, 136 "extend": _EXTEND, 137 "service": _SERVICE, 138 "rpc": _RPC, 139 "stream": _STREAM, 140 "returns": _RETURNS, 141 } 142 143 func (l *protoLex) maybeNewLine(r rune) { 144 if r == '\n' { 145 l.info.AddLine(l.input.offset()) 146 } 147 } 148 149 func (l *protoLex) prev() ast.SourcePos { 150 return l.info.SourcePos(l.prevOffset) 151 } 152 153 func (l *protoLex) Lex(lval *protoSymType) int { 154 if l.handler.ReporterError() != nil { 155 // if error reporter already returned non-nil error, 156 // we can skip the rest of the input 157 return 0 158 } 159 160 l.comments = nil 161 162 for { 163 l.input.setMark() 164 165 l.prevOffset = l.input.offset() 166 c, _, err := l.input.readRune() 167 if err == io.EOF { 168 // we're not actually returning a rune, but this will associate 169 // accumulated comments as a trailing comment on last symbol 170 // (if appropriate) 171 l.setRune(lval, 0) 172 l.eof = lval.b.Token() 173 return 0 174 } else if err != nil { 175 l.setError(lval, err) 176 return _ERROR 177 } 178 179 if strings.ContainsRune("\n\r\t\f\v ", c) { 180 // skip whitespace 181 l.maybeNewLine(c) 182 continue 183 } 184 185 if c == '.' { 186 // decimal literals could start with a dot 187 cn, szn, err := l.input.readRune() 188 if err != nil { 189 l.setRune(lval, c) 190 return int(c) 191 } 192 if cn >= '0' && cn <= '9' { 193 l.readNumber() 194 token := l.input.getMark() 195 f, err := parseFloat(token) 196 if err != nil { 197 l.setError(lval, numError(err, "float", token)) 198 return _ERROR 199 } 200 l.setFloat(lval, f) 201 return _FLOAT_LIT 202 } 203 l.input.unreadRune(szn) 204 l.setRune(lval, c) 205 return int(c) 206 } 207 208 if c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') { 209 // identifier 210 l.readIdentifier() 211 token := l.input.getMark() 212 str := string(token) 213 if t, ok := keywords[str]; ok { 214 l.setIdent(lval, str) 215 return t 216 } 217 l.setIdent(lval, str) 218 return _NAME 219 } 220 221 if c >= '0' && c <= '9' { 222 // integer or float literal 223 l.readNumber() 224 token := l.input.getMark() 225 if strings.HasPrefix(token, "0x") || strings.HasPrefix(token, "0X") { 226 // hexadecimal 227 ui, err := strconv.ParseUint(token[2:], 16, 64) 228 if err != nil { 229 l.setError(lval, numError(err, "hexadecimal integer", token[2:])) 230 return _ERROR 231 } 232 l.setInt(lval, ui) 233 return _INT_LIT 234 } 235 if strings.Contains(token, ".") || strings.Contains(token, "e") || strings.Contains(token, "E") { 236 // floating point! 237 f, err := parseFloat(token) 238 if err != nil { 239 l.setError(lval, numError(err, "float", token)) 240 return _ERROR 241 } 242 l.setFloat(lval, f) 243 return _FLOAT_LIT 244 } 245 // integer! (decimal or octal) 246 base := 10 247 if token[0] == '0' { 248 base = 8 249 } 250 ui, err := strconv.ParseUint(token, base, 64) 251 if err != nil { 252 kind := "integer" 253 if base == 8 { 254 kind = "octal integer" 255 } 256 if numErr, ok := err.(*strconv.NumError); ok && numErr.Err == strconv.ErrRange { 257 // if it's too big to be an int, parse it as a float 258 var f float64 259 kind = "float" 260 f, err = parseFloat(token) 261 if err == nil { 262 l.setFloat(lval, f) 263 return _FLOAT_LIT 264 } 265 } 266 l.setError(lval, numError(err, kind, token)) 267 return _ERROR 268 } 269 l.setInt(lval, ui) 270 return _INT_LIT 271 } 272 273 if c == '\'' || c == '"' { 274 // string literal 275 str, err := l.readStringLiteral(c) 276 if err != nil { 277 l.setError(lval, err) 278 return _ERROR 279 } 280 l.setString(lval, str) 281 return _STRING_LIT 282 } 283 284 if c == '/' { 285 // comment 286 cn, szn, err := l.input.readRune() 287 if err != nil { 288 l.setRune(lval, '/') 289 return int(c) 290 } 291 if cn == '/' { 292 hasErr := l.skipToEndOfLineComment(lval) 293 if hasErr { 294 return _ERROR 295 } 296 l.comments = append(l.comments, l.newToken()) 297 continue 298 } 299 if cn == '*' { 300 ok, hasErr := l.skipToEndOfBlockComment(lval) 301 if hasErr { 302 return _ERROR 303 } 304 if !ok { 305 l.setError(lval, errors.New("block comment never terminates, unexpected EOF")) 306 return _ERROR 307 } 308 l.comments = append(l.comments, l.newToken()) 309 continue 310 } 311 l.input.unreadRune(szn) 312 } 313 314 if c < 32 || c == 127 { 315 l.setError(lval, errors.New("invalid control character")) 316 return _ERROR 317 } 318 if !strings.ContainsRune(";,.:=-+(){}[]<>", c) { 319 l.setError(lval, errors.New("invalid character")) 320 return _ERROR 321 } 322 l.setRune(lval, c) 323 return int(c) 324 } 325 } 326 327 func parseFloat(token string) (float64, error) { 328 // strconv.ParseFloat allows _ to separate digits, but protobuf does not 329 if strings.ContainsRune(token, '_') { 330 return 0, &strconv.NumError{ 331 Func: "parseFloat", 332 Num: token, 333 Err: strconv.ErrSyntax, 334 } 335 } 336 f, err := strconv.ParseFloat(token, 64) 337 if err == nil { 338 return f, nil 339 } 340 if numErr, ok := err.(*strconv.NumError); ok && numErr.Err == strconv.ErrRange && math.IsInf(f, 1) { 341 // protoc doesn't complain about float overflow and instead just uses "infinity" 342 // so we mirror that behavior by just returning infinity and ignoring the error 343 return f, nil 344 } 345 return f, err 346 } 347 348 func (l *protoLex) newToken() ast.Token { 349 offset := l.input.mark 350 length := l.input.pos - l.input.mark 351 return l.info.AddToken(offset, length) 352 } 353 354 func (l *protoLex) setPrevAndAddComments(n ast.TerminalNode) { 355 comments := l.comments 356 l.comments = nil 357 var prevTrailingComments []ast.Token 358 359 if l.prevSym != nil && len(comments) > 0 { 360 prevEnd := l.info.NodeInfo(l.prevSym).End().Line 361 c := comments[0] 362 commentInfo := l.info.TokenInfo(c) 363 commentStart := commentInfo.Start().Line 364 if commentStart-prevEnd <= 1 { 365 // we may need to re-attribute the first comment to 366 // instead be previous node's trailing comment 367 groupEnd := 0 368 prevSingleLineStyle := strings.HasPrefix(commentInfo.RawText(), "//") 369 if commentStart == prevEnd || !prevSingleLineStyle { 370 groupEnd = 1 371 } else { 372 // merge adjacent single-line comments into one group 373 prevCommentLine := commentInfo.End().Line 374 for i := 1; i < len(comments); i++ { 375 c := comments[i] 376 commentInfo := l.info.TokenInfo(c) 377 detached := false 378 if !prevSingleLineStyle || commentInfo.Start().Line > prevCommentLine+1 { 379 // we've found a gap between comments, which means the 380 // previous comments were detached 381 detached = true 382 } else { 383 singleLineStyle := strings.HasPrefix(commentInfo.RawText(), "//") 384 if !singleLineStyle { 385 // we've found a switch from // comments to /* 386 // consider that a new group which means the 387 // previous comments were detached 388 detached = true 389 } 390 prevCommentLine = commentInfo.End().Line 391 prevSingleLineStyle = singleLineStyle 392 } 393 if detached { 394 groupEnd = i 395 break 396 } 397 } 398 if groupEnd == 0 { 399 // all comments belong to one group 400 groupEnd = len(comments) 401 } 402 } 403 404 var commentEnd int 405 if groupEnd == 1 { 406 commentEnd = commentInfo.End().Line 407 } else { 408 c2 := comments[groupEnd-1] 409 c2info := l.info.TokenInfo(c2) 410 commentEnd = c2info.End().Line 411 } 412 413 info := l.info.NodeInfo(n) 414 nStart := info.Start().Line 415 416 isPunctuation := false 417 if rn, ok := n.(*ast.RuneNode); ok { 418 isPunctuation = rn.Rune != '.' 419 } 420 421 if isPunctuation || 422 (len(comments) > groupEnd && nStart > prevEnd) || 423 (commentStart == prevEnd && nStart > commentEnd) || 424 (nStart-commentEnd > 1) { 425 426 // we can move the first group of comments to previous token 427 prevTrailingComments = comments[:groupEnd] 428 comments = comments[groupEnd:] 429 } 430 } 431 } 432 433 // now we can associate comments 434 for _, c := range prevTrailingComments { 435 l.info.AddComment(c, l.prevSym.Token()) 436 } 437 for _, c := range comments { 438 l.info.AddComment(c, n.Token()) 439 } 440 441 l.prevSym = n 442 } 443 444 func (l *protoLex) setString(lval *protoSymType, val string) { 445 lval.s = ast.NewStringLiteralNode(val, l.newToken()) 446 l.setPrevAndAddComments(lval.s) 447 } 448 449 func (l *protoLex) setIdent(lval *protoSymType, val string) { 450 lval.id = ast.NewIdentNode(val, l.newToken()) 451 l.setPrevAndAddComments(lval.id) 452 } 453 454 func (l *protoLex) setInt(lval *protoSymType, val uint64) { 455 lval.i = ast.NewUintLiteralNode(val, l.newToken()) 456 l.setPrevAndAddComments(lval.i) 457 } 458 459 func (l *protoLex) setFloat(lval *protoSymType, val float64) { 460 lval.f = ast.NewFloatLiteralNode(val, l.newToken()) 461 l.setPrevAndAddComments(lval.f) 462 } 463 464 func (l *protoLex) setRune(lval *protoSymType, val rune) { 465 lval.b = ast.NewRuneNode(val, l.newToken()) 466 l.setPrevAndAddComments(lval.b) 467 } 468 469 func (l *protoLex) setError(lval *protoSymType, err error) { 470 lval.err = l.addSourceError(err) 471 } 472 473 func (l *protoLex) readNumber() { 474 allowExpSign := false 475 for { 476 c, sz, err := l.input.readRune() 477 if err != nil { 478 break 479 } 480 if (c == '-' || c == '+') && !allowExpSign { 481 l.input.unreadRune(sz) 482 break 483 } 484 allowExpSign = false 485 if c != '.' && c != '_' && (c < '0' || c > '9') && 486 (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && 487 c != '-' && c != '+' { 488 // no more chars in the number token 489 l.input.unreadRune(sz) 490 break 491 } 492 if c == 'e' || c == 'E' { 493 // scientific notation char can be followed by 494 // an exponent sign 495 allowExpSign = true 496 } 497 } 498 } 499 500 func numError(err error, kind, s string) error { 501 ne, ok := err.(*strconv.NumError) 502 if !ok { 503 return err 504 } 505 if ne.Err == strconv.ErrRange { 506 return fmt.Errorf("value out of range for %s: %s", kind, s) 507 } 508 // syntax error 509 return fmt.Errorf("invalid syntax in %s value: %s", kind, s) 510 } 511 512 func (l *protoLex) readIdentifier() { 513 for { 514 c, sz, err := l.input.readRune() 515 if err != nil { 516 break 517 } 518 if c != '_' && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && (c < '0' || c > '9') { 519 l.input.unreadRune(sz) 520 break 521 } 522 } 523 } 524 525 func (l *protoLex) readStringLiteral(quote rune) (string, error) { 526 var buf bytes.Buffer 527 for { 528 c, _, err := l.input.readRune() 529 if err != nil { 530 if err == io.EOF { 531 err = io.ErrUnexpectedEOF 532 } 533 return "", err 534 } 535 if c == '\n' { 536 return "", errors.New("encountered end-of-line before end of string literal") 537 } 538 if c == quote { 539 break 540 } 541 if c == 0 { 542 return "", errors.New("null character ('\\0') not allowed in string literal") 543 } 544 if c == '\\' { 545 // escape sequence 546 c, _, err = l.input.readRune() 547 if err != nil { 548 return "", err 549 } 550 if c == 'x' || c == 'X' { 551 // hex escape 552 c, _, err := l.input.readRune() 553 if err != nil { 554 return "", err 555 } 556 c2, sz2, err := l.input.readRune() 557 if err != nil { 558 return "", err 559 } 560 var hex string 561 if (c2 < '0' || c2 > '9') && (c2 < 'a' || c2 > 'f') && (c2 < 'A' || c2 > 'F') { 562 l.input.unreadRune(sz2) 563 hex = string(c) 564 } else { 565 hex = string([]rune{c, c2}) 566 } 567 i, err := strconv.ParseInt(hex, 16, 32) 568 if err != nil { 569 return "", fmt.Errorf("invalid hex escape: \\x%q", hex) 570 } 571 buf.WriteByte(byte(i)) 572 573 } else if c >= '0' && c <= '7' { 574 // octal escape 575 c2, sz2, err := l.input.readRune() 576 if err != nil { 577 return "", err 578 } 579 var octal string 580 if c2 < '0' || c2 > '7' { 581 l.input.unreadRune(sz2) 582 octal = string(c) 583 } else { 584 c3, sz3, err := l.input.readRune() 585 if err != nil { 586 return "", err 587 } 588 if c3 < '0' || c3 > '7' { 589 l.input.unreadRune(sz3) 590 octal = string([]rune{c, c2}) 591 } else { 592 octal = string([]rune{c, c2, c3}) 593 } 594 } 595 i, err := strconv.ParseInt(octal, 8, 32) 596 if err != nil { 597 return "", fmt.Errorf("invalid octal escape: \\%q", octal) 598 } 599 if i > 0xff { 600 return "", fmt.Errorf("octal escape is out range, must be between 0 and 377: \\%q", octal) 601 } 602 buf.WriteByte(byte(i)) 603 604 } else if c == 'u' { 605 // short unicode escape 606 u := make([]rune, 4) 607 for i := range u { 608 c, _, err := l.input.readRune() 609 if err != nil { 610 return "", err 611 } 612 u[i] = c 613 } 614 i, err := strconv.ParseInt(string(u), 16, 32) 615 if err != nil { 616 return "", fmt.Errorf("invalid unicode escape: \\u%q", string(u)) 617 } 618 buf.WriteRune(rune(i)) 619 620 } else if c == 'U' { 621 // long unicode escape 622 u := make([]rune, 8) 623 for i := range u { 624 c, _, err := l.input.readRune() 625 if err != nil { 626 return "", err 627 } 628 u[i] = c 629 } 630 i, err := strconv.ParseInt(string(u), 16, 32) 631 if err != nil { 632 return "", fmt.Errorf("invalid unicode escape: \\U%q", string(u)) 633 } 634 if i > 0x10ffff || i < 0 { 635 return "", fmt.Errorf("unicode escape is out of range, must be between 0 and 0x10ffff: \\U%q", string(u)) 636 } 637 buf.WriteRune(rune(i)) 638 639 } else if c == 'a' { 640 buf.WriteByte('\a') 641 } else if c == 'b' { 642 buf.WriteByte('\b') 643 } else if c == 'f' { 644 buf.WriteByte('\f') 645 } else if c == 'n' { 646 buf.WriteByte('\n') 647 } else if c == 'r' { 648 buf.WriteByte('\r') 649 } else if c == 't' { 650 buf.WriteByte('\t') 651 } else if c == 'v' { 652 buf.WriteByte('\v') 653 } else if c == '\\' { 654 buf.WriteByte('\\') 655 } else if c == '\'' { 656 buf.WriteByte('\'') 657 } else if c == '"' { 658 buf.WriteByte('"') 659 } else if c == '?' { 660 buf.WriteByte('?') 661 } else { 662 return "", fmt.Errorf("invalid escape sequence: %q", "\\"+string(c)) 663 } 664 } else { 665 buf.WriteRune(c) 666 } 667 } 668 return buf.String(), nil 669 } 670 671 func (l *protoLex) skipToEndOfLineComment(lval *protoSymType) (hasErr bool) { 672 for { 673 c, _, err := l.input.readRune() 674 if err != nil { 675 return false 676 } 677 switch c { 678 case '\n': 679 l.info.AddLine(l.input.offset()) 680 return false 681 case 0: 682 l.setError(lval, errors.New("invalid control character")) 683 return true 684 } 685 } 686 } 687 688 func (l *protoLex) skipToEndOfBlockComment(lval *protoSymType) (ok, hasErr bool) { 689 for { 690 c, _, err := l.input.readRune() 691 if err != nil { 692 return false, false 693 } 694 if c == 0 { 695 l.setError(lval, errors.New("invalid control character")) 696 return false, true 697 } 698 l.maybeNewLine(c) 699 if c == '*' { 700 c, sz, err := l.input.readRune() 701 if err != nil { 702 return false, false 703 } 704 if c == '/' { 705 return true, false 706 } 707 l.input.unreadRune(sz) 708 } 709 } 710 } 711 712 func (l *protoLex) addSourceError(err error) reporter.ErrorWithPos { 713 ewp, ok := err.(reporter.ErrorWithPos) 714 if !ok { 715 ewp = reporter.Error(l.prev(), err) 716 } 717 _ = l.handler.HandleError(ewp) 718 return ewp 719 } 720 721 func (l *protoLex) Error(s string) { 722 _ = l.addSourceError(errors.New(s)) 723 }