github.com/whiteCcinn/protobuf-go@v1.0.9/internal/encoding/text/decode.go (about) 1 // Copyright 2018 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package text 6 7 import ( 8 "bytes" 9 "fmt" 10 "io" 11 "strconv" 12 "unicode/utf8" 13 14 "github.com/whiteCcinn/protobuf-go/internal/errors" 15 ) 16 17 // Decoder is a token-based textproto decoder. 18 type Decoder struct { 19 // lastCall is last method called, either readCall or peekCall. 20 // Initial value is readCall. 21 lastCall call 22 23 // lastToken contains the last read token. 24 lastToken Token 25 26 // lastErr contains the last read error. 27 lastErr error 28 29 // openStack is a stack containing the byte characters for MessageOpen and 30 // ListOpen kinds. The top of stack represents the message or the list that 31 // the current token is nested in. An empty stack means the current token is 32 // at the top level message. The characters '{' and '<' both represent the 33 // MessageOpen kind. 34 openStack []byte 35 36 // orig is used in reporting line and column. 37 orig []byte 38 // in contains the unconsumed input. 39 in []byte 40 } 41 42 // NewDecoder returns a Decoder to read the given []byte. 43 func NewDecoder(b []byte) *Decoder { 44 return &Decoder{orig: b, in: b} 45 } 46 47 // ErrUnexpectedEOF means that EOF was encountered in the middle of the input. 48 var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF) 49 50 // call specifies which Decoder method was invoked. 51 type call uint8 52 53 const ( 54 readCall call = iota 55 peekCall 56 ) 57 58 // Peek looks ahead and returns the next token and error without advancing a read. 59 func (d *Decoder) Peek() (Token, error) { 60 defer func() { d.lastCall = peekCall }() 61 if d.lastCall == readCall { 62 d.lastToken, d.lastErr = d.Read() 63 } 64 return d.lastToken, d.lastErr 65 } 66 67 // Read returns the next token. 68 // It will return an error if there is no valid token. 69 func (d *Decoder) Read() (Token, error) { 70 defer func() { d.lastCall = readCall }() 71 if d.lastCall == peekCall { 72 return d.lastToken, d.lastErr 73 } 74 75 tok, err := d.parseNext(d.lastToken.kind) 76 if err != nil { 77 return Token{}, err 78 } 79 80 switch tok.kind { 81 case comma, semicolon: 82 tok, err = d.parseNext(tok.kind) 83 if err != nil { 84 return Token{}, err 85 } 86 } 87 d.lastToken = tok 88 return tok, nil 89 } 90 91 const ( 92 mismatchedFmt = "mismatched close character %q" 93 unexpectedFmt = "unexpected character %q" 94 ) 95 96 // parseNext parses the next Token based on given last kind. 97 func (d *Decoder) parseNext(lastKind Kind) (Token, error) { 98 // Trim leading spaces. 99 d.consume(0) 100 isEOF := false 101 if len(d.in) == 0 { 102 isEOF = true 103 } 104 105 switch lastKind { 106 case EOF: 107 return d.consumeToken(EOF, 0, 0), nil 108 109 case bof: 110 // Start of top level message. Next token can be EOF or Name. 111 if isEOF { 112 return d.consumeToken(EOF, 0, 0), nil 113 } 114 return d.parseFieldName() 115 116 case Name: 117 // Next token can be MessageOpen, ListOpen or Scalar. 118 if isEOF { 119 return Token{}, ErrUnexpectedEOF 120 } 121 switch ch := d.in[0]; ch { 122 case '{', '<': 123 d.pushOpenStack(ch) 124 return d.consumeToken(MessageOpen, 1, 0), nil 125 case '[': 126 d.pushOpenStack(ch) 127 return d.consumeToken(ListOpen, 1, 0), nil 128 default: 129 return d.parseScalar() 130 } 131 132 case Scalar: 133 openKind, closeCh := d.currentOpenKind() 134 switch openKind { 135 case bof: 136 // Top level message. 137 // Next token can be EOF, comma, semicolon or Name. 138 if isEOF { 139 return d.consumeToken(EOF, 0, 0), nil 140 } 141 switch d.in[0] { 142 case ',': 143 return d.consumeToken(comma, 1, 0), nil 144 case ';': 145 return d.consumeToken(semicolon, 1, 0), nil 146 default: 147 return d.parseFieldName() 148 } 149 150 case MessageOpen: 151 // Next token can be MessageClose, comma, semicolon or Name. 152 if isEOF { 153 return Token{}, ErrUnexpectedEOF 154 } 155 switch ch := d.in[0]; ch { 156 case closeCh: 157 d.popOpenStack() 158 return d.consumeToken(MessageClose, 1, 0), nil 159 case otherCloseChar[closeCh]: 160 return Token{}, d.newSyntaxError(mismatchedFmt, ch) 161 case ',': 162 return d.consumeToken(comma, 1, 0), nil 163 case ';': 164 return d.consumeToken(semicolon, 1, 0), nil 165 default: 166 return d.parseFieldName() 167 } 168 169 case ListOpen: 170 // Next token can be ListClose or comma. 171 if isEOF { 172 return Token{}, ErrUnexpectedEOF 173 } 174 switch ch := d.in[0]; ch { 175 case ']': 176 d.popOpenStack() 177 return d.consumeToken(ListClose, 1, 0), nil 178 case ',': 179 return d.consumeToken(comma, 1, 0), nil 180 default: 181 return Token{}, d.newSyntaxError(unexpectedFmt, ch) 182 } 183 } 184 185 case MessageOpen: 186 // Next token can be MessageClose or Name. 187 if isEOF { 188 return Token{}, ErrUnexpectedEOF 189 } 190 _, closeCh := d.currentOpenKind() 191 switch ch := d.in[0]; ch { 192 case closeCh: 193 d.popOpenStack() 194 return d.consumeToken(MessageClose, 1, 0), nil 195 case otherCloseChar[closeCh]: 196 return Token{}, d.newSyntaxError(mismatchedFmt, ch) 197 default: 198 return d.parseFieldName() 199 } 200 201 case MessageClose: 202 openKind, closeCh := d.currentOpenKind() 203 switch openKind { 204 case bof: 205 // Top level message. 206 // Next token can be EOF, comma, semicolon or Name. 207 if isEOF { 208 return d.consumeToken(EOF, 0, 0), nil 209 } 210 switch ch := d.in[0]; ch { 211 case ',': 212 return d.consumeToken(comma, 1, 0), nil 213 case ';': 214 return d.consumeToken(semicolon, 1, 0), nil 215 default: 216 return d.parseFieldName() 217 } 218 219 case MessageOpen: 220 // Next token can be MessageClose, comma, semicolon or Name. 221 if isEOF { 222 return Token{}, ErrUnexpectedEOF 223 } 224 switch ch := d.in[0]; ch { 225 case closeCh: 226 d.popOpenStack() 227 return d.consumeToken(MessageClose, 1, 0), nil 228 case otherCloseChar[closeCh]: 229 return Token{}, d.newSyntaxError(mismatchedFmt, ch) 230 case ',': 231 return d.consumeToken(comma, 1, 0), nil 232 case ';': 233 return d.consumeToken(semicolon, 1, 0), nil 234 default: 235 return d.parseFieldName() 236 } 237 238 case ListOpen: 239 // Next token can be ListClose or comma 240 if isEOF { 241 return Token{}, ErrUnexpectedEOF 242 } 243 switch ch := d.in[0]; ch { 244 case closeCh: 245 d.popOpenStack() 246 return d.consumeToken(ListClose, 1, 0), nil 247 case ',': 248 return d.consumeToken(comma, 1, 0), nil 249 default: 250 return Token{}, d.newSyntaxError(unexpectedFmt, ch) 251 } 252 } 253 254 case ListOpen: 255 // Next token can be ListClose, MessageStart or Scalar. 256 if isEOF { 257 return Token{}, ErrUnexpectedEOF 258 } 259 switch ch := d.in[0]; ch { 260 case ']': 261 d.popOpenStack() 262 return d.consumeToken(ListClose, 1, 0), nil 263 case '{', '<': 264 d.pushOpenStack(ch) 265 return d.consumeToken(MessageOpen, 1, 0), nil 266 default: 267 return d.parseScalar() 268 } 269 270 case ListClose: 271 openKind, closeCh := d.currentOpenKind() 272 switch openKind { 273 case bof: 274 // Top level message. 275 // Next token can be EOF, comma, semicolon or Name. 276 if isEOF { 277 return d.consumeToken(EOF, 0, 0), nil 278 } 279 switch ch := d.in[0]; ch { 280 case ',': 281 return d.consumeToken(comma, 1, 0), nil 282 case ';': 283 return d.consumeToken(semicolon, 1, 0), nil 284 default: 285 return d.parseFieldName() 286 } 287 288 case MessageOpen: 289 // Next token can be MessageClose, comma, semicolon or Name. 290 if isEOF { 291 return Token{}, ErrUnexpectedEOF 292 } 293 switch ch := d.in[0]; ch { 294 case closeCh: 295 d.popOpenStack() 296 return d.consumeToken(MessageClose, 1, 0), nil 297 case otherCloseChar[closeCh]: 298 return Token{}, d.newSyntaxError(mismatchedFmt, ch) 299 case ',': 300 return d.consumeToken(comma, 1, 0), nil 301 case ';': 302 return d.consumeToken(semicolon, 1, 0), nil 303 default: 304 return d.parseFieldName() 305 } 306 307 default: 308 // It is not possible to have this case. Let it panic below. 309 } 310 311 case comma, semicolon: 312 openKind, closeCh := d.currentOpenKind() 313 switch openKind { 314 case bof: 315 // Top level message. Next token can be EOF or Name. 316 if isEOF { 317 return d.consumeToken(EOF, 0, 0), nil 318 } 319 return d.parseFieldName() 320 321 case MessageOpen: 322 // Next token can be MessageClose or Name. 323 if isEOF { 324 return Token{}, ErrUnexpectedEOF 325 } 326 switch ch := d.in[0]; ch { 327 case closeCh: 328 d.popOpenStack() 329 return d.consumeToken(MessageClose, 1, 0), nil 330 case otherCloseChar[closeCh]: 331 return Token{}, d.newSyntaxError(mismatchedFmt, ch) 332 default: 333 return d.parseFieldName() 334 } 335 336 case ListOpen: 337 if lastKind == semicolon { 338 // It is not be possible to have this case as logic here 339 // should not have produced a semicolon Token when inside a 340 // list. Let it panic below. 341 break 342 } 343 // Next token can be MessageOpen or Scalar. 344 if isEOF { 345 return Token{}, ErrUnexpectedEOF 346 } 347 switch ch := d.in[0]; ch { 348 case '{', '<': 349 d.pushOpenStack(ch) 350 return d.consumeToken(MessageOpen, 1, 0), nil 351 default: 352 return d.parseScalar() 353 } 354 } 355 } 356 357 line, column := d.Position(len(d.orig) - len(d.in)) 358 panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind)) 359 } 360 361 var otherCloseChar = map[byte]byte{ 362 '}': '>', 363 '>': '}', 364 } 365 366 // currentOpenKind indicates whether current position is inside a message, list 367 // or top-level message by returning MessageOpen, ListOpen or bof respectively. 368 // If the returned kind is either a MessageOpen or ListOpen, it also returns the 369 // corresponding closing character. 370 func (d *Decoder) currentOpenKind() (Kind, byte) { 371 if len(d.openStack) == 0 { 372 return bof, 0 373 } 374 openCh := d.openStack[len(d.openStack)-1] 375 switch openCh { 376 case '{': 377 return MessageOpen, '}' 378 case '<': 379 return MessageOpen, '>' 380 case '[': 381 return ListOpen, ']' 382 } 383 panic(fmt.Sprintf("Decoder: openStack contains invalid byte %c", openCh)) 384 } 385 386 func (d *Decoder) pushOpenStack(ch byte) { 387 d.openStack = append(d.openStack, ch) 388 } 389 390 func (d *Decoder) popOpenStack() { 391 d.openStack = d.openStack[:len(d.openStack)-1] 392 } 393 394 // parseFieldName parses field name and separator. 395 func (d *Decoder) parseFieldName() (tok Token, err error) { 396 defer func() { 397 if err == nil && d.tryConsumeChar(':') { 398 tok.attrs |= hasSeparator 399 } 400 }() 401 402 // Extension or Any type URL. 403 if d.in[0] == '[' { 404 return d.parseTypeName() 405 } 406 407 // Identifier. 408 if size := parseIdent(d.in, false); size > 0 { 409 return d.consumeToken(Name, size, uint8(IdentName)), nil 410 } 411 412 // Field number. Identify if input is a valid number that is not negative 413 // and is decimal integer within 32-bit range. 414 if num := parseNumber(d.in); num.size > 0 { 415 if !num.neg && num.kind == numDec { 416 if _, err := strconv.ParseInt(string(d.in[:num.size]), 10, 32); err == nil { 417 return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil 418 } 419 } 420 return Token{}, d.newSyntaxError("invalid field number: %s", d.in[:num.size]) 421 } 422 423 return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in)) 424 } 425 426 // parseTypeName parses Any type URL or extension field name. The name is 427 // enclosed in [ and ] characters. The C++ parser does not handle many legal URL 428 // strings. This implementation is more liberal and allows for the pattern 429 // ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`). Whitespaces and comments are allowed 430 // in between [ ], '.', '/' and the sub names. 431 func (d *Decoder) parseTypeName() (Token, error) { 432 startPos := len(d.orig) - len(d.in) 433 // Use alias s to advance first in order to use d.in for error handling. 434 // Caller already checks for [ as first character. 435 s := consume(d.in[1:], 0) 436 if len(s) == 0 { 437 return Token{}, ErrUnexpectedEOF 438 } 439 440 var name []byte 441 for len(s) > 0 && isTypeNameChar(s[0]) { 442 name = append(name, s[0]) 443 s = s[1:] 444 } 445 s = consume(s, 0) 446 447 var closed bool 448 for len(s) > 0 && !closed { 449 switch { 450 case s[0] == ']': 451 s = s[1:] 452 closed = true 453 454 case s[0] == '/', s[0] == '.': 455 if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') { 456 return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s", 457 d.orig[startPos:len(d.orig)-len(s)+1]) 458 } 459 name = append(name, s[0]) 460 s = s[1:] 461 s = consume(s, 0) 462 for len(s) > 0 && isTypeNameChar(s[0]) { 463 name = append(name, s[0]) 464 s = s[1:] 465 } 466 s = consume(s, 0) 467 468 default: 469 return Token{}, d.newSyntaxError( 470 "invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1]) 471 } 472 } 473 474 if !closed { 475 return Token{}, ErrUnexpectedEOF 476 } 477 478 // First character cannot be '.'. Last character cannot be '.' or '/'. 479 size := len(name) 480 if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' { 481 return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s", 482 d.orig[startPos:len(d.orig)-len(s)]) 483 } 484 485 d.in = s 486 endPos := len(d.orig) - len(d.in) 487 d.consume(0) 488 489 return Token{ 490 kind: Name, 491 attrs: uint8(TypeName), 492 pos: startPos, 493 raw: d.orig[startPos:endPos], 494 str: string(name), 495 }, nil 496 } 497 498 func isTypeNameChar(b byte) bool { 499 return (b == '-' || b == '_' || 500 ('0' <= b && b <= '9') || 501 ('a' <= b && b <= 'z') || 502 ('A' <= b && b <= 'Z')) 503 } 504 505 func isWhiteSpace(b byte) bool { 506 switch b { 507 case ' ', '\n', '\r', '\t': 508 return true 509 default: 510 return false 511 } 512 } 513 514 // parseIdent parses an unquoted proto identifier and returns size. 515 // If allowNeg is true, it allows '-' to be the first character in the 516 // identifier. This is used when parsing literal values like -infinity, etc. 517 // Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*` 518 func parseIdent(input []byte, allowNeg bool) int { 519 var size int 520 521 s := input 522 if len(s) == 0 { 523 return 0 524 } 525 526 if allowNeg && s[0] == '-' { 527 s = s[1:] 528 size++ 529 if len(s) == 0 { 530 return 0 531 } 532 } 533 534 switch { 535 case s[0] == '_', 536 'a' <= s[0] && s[0] <= 'z', 537 'A' <= s[0] && s[0] <= 'Z': 538 s = s[1:] 539 size++ 540 default: 541 return 0 542 } 543 544 for len(s) > 0 && (s[0] == '_' || 545 'a' <= s[0] && s[0] <= 'z' || 546 'A' <= s[0] && s[0] <= 'Z' || 547 '0' <= s[0] && s[0] <= '9') { 548 s = s[1:] 549 size++ 550 } 551 552 if len(s) > 0 && !isDelim(s[0]) { 553 return 0 554 } 555 556 return size 557 } 558 559 // parseScalar parses for a string, literal or number value. 560 func (d *Decoder) parseScalar() (Token, error) { 561 if d.in[0] == '"' || d.in[0] == '\'' { 562 return d.parseStringValue() 563 } 564 565 if tok, ok := d.parseLiteralValue(); ok { 566 return tok, nil 567 } 568 569 if tok, ok := d.parseNumberValue(); ok { 570 return tok, nil 571 } 572 573 return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in)) 574 } 575 576 // parseLiteralValue parses a literal value. A literal value is used for 577 // bools, special floats and enums. This function simply identifies that the 578 // field value is a literal. 579 func (d *Decoder) parseLiteralValue() (Token, bool) { 580 size := parseIdent(d.in, true) 581 if size == 0 { 582 return Token{}, false 583 } 584 return d.consumeToken(Scalar, size, literalValue), true 585 } 586 587 // consumeToken constructs a Token for given Kind from d.in and consumes given 588 // size-length from it. 589 func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token { 590 // Important to compute raw and pos before consuming. 591 tok := Token{ 592 kind: kind, 593 attrs: attrs, 594 pos: len(d.orig) - len(d.in), 595 raw: d.in[:size], 596 } 597 d.consume(size) 598 return tok 599 } 600 601 // newSyntaxError returns a syntax error with line and column information for 602 // current position. 603 func (d *Decoder) newSyntaxError(f string, x ...interface{}) error { 604 e := errors.New(f, x...) 605 line, column := d.Position(len(d.orig) - len(d.in)) 606 return errors.New("syntax error (line %d:%d): %v", line, column, e) 607 } 608 609 // Position returns line and column number of given index of the original input. 610 // It will panic if index is out of range. 611 func (d *Decoder) Position(idx int) (line int, column int) { 612 b := d.orig[:idx] 613 line = bytes.Count(b, []byte("\n")) + 1 614 if i := bytes.LastIndexByte(b, '\n'); i >= 0 { 615 b = b[i+1:] 616 } 617 column = utf8.RuneCount(b) + 1 // ignore multi-rune characters 618 return line, column 619 } 620 621 func (d *Decoder) tryConsumeChar(c byte) bool { 622 if len(d.in) > 0 && d.in[0] == c { 623 d.consume(1) 624 return true 625 } 626 return false 627 } 628 629 // consume consumes n bytes of input and any subsequent whitespace or comments. 630 func (d *Decoder) consume(n int) { 631 d.in = consume(d.in, n) 632 return 633 } 634 635 // consume consumes n bytes of input and any subsequent whitespace or comments. 636 func consume(b []byte, n int) []byte { 637 b = b[n:] 638 for len(b) > 0 { 639 switch b[0] { 640 case ' ', '\n', '\r', '\t': 641 b = b[1:] 642 case '#': 643 if i := bytes.IndexByte(b, '\n'); i >= 0 { 644 b = b[i+len("\n"):] 645 } else { 646 b = nil 647 } 648 default: 649 return b 650 } 651 } 652 return b 653 } 654 655 // errId extracts a byte sequence that looks like an invalid ID 656 // (for the purposes of error reporting). 657 func errId(seq []byte) []byte { 658 const maxLen = 32 659 for i := 0; i < len(seq); { 660 if i > maxLen { 661 return append(seq[:i:i], "…"...) 662 } 663 r, size := utf8.DecodeRune(seq[i:]) 664 if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) { 665 if i == 0 { 666 // Either the first byte is invalid UTF-8 or a 667 // delimiter, or the first rune is non-ASCII. 668 // Return it as-is. 669 i = size 670 } 671 return seq[:i:i] 672 } 673 i += size 674 } 675 // No delimiter found. 676 return seq 677 } 678 679 // isDelim returns true if given byte is a delimiter character. 680 func isDelim(c byte) bool { 681 return !(c == '-' || c == '+' || c == '.' || c == '_' || 682 ('a' <= c && c <= 'z') || 683 ('A' <= c && c <= 'Z') || 684 ('0' <= c && c <= '9')) 685 }