github.com/segmentio/encoding@v0.4.0/json/parse.go (about) 1 package json 2 3 import ( 4 "bytes" 5 "encoding/binary" 6 "math" 7 "math/bits" 8 "reflect" 9 "unicode" 10 "unicode/utf16" 11 "unicode/utf8" 12 13 "github.com/segmentio/encoding/ascii" 14 ) 15 16 // All spaces characters defined in the json specification. 17 const ( 18 sp = ' ' 19 ht = '\t' 20 nl = '\n' 21 cr = '\r' 22 ) 23 24 const ( 25 escape = '\\' 26 quote = '"' 27 ) 28 29 func internalParseFlags(b []byte) (flags ParseFlags) { 30 // Don't consider surrounding whitespace 31 b = skipSpaces(b) 32 b = trimTrailingSpaces(b) 33 if ascii.ValidPrint(b) { 34 flags |= validAsciiPrint 35 } 36 if bytes.IndexByte(b, '\\') == -1 { 37 flags |= noBackslash 38 } 39 return 40 } 41 42 func skipSpaces(b []byte) []byte { 43 if len(b) > 0 && b[0] <= 0x20 { 44 b, _ = skipSpacesN(b) 45 } 46 return b 47 } 48 49 func skipSpacesN(b []byte) ([]byte, int) { 50 for i := range b { 51 switch b[i] { 52 case sp, ht, nl, cr: 53 default: 54 return b[i:], i 55 } 56 } 57 return nil, 0 58 } 59 60 func trimTrailingSpaces(b []byte) []byte { 61 if len(b) > 0 && b[len(b)-1] <= 0x20 { 62 b = trimTrailingSpacesN(b) 63 } 64 return b 65 } 66 67 func trimTrailingSpacesN(b []byte) []byte { 68 i := len(b) - 1 69 loop: 70 for ; i >= 0; i-- { 71 switch b[i] { 72 case sp, ht, nl, cr: 73 default: 74 break loop 75 } 76 } 77 return b[:i+1] 78 } 79 80 // parseInt parses a decimal representation of an int64 from b. 81 // 82 // The function is equivalent to calling strconv.ParseInt(string(b), 10, 64) but 83 // it prevents Go from making a memory allocation for converting a byte slice to 84 // a string (escape analysis fails due to the error returned by strconv.ParseInt). 85 // 86 // Because it only works with base 10 the function is also significantly faster 87 // than strconv.ParseInt. 88 func (d decoder) parseInt(b []byte, t reflect.Type) (int64, []byte, error) { 89 var value int64 90 var count int 91 92 if len(b) == 0 { 93 return 0, b, syntaxError(b, "cannot decode integer from an empty input") 94 } 95 96 if b[0] == '-' { 97 const max = math.MinInt64 98 const lim = max / 10 99 100 if len(b) == 1 { 101 return 0, b, syntaxError(b, "cannot decode integer from '-'") 102 } 103 104 if len(b) > 2 && b[1] == '0' && '0' <= b[2] && b[2] <= '9' { 105 return 0, b, syntaxError(b, "invalid leading character '0' in integer") 106 } 107 108 for _, c := range b[1:] { 109 if !(c >= '0' && c <= '9') { 110 if count == 0 { 111 b, err := d.inputError(b, t) 112 return 0, b, err 113 } 114 break 115 } 116 117 if value < lim { 118 return 0, b, unmarshalOverflow(b, t) 119 } 120 121 value *= 10 122 x := int64(c - '0') 123 124 if value < (max + x) { 125 return 0, b, unmarshalOverflow(b, t) 126 } 127 128 value -= x 129 count++ 130 } 131 132 count++ 133 } else { 134 if len(b) > 1 && b[0] == '0' && '0' <= b[1] && b[1] <= '9' { 135 return 0, b, syntaxError(b, "invalid leading character '0' in integer") 136 } 137 138 for ; count < len(b) && b[count] >= '0' && b[count] <= '9'; count++ { 139 x := int64(b[count] - '0') 140 next := value*10 + x 141 if next < value { 142 return 0, b, unmarshalOverflow(b, t) 143 } 144 value = next 145 } 146 147 if count == 0 { 148 b, err := d.inputError(b, t) 149 return 0, b, err 150 } 151 } 152 153 if count < len(b) { 154 switch b[count] { 155 case '.', 'e', 'E': // was this actually a float? 156 v, r, _, err := d.parseNumber(b) 157 if err != nil { 158 v, r = b[:count+1], b[count+1:] 159 } 160 return 0, r, unmarshalTypeError(v, t) 161 } 162 } 163 164 return value, b[count:], nil 165 } 166 167 // parseUint is like parseInt but for unsigned integers. 168 func (d decoder) parseUint(b []byte, t reflect.Type) (uint64, []byte, error) { 169 var value uint64 170 var count int 171 172 if len(b) == 0 { 173 return 0, b, syntaxError(b, "cannot decode integer value from an empty input") 174 } 175 176 if len(b) > 1 && b[0] == '0' && '0' <= b[1] && b[1] <= '9' { 177 return 0, b, syntaxError(b, "invalid leading character '0' in integer") 178 } 179 180 for ; count < len(b) && b[count] >= '0' && b[count] <= '9'; count++ { 181 x := uint64(b[count] - '0') 182 next := value*10 + x 183 if next < value { 184 return 0, b, unmarshalOverflow(b, t) 185 } 186 value = next 187 } 188 189 if count == 0 { 190 b, err := d.inputError(b, t) 191 return 0, b, err 192 } 193 194 if count < len(b) { 195 switch b[count] { 196 case '.', 'e', 'E': // was this actually a float? 197 v, r, _, err := d.parseNumber(b) 198 if err != nil { 199 v, r = b[:count+1], b[count+1:] 200 } 201 return 0, r, unmarshalTypeError(v, t) 202 } 203 } 204 205 return value, b[count:], nil 206 } 207 208 // parseUintHex parses a hexadecimanl representation of a uint64 from b. 209 // 210 // The function is equivalent to calling strconv.ParseUint(string(b), 16, 64) but 211 // it prevents Go from making a memory allocation for converting a byte slice to 212 // a string (escape analysis fails due to the error returned by strconv.ParseUint). 213 // 214 // Because it only works with base 16 the function is also significantly faster 215 // than strconv.ParseUint. 216 func (d decoder) parseUintHex(b []byte) (uint64, []byte, error) { 217 const max = math.MaxUint64 218 const lim = max / 0x10 219 220 var value uint64 221 var count int 222 223 if len(b) == 0 { 224 return 0, b, syntaxError(b, "cannot decode hexadecimal value from an empty input") 225 } 226 227 parseLoop: 228 for i, c := range b { 229 var x uint64 230 231 switch { 232 case c >= '0' && c <= '9': 233 x = uint64(c - '0') 234 235 case c >= 'A' && c <= 'F': 236 x = uint64(c-'A') + 0xA 237 238 case c >= 'a' && c <= 'f': 239 x = uint64(c-'a') + 0xA 240 241 default: 242 if i == 0 { 243 return 0, b, syntaxError(b, "expected hexadecimal digit but found '%c'", c) 244 } 245 break parseLoop 246 } 247 248 if value > lim { 249 return 0, b, syntaxError(b, "hexadecimal value out of range") 250 } 251 252 if value *= 0x10; value > (max - x) { 253 return 0, b, syntaxError(b, "hexadecimal value out of range") 254 } 255 256 value += x 257 count++ 258 } 259 260 return value, b[count:], nil 261 } 262 263 func (d decoder) parseNull(b []byte) ([]byte, []byte, Kind, error) { 264 if hasNullPrefix(b) { 265 return b[:4], b[4:], Null, nil 266 } 267 if len(b) < 4 { 268 return nil, b[len(b):], Undefined, unexpectedEOF(b) 269 } 270 return nil, b, Undefined, syntaxError(b, "expected 'null' but found invalid token") 271 } 272 273 func (d decoder) parseTrue(b []byte) ([]byte, []byte, Kind, error) { 274 if hasTruePrefix(b) { 275 return b[:4], b[4:], True, nil 276 } 277 if len(b) < 4 { 278 return nil, b[len(b):], Undefined, unexpectedEOF(b) 279 } 280 return nil, b, Undefined, syntaxError(b, "expected 'true' but found invalid token") 281 } 282 283 func (d decoder) parseFalse(b []byte) ([]byte, []byte, Kind, error) { 284 if hasFalsePrefix(b) { 285 return b[:5], b[5:], False, nil 286 } 287 if len(b) < 5 { 288 return nil, b[len(b):], Undefined, unexpectedEOF(b) 289 } 290 return nil, b, Undefined, syntaxError(b, "expected 'false' but found invalid token") 291 } 292 293 func (d decoder) parseNumber(b []byte) (v, r []byte, kind Kind, err error) { 294 if len(b) == 0 { 295 r, err = b, unexpectedEOF(b) 296 return 297 } 298 299 // Assume it's an unsigned integer at first. 300 kind = Uint 301 302 i := 0 303 // sign 304 if b[i] == '-' { 305 kind = Int 306 i++ 307 } 308 309 if i == len(b) { 310 r, err = b[i:], syntaxError(b, "missing number value after sign") 311 return 312 } 313 314 if b[i] < '0' || b[i] > '9' { 315 r, err = b[i:], syntaxError(b, "expected digit but got '%c'", b[i]) 316 return 317 } 318 319 // integer part 320 if b[i] == '0' { 321 i++ 322 if i == len(b) || (b[i] != '.' && b[i] != 'e' && b[i] != 'E') { 323 v, r = b[:i], b[i:] 324 return 325 } 326 if '0' <= b[i] && b[i] <= '9' { 327 r, err = b[i:], syntaxError(b, "cannot decode number with leading '0' character") 328 return 329 } 330 } 331 332 for i < len(b) && '0' <= b[i] && b[i] <= '9' { 333 i++ 334 } 335 336 // decimal part 337 if i < len(b) && b[i] == '.' { 338 kind = Float 339 i++ 340 decimalStart := i 341 342 for i < len(b) { 343 if c := b[i]; !('0' <= c && c <= '9') { 344 if i == decimalStart { 345 r, err = b[i:], syntaxError(b, "expected digit but found '%c'", c) 346 return 347 } 348 break 349 } 350 i++ 351 } 352 353 if i == decimalStart { 354 r, err = b[i:], syntaxError(b, "expected decimal part after '.'") 355 return 356 } 357 } 358 359 // exponent part 360 if i < len(b) && (b[i] == 'e' || b[i] == 'E') { 361 kind = Float 362 i++ 363 364 if i < len(b) { 365 if c := b[i]; c == '+' || c == '-' { 366 i++ 367 } 368 } 369 370 if i == len(b) { 371 r, err = b[i:], syntaxError(b, "missing exponent in number") 372 return 373 } 374 375 exponentStart := i 376 377 for i < len(b) { 378 if c := b[i]; !('0' <= c && c <= '9') { 379 if i == exponentStart { 380 err = syntaxError(b, "expected digit but found '%c'", c) 381 return 382 } 383 break 384 } 385 i++ 386 } 387 } 388 389 v, r = b[:i], b[i:] 390 return 391 } 392 393 func (d decoder) parseUnicode(b []byte) (rune, int, error) { 394 if len(b) < 4 { 395 return 0, len(b), syntaxError(b, "unicode code point must have at least 4 characters") 396 } 397 398 u, r, err := d.parseUintHex(b[:4]) 399 if err != nil { 400 return 0, 4, syntaxError(b, "parsing unicode code point: %s", err) 401 } 402 403 if len(r) != 0 { 404 return 0, 4, syntaxError(b, "invalid unicode code point") 405 } 406 407 return rune(u), 4, nil 408 } 409 410 func (d decoder) parseString(b []byte) ([]byte, []byte, Kind, error) { 411 if len(b) < 2 { 412 return nil, b[len(b):], Undefined, unexpectedEOF(b) 413 } 414 if b[0] != '"' { 415 return nil, b, Undefined, syntaxError(b, "expected '\"' at the beginning of a string value") 416 } 417 418 var n int 419 if len(b) >= 9 { 420 // This is an optimization for short strings. We read 8/16 bytes, 421 // and XOR each with 0x22 (") so that these bytes (and only 422 // these bytes) are now zero. We use the hasless(u,1) trick 423 // from https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord 424 // to determine whether any bytes are zero. Finally, we CTZ 425 // to find the index of that byte. 426 const mask1 = 0x2222222222222222 427 const mask2 = 0x0101010101010101 428 const mask3 = 0x8080808080808080 429 u := binary.LittleEndian.Uint64(b[1:]) ^ mask1 430 if mask := (u - mask2) & ^u & mask3; mask != 0 { 431 n = bits.TrailingZeros64(mask)/8 + 2 432 goto found 433 } 434 if len(b) >= 17 { 435 u = binary.LittleEndian.Uint64(b[9:]) ^ mask1 436 if mask := (u - mask2) & ^u & mask3; mask != 0 { 437 n = bits.TrailingZeros64(mask)/8 + 10 438 goto found 439 } 440 } 441 } 442 n = bytes.IndexByte(b[1:], '"') + 2 443 if n <= 1 { 444 return nil, b[len(b):], Undefined, syntaxError(b, "missing '\"' at the end of a string value") 445 } 446 found: 447 if (d.flags.has(noBackslash) || bytes.IndexByte(b[1:n], '\\') < 0) && 448 (d.flags.has(validAsciiPrint) || ascii.ValidPrint(b[1:n])) { 449 return b[:n], b[n:], Unescaped, nil 450 } 451 452 for i := 1; i < len(b); i++ { 453 switch b[i] { 454 case '\\': 455 if i++; i < len(b) { 456 switch b[i] { 457 case '"', '\\', '/', 'n', 'r', 't', 'f', 'b': 458 case 'u': 459 _, n, err := d.parseUnicode(b[i+1:]) 460 if err != nil { 461 return nil, b[i+1+n:], Undefined, err 462 } 463 i += n 464 default: 465 return nil, b, Undefined, syntaxError(b, "invalid character '%c' in string escape code", b[i]) 466 } 467 } 468 469 case '"': 470 return b[:i+1], b[i+1:], String, nil 471 472 default: 473 if b[i] < 0x20 { 474 return nil, b, Undefined, syntaxError(b, "invalid character '%c' in string escape code", b[i]) 475 } 476 } 477 } 478 479 return nil, b[len(b):], Undefined, syntaxError(b, "missing '\"' at the end of a string value") 480 } 481 482 func (d decoder) parseStringUnquote(b []byte, r []byte) ([]byte, []byte, bool, error) { 483 s, b, k, err := d.parseString(b) 484 if err != nil { 485 return s, b, false, err 486 } 487 488 s = s[1 : len(s)-1] // trim the quotes 489 490 if k == Unescaped { 491 return s, b, false, nil 492 } 493 494 if r == nil { 495 r = make([]byte, 0, len(s)) 496 } 497 498 for len(s) != 0 { 499 i := bytes.IndexByte(s, '\\') 500 501 if i < 0 { 502 r = appendCoerceInvalidUTF8(r, s) 503 break 504 } 505 506 r = appendCoerceInvalidUTF8(r, s[:i]) 507 s = s[i+1:] 508 509 c := s[0] 510 switch c { 511 case '"', '\\', '/': 512 // simple escaped character 513 case 'n': 514 c = '\n' 515 516 case 'r': 517 c = '\r' 518 519 case 't': 520 c = '\t' 521 522 case 'b': 523 c = '\b' 524 525 case 'f': 526 c = '\f' 527 528 case 'u': 529 s = s[1:] 530 531 r1, n1, err := d.parseUnicode(s) 532 if err != nil { 533 return r, b, true, err 534 } 535 s = s[n1:] 536 537 if utf16.IsSurrogate(r1) { 538 if !hasPrefix(s, `\u`) { 539 r1 = unicode.ReplacementChar 540 } else { 541 r2, n2, err := d.parseUnicode(s[2:]) 542 if err != nil { 543 return r, b, true, err 544 } 545 if r1 = utf16.DecodeRune(r1, r2); r1 != unicode.ReplacementChar { 546 s = s[2+n2:] 547 } 548 } 549 } 550 551 r = appendRune(r, r1) 552 continue 553 554 default: // not sure what this escape sequence is 555 return r, b, false, syntaxError(s, "invalid character '%c' in string escape code", c) 556 } 557 558 r = append(r, c) 559 s = s[1:] 560 } 561 562 return r, b, true, nil 563 } 564 565 func appendRune(b []byte, r rune) []byte { 566 n := len(b) 567 b = append(b, 0, 0, 0, 0) 568 return b[:n+utf8.EncodeRune(b[n:], r)] 569 } 570 571 func appendCoerceInvalidUTF8(b []byte, s []byte) []byte { 572 c := [4]byte{} 573 574 for _, r := range string(s) { 575 b = append(b, c[:utf8.EncodeRune(c[:], r)]...) 576 } 577 578 return b 579 } 580 581 func (d decoder) parseObject(b []byte) ([]byte, []byte, Kind, error) { 582 if len(b) < 2 { 583 return nil, b[len(b):], Undefined, unexpectedEOF(b) 584 } 585 586 if b[0] != '{' { 587 return nil, b, Undefined, syntaxError(b, "expected '{' at the beginning of an object value") 588 } 589 590 var err error 591 var a = b 592 var n = len(b) 593 var i = 0 594 595 b = b[1:] 596 for { 597 b = skipSpaces(b) 598 599 if len(b) == 0 { 600 return nil, b, Undefined, syntaxError(b, "cannot decode object from empty input") 601 } 602 603 if b[0] == '}' { 604 j := (n - len(b)) + 1 605 return a[:j], a[j:], Object, nil 606 } 607 608 if i != 0 { 609 if len(b) == 0 { 610 return nil, b, Undefined, syntaxError(b, "unexpected EOF after object field value") 611 } 612 if b[0] != ',' { 613 return nil, b, Undefined, syntaxError(b, "expected ',' after object field value but found '%c'", b[0]) 614 } 615 b = skipSpaces(b[1:]) 616 if len(b) == 0 { 617 return nil, b, Undefined, unexpectedEOF(b) 618 } 619 if b[0] == '}' { 620 return nil, b, Undefined, syntaxError(b, "unexpected trailing comma after object field") 621 } 622 } 623 624 _, b, _, err = d.parseString(b) 625 if err != nil { 626 return nil, b, Undefined, err 627 } 628 b = skipSpaces(b) 629 630 if len(b) == 0 { 631 return nil, b, Undefined, syntaxError(b, "unexpected EOF after object field key") 632 } 633 if b[0] != ':' { 634 return nil, b, Undefined, syntaxError(b, "expected ':' after object field key but found '%c'", b[0]) 635 } 636 b = skipSpaces(b[1:]) 637 638 _, b, _, err = d.parseValue(b) 639 if err != nil { 640 return nil, b, Undefined, err 641 } 642 643 i++ 644 } 645 } 646 647 func (d decoder) parseArray(b []byte) ([]byte, []byte, Kind, error) { 648 if len(b) < 2 { 649 return nil, b[len(b):], Undefined, unexpectedEOF(b) 650 } 651 652 if b[0] != '[' { 653 return nil, b, Undefined, syntaxError(b, "expected '[' at the beginning of array value") 654 } 655 656 var err error 657 var a = b 658 var n = len(b) 659 var i = 0 660 661 b = b[1:] 662 for { 663 b = skipSpaces(b) 664 665 if len(b) == 0 { 666 return nil, b, Undefined, syntaxError(b, "missing closing ']' after array value") 667 } 668 669 if b[0] == ']' { 670 j := (n - len(b)) + 1 671 return a[:j], a[j:], Array, nil 672 } 673 674 if i != 0 { 675 if len(b) == 0 { 676 return nil, b, Undefined, syntaxError(b, "unexpected EOF after array element") 677 } 678 if b[0] != ',' { 679 return nil, b, Undefined, syntaxError(b, "expected ',' after array element but found '%c'", b[0]) 680 } 681 b = skipSpaces(b[1:]) 682 if len(b) == 0 { 683 return nil, b, Undefined, unexpectedEOF(b) 684 } 685 if b[0] == ']' { 686 return nil, b, Undefined, syntaxError(b, "unexpected trailing comma after object field") 687 } 688 } 689 690 _, b, _, err = d.parseValue(b) 691 if err != nil { 692 return nil, b, Undefined, err 693 } 694 695 i++ 696 } 697 } 698 699 func (d decoder) parseValue(b []byte) ([]byte, []byte, Kind, error) { 700 if len(b) == 0 { 701 return nil, b, Undefined, syntaxError(b, "unexpected end of JSON input") 702 } 703 704 var v []byte 705 var k Kind 706 var err error 707 708 switch b[0] { 709 case '{': 710 v, b, k, err = d.parseObject(b) 711 case '[': 712 v, b, k, err = d.parseArray(b) 713 case '"': 714 v, b, k, err = d.parseString(b) 715 case 'n': 716 v, b, k, err = d.parseNull(b) 717 case 't': 718 v, b, k, err = d.parseTrue(b) 719 case 'f': 720 v, b, k, err = d.parseFalse(b) 721 case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 722 v, b, k, err = d.parseNumber(b) 723 default: 724 err = syntaxError(b, "invalid character '%c' looking for beginning of value", b[0]) 725 } 726 727 return v, b, k, err 728 } 729 730 func hasNullPrefix(b []byte) bool { 731 return len(b) >= 4 && string(b[:4]) == "null" 732 } 733 734 func hasTruePrefix(b []byte) bool { 735 return len(b) >= 4 && string(b[:4]) == "true" 736 } 737 738 func hasFalsePrefix(b []byte) bool { 739 return len(b) >= 5 && string(b[:5]) == "false" 740 } 741 742 func hasPrefix(b []byte, s string) bool { 743 return len(b) >= len(s) && s == string(b[:len(s)]) 744 } 745 746 func hasLeadingSign(b []byte) bool { 747 return len(b) > 0 && (b[0] == '+' || b[0] == '-') 748 } 749 750 func hasLeadingZeroes(b []byte) bool { 751 if hasLeadingSign(b) { 752 b = b[1:] 753 } 754 return len(b) > 1 && b[0] == '0' && '0' <= b[1] && b[1] <= '9' 755 } 756 757 func appendToLower(b, s []byte) []byte { 758 if ascii.Valid(s) { // fast path for ascii strings 759 i := 0 760 761 for j := range s { 762 c := s[j] 763 764 if 'A' <= c && c <= 'Z' { 765 b = append(b, s[i:j]...) 766 b = append(b, c+('a'-'A')) 767 i = j + 1 768 } 769 } 770 771 return append(b, s[i:]...) 772 } 773 774 for _, r := range string(s) { 775 b = appendRune(b, foldRune(r)) 776 } 777 778 return b 779 } 780 781 func foldRune(r rune) rune { 782 if r = unicode.SimpleFold(r); 'A' <= r && r <= 'Z' { 783 r = r + ('a' - 'A') 784 } 785 return r 786 }