github.com/urso/go-structform@v0.0.2/json/parse.go (about) 1 package json 2 3 import ( 4 "bytes" 5 "errors" 6 "io" 7 "strconv" 8 "unicode" 9 "unicode/utf16" 10 "unicode/utf8" 11 12 structform "github.com/urso/go-structform" 13 ) 14 15 type Parser struct { 16 visitor structform.Visitor 17 strVisitor structform.StringRefVisitor 18 19 // last fail state 20 err error 21 22 // parser state machine 23 states []state // state stack for nested arrays/objects 24 currentState state 25 26 // preallocate stack memory for up to 32 nested arrays/objects 27 statesBuf [32]state 28 29 literalBuffer []byte 30 literalBuffer0 [64]byte 31 isDouble bool 32 inEscape bool 33 required int 34 } 35 36 var ( 37 errFailing = errors.New("JSON parser failed") 38 errIncomplete = errors.New("Incomplete JSON input") 39 errUnknownChar = errors.New("unknown character") 40 errQuoteMissing = errors.New("missing closing quote") 41 errExpectColon = errors.New("expected ':' after map key") 42 errUnexpectedDictClose = errors.New("unexpected '}'") 43 errUnexpectedArrClose = errors.New("unexpected ']'") 44 errExpectedDigit = errors.New("expected a digit") 45 errExpectedObject = errors.New("expected JSON object") 46 errExpectedArray = errors.New("expected JSON array") 47 errExpectedFieldName = errors.New("expected JSON object field name") 48 errExpectedInteger = errors.New("expected integer value") 49 errExpectedNull = errors.New("expected null value") 50 errExpectedFalse = errors.New("expected false value") 51 errExpectedTrue = errors.New("expected true value") 52 errExpectedArrayField = errors.New("expected ']' or ','") 53 errUnquoteInEscape = errors.New("incomplete escape at end of string") 54 errUnquoteInvalidChar = errors.New("invalid character found in string") 55 errUnquoteInvalidUnicode = errors.New("unicode escape is no hex number") 56 errUnquoteUnknownEscape = errors.New("unknown escape sequence") 57 ) 58 59 type state uint8 60 61 //go:generate stringer -type=state 62 const ( 63 failedState state = iota 64 startState 65 66 arrState 67 arrStateValue 68 arrStateNext 69 70 dictState 71 dictFieldState 72 dictNextFieldState 73 dictFieldValue 74 dictFieldValueSep 75 dictFieldStateEnd 76 77 nullState 78 trueState 79 falseState 80 stringState 81 numberState 82 ) 83 84 func ParseReader(in io.Reader, vs structform.Visitor) (int64, error) { 85 p := NewParser(vs) 86 i, err := io.Copy(p, in) 87 if err == nil { 88 err = p.finalize() 89 } 90 return i, err 91 } 92 93 func Parse(b []byte, vs structform.Visitor) error { 94 return NewParser(vs).Parse(b) 95 } 96 97 func ParseString(str string, vs structform.Visitor) error { 98 return NewParser(vs).ParseString(str) 99 } 100 101 func NewParser(vs structform.Visitor) *Parser { 102 p := &Parser{} 103 p.init(vs) 104 return p 105 } 106 107 func (p *Parser) init(vs structform.Visitor) { 108 *p = Parser{ 109 visitor: vs, 110 strVisitor: structform.MakeStringRefVisitor(vs), 111 currentState: startState, 112 } 113 p.states = p.statesBuf[:0] 114 p.literalBuffer = p.literalBuffer0[:0] 115 } 116 117 func (p *Parser) Parse(b []byte) error { 118 p.states = p.states[:0] 119 p.literalBuffer = p.literalBuffer[:0] 120 p.currentState = startState 121 122 p.err = p.feed(b) 123 if p.err == nil { 124 p.err = p.finalize() 125 } 126 return p.err 127 } 128 129 func (p *Parser) ParseString(str string) error { 130 return p.Parse(str2Bytes(str)) 131 } 132 133 func (p *Parser) Write(b []byte) (int, error) { 134 p.err = p.feed(b) 135 if p.err != nil { 136 return 0, p.err 137 } 138 return len(b), nil 139 } 140 141 func (p *Parser) feed(b []byte) error { 142 for len(b) > 0 { 143 n, _, err := p.feedUntil(b) 144 if err != nil { 145 return err 146 } 147 148 b = b[n:] 149 } 150 151 return nil 152 } 153 154 func (p *Parser) feedUntil(b []byte) (int, bool, error) { 155 var ( 156 err error 157 reported bool 158 orig = b 159 ) 160 161 for !reported && len(b) > 0 { 162 switch p.currentState { 163 case failedState: 164 if p.err == nil { 165 p.err = errors.New("invalid parser state") 166 } 167 return 0, false, p.err 168 case startState: 169 b, reported, err = p.stepStart(b) 170 171 case dictState: 172 b, reported, err = p.stepDict(b, true) 173 174 case dictNextFieldState: 175 b, reported, err = p.stepDict(b, false) 176 177 case dictFieldState: 178 b, err = p.stepDictKey(b) 179 180 case dictFieldValueSep: 181 if b = trimLeft(b); len(b) > 0 { 182 if b[0] != ':' { 183 err = errExpectColon 184 } 185 b = b[1:] 186 p.currentState = dictFieldValue 187 } 188 189 case dictFieldValue: 190 b, reported, err = p.stepValue(b, dictFieldStateEnd) 191 192 case dictFieldStateEnd: 193 b, reported, err = p.stepDictValueEnd(b) 194 195 case arrState: 196 b, reported, err = p.stepArray(b, true) 197 198 case arrStateValue: 199 b, _, err = p.stepValue(b, arrStateNext) 200 201 case arrStateNext: 202 b, reported, err = p.stepArrValueEnd(b) 203 204 case nullState: 205 b, reported, err = p.stepNULL(b) 206 207 case trueState: 208 b, reported, err = p.stepTRUE(b) 209 210 case falseState: 211 b, reported, err = p.stepFALSE(b) 212 213 case stringState: 214 b, reported, err = p.stepString(b) 215 216 case numberState: 217 b, reported, err = p.stepNumber(b) 218 219 default: 220 return 0, false, errFailing 221 } 222 223 reported = reported && len(p.states) == 0 224 } 225 226 consumed := len(orig) - len(b) 227 return consumed, reported, err 228 } 229 230 func (p *Parser) finalize() error { 231 if p.currentState == numberState { 232 err := p.reportNumber(p.literalBuffer, p.isDouble) 233 if err != nil { 234 return err 235 } 236 p.popState() 237 } 238 239 if len(p.states) > 0 && p.currentState != startState { 240 return errIncomplete 241 } 242 243 return nil 244 } 245 246 func (p *Parser) pushState(next state) { 247 if p.currentState != failedState { 248 p.states = append(p.states, p.currentState) 249 } 250 p.currentState = next 251 } 252 253 func (p *Parser) popState() { 254 if len(p.states) == 0 { 255 p.currentState = failedState 256 } else { 257 last := len(p.states) - 1 258 p.currentState = p.states[last] 259 p.states = p.states[:last] 260 } 261 } 262 263 func (p *Parser) stepStart(b []byte) ([]byte, bool, error) { 264 return p.stepValue(b, p.currentState) 265 } 266 267 func (p *Parser) stepValue(b []byte, retState state) ([]byte, bool, error) { 268 b = trimLeft(b) 269 if len(b) == 0 { 270 return b, false, nil 271 } 272 273 p.currentState = retState 274 c := b[0] 275 switch c { 276 case '{': // start dictionary 277 p.pushState(dictState) 278 return b[1:], false, p.visitor.OnObjectStart(-1, structform.AnyType) 279 280 case '[': // start array 281 p.pushState(arrState) 282 return b[1:], false, p.visitor.OnArrayStart(-1, structform.AnyType) 283 284 case 'n': // parse "null" 285 p.pushState(nullState) 286 p.required = 3 287 return p.stepNULL(b[1:]) 288 289 case 'f': // parse "false" 290 p.pushState(falseState) 291 p.required = 4 292 return p.stepFALSE(b[1:]) 293 294 case 't': // parse "true" 295 p.pushState(trueState) 296 p.required = 3 297 return p.stepTRUE(b[1:]) 298 299 case '"': // parse string 300 p.literalBuffer = p.literalBuffer[:0] 301 p.pushState(stringState) 302 p.inEscape = false 303 return p.stepString(b[:]) 304 305 default: 306 // parse number? 307 isNumber := c == '-' || c == '+' || c == '.' || isDigit(c) 308 if !isNumber { 309 return b, false, errUnknownChar 310 } 311 312 p.literalBuffer = p.literalBuffer0[:0] 313 p.pushState(numberState) 314 p.isDouble = false 315 return p.stepNumber(b) 316 } 317 } 318 319 func (p *Parser) stepDict(b []byte, allowEnd bool) ([]byte, bool, error) { 320 b = trimLeft(b) 321 if len(b) == 0 { 322 return b, false, nil 323 } 324 325 c := b[0] 326 switch c { 327 case '}': 328 if !allowEnd { 329 return nil, false, errUnexpectedDictClose 330 } 331 return p.endDict(b) 332 333 case '"': 334 p.currentState = dictFieldState 335 return b, false, nil 336 337 default: 338 return nil, false, errExpectedFieldName 339 } 340 } 341 342 func (p *Parser) stepDictKey(b []byte) ([]byte, error) { 343 ref, allocated, done, b, err := p.doString(b) 344 if done && err == nil { 345 p.currentState = dictFieldValueSep 346 347 if !allocated { 348 err = p.strVisitor.OnKeyRef(ref) 349 } else { 350 err = p.visitor.OnKey(bytes2Str(ref)) 351 } 352 } 353 return b, err 354 } 355 356 func (p *Parser) stepDictValueEnd(b []byte) ([]byte, bool, error) { 357 b = trimLeft(b) 358 if len(b) == 0 { 359 return b, false, nil 360 } 361 362 c := b[0] 363 switch c { 364 case '}': 365 return p.endDict(b) 366 case ',': 367 p.currentState = dictNextFieldState 368 return b[1:], false, nil 369 default: 370 return nil, false, errUnknownChar 371 } 372 } 373 374 func (p *Parser) endDict(b []byte) ([]byte, bool, error) { 375 p.popState() 376 return b[1:], true, p.visitor.OnObjectFinished() 377 } 378 379 func (p *Parser) stepArray(b []byte, allowEnd bool) ([]byte, bool, error) { 380 b = trimLeft(b) 381 if len(b) == 0 { 382 return b, false, nil 383 } 384 385 c := b[0] 386 switch c { 387 case ']': 388 if !allowEnd { 389 return nil, false, errUnexpectedArrClose 390 } 391 return p.endArray(b) 392 } 393 394 p.currentState = arrStateValue 395 return b, false, nil 396 } 397 398 func (p *Parser) stepArrValueEnd(b []byte) ([]byte, bool, error) { 399 b = trimLeft(b) 400 if len(b) == 0 { 401 return b, false, nil 402 } 403 404 c := b[0] 405 switch c { 406 case ']': 407 return p.endArray(b) 408 case ',': 409 p.currentState = arrStateValue 410 return b[1:], false, nil 411 default: 412 return nil, false, errUnknownChar 413 } 414 } 415 416 func (p *Parser) endArray(b []byte) ([]byte, bool, error) { 417 p.popState() 418 return b[1:], true, p.visitor.OnArrayFinished() 419 } 420 421 func (p *Parser) stepString(b []byte) ([]byte, bool, error) { 422 ref, allocated, done, b, err := p.doString(b) 423 if done && err == nil { 424 p.popState() 425 426 if !allocated { 427 err = p.strVisitor.OnStringRef(ref) 428 } else { 429 err = p.visitor.OnString(bytes2Str(ref)) 430 } 431 } 432 return b, done, err 433 } 434 435 func (p *Parser) doString(b []byte) ([]byte, bool, bool, []byte, error) { 436 stop := -1 437 done := false 438 439 delta := 1 440 buf := b 441 atStart := len(p.literalBuffer) == 0 442 if atStart { 443 delta = 2 444 buf = b[1:] 445 } 446 447 for i, c := range buf { 448 if p.inEscape { 449 p.inEscape = false 450 continue 451 } 452 453 if c == '"' { 454 done = true 455 stop = i + delta 456 break 457 } 458 if c == '\\' { 459 p.inEscape = true 460 } 461 } 462 463 if !done { 464 p.literalBuffer = append(p.literalBuffer, b...) 465 return nil, false, false, nil, nil 466 } 467 468 rest := b[stop:] 469 b = b[:stop] 470 if len(p.literalBuffer) > 0 { 471 b = append(p.literalBuffer, b...) 472 p.literalBuffer = b[:0] // reset buffer 473 } 474 475 var err error 476 var allocated bool 477 b = b[1 : len(b)-1] 478 b, allocated, err = p.unquote(b) 479 if err != nil { 480 return nil, false, false, nil, err 481 } 482 483 return b, allocated, done, rest, nil 484 } 485 486 func (p *Parser) unquote(in []byte) ([]byte, bool, error) { 487 if len(in) == 0 { 488 return in, false, nil 489 } 490 491 // Check for unusual characters and escape sequence. If none is found, 492 // return slice as is: 493 i := 0 494 for i < len(in) { 495 c := in[i] 496 if c == '\\' || c == '"' || c < ' ' { 497 break 498 } 499 500 if c < utf8.RuneSelf { 501 i++ 502 continue 503 } 504 505 r, sz := utf8.DecodeRune(in[i:]) 506 if r == utf8.RuneError && sz == 1 { 507 break 508 } 509 510 i += sz 511 } 512 513 // no special character found -> return as is 514 if i == len(in) { 515 return in, false, nil 516 } 517 518 // found escape character (or other unusual character) -> 519 // allocate output buffer (try to use literalBuffer) 520 out := p.literalBuffer[:0] 521 allocated := false 522 utf8Delta := 2 * utf8.UTFMax 523 minLen := len(in) + utf8Delta 524 if cap(out) < minLen { 525 // TODO: is minLen < some upper bound, store in literalBuffer 526 out = make([]byte, minLen) 527 allocated = true 528 } else { 529 out = out[:minLen] 530 } 531 532 // init output buffer 533 written := copy(out, in[:i]) 534 535 for i < len(in) { 536 if written > len(out)-utf8Delta { 537 // out of room -> increase write buffer 538 newLen := len(out) * 2 539 if cap(out) < newLen { 540 tmp := make([]byte, len(out)*2) 541 copy(tmp, out[:written]) 542 out = tmp 543 allocated = true 544 } else { 545 out = out[:newLen] 546 } 547 } 548 549 c := in[i] 550 switch { 551 case c == '\\': 552 i++ 553 if i >= len(in) { 554 return nil, false, errUnquoteInEscape 555 } 556 557 switch in[i] { 558 default: 559 return nil, false, errUnquoteUnknownEscape 560 case '"', '\\', '/', '\'': 561 out[written] = in[i] 562 i++ 563 written++ 564 case 'b': 565 out[written] = '\b' 566 i++ 567 written++ 568 case 'f': 569 out[written] = '\f' 570 i++ 571 written++ 572 case 'n': 573 out[written] = '\n' 574 i++ 575 written++ 576 case 'r': 577 out[written] = '\r' 578 i++ 579 written++ 580 case 't': 581 out[written] = '\t' 582 i++ 583 written++ 584 case 'u': 585 i++ 586 code, err := strconv.ParseUint(string(in[i:i+4]), 16, 64) 587 if err != nil { 588 return nil, false, errUnquoteInvalidUnicode 589 } 590 591 i += 4 592 r := rune(code) 593 if utf16.IsSurrogate(r) { 594 var dec rune = unicode.ReplacementChar 595 596 valid := in[i] == '\\' && in[i+1] == 'u' 597 if valid { 598 code, err := strconv.ParseUint(string(in[i+2:i+6]), 16, 64) 599 if err == nil { 600 dec = utf16.DecodeRune(r, rune(code)) 601 if dec != unicode.ReplacementChar { 602 i += 6 603 } 604 } 605 } 606 607 r = dec 608 } 609 written += utf8.EncodeRune(out[written:], r) 610 } 611 612 case c == '"', c < ' ': 613 return nil, false, errUnquoteInvalidChar 614 615 case c < utf8.RuneSelf: 616 out[written] = c 617 i++ 618 written++ 619 620 default: 621 _, sz := utf8.DecodeRune(in[i:]) 622 i += sz 623 written += copy(out[written:], in[i:i+sz]) 624 } 625 } 626 627 return out[:written], allocated, nil 628 } 629 630 func (p *Parser) stepNumber(b []byte) ([]byte, bool, error) { 631 // search for char in stop-set 632 stop := -1 633 done := false 634 for i, c := range b { 635 isStopChar := c == ' ' || c == '\t' || c == '\f' || c == '\n' || c == '\r' || 636 c == ',' || 637 c == ']' || 638 c == '}' 639 if isStopChar { 640 stop = i 641 done = true 642 break 643 } 644 645 p.isDouble = p.isDouble || c == '.' || c == 'e' || c == 'E' 646 } 647 648 if !done { 649 p.literalBuffer = append(p.literalBuffer, b...) 650 return nil, false, nil 651 } 652 653 rest := b[stop:] 654 b = b[:stop] 655 if len(p.literalBuffer) > 0 { 656 b = append(p.literalBuffer, b...) 657 p.literalBuffer = b[:0] // reset buffer 658 } 659 660 err := p.reportNumber(b, p.isDouble) 661 p.popState() 662 return rest, true, err 663 } 664 665 func (p *Parser) reportNumber(b []byte, isDouble bool) error { 666 // parse number 667 var err error 668 if isDouble { 669 var f float64 670 if f, err = strconv.ParseFloat(bytes2Str(b), 64); err == nil { 671 err = p.visitor.OnFloat64(f) 672 } 673 } else { 674 var i int64 675 if i, err = strconv.ParseInt(bytes2Str(b), 10, 64); err == nil { 676 err = p.visitor.OnInt64(i) 677 } 678 } 679 680 return err 681 } 682 683 func (p *Parser) stepNULL(b []byte) ([]byte, bool, error) { 684 b, done, err := p.stepKind(b, []byte("null"), errExpectedNull) 685 if done { 686 err = p.visitor.OnNil() 687 } 688 return b, done, err 689 } 690 691 func (p *Parser) stepTRUE(b []byte) ([]byte, bool, error) { 692 b, done, err := p.stepKind(b, []byte("true"), errExpectedTrue) 693 if done { 694 err = p.visitor.OnBool(true) 695 } 696 return b, done, err 697 } 698 699 func (p *Parser) stepFALSE(b []byte) ([]byte, bool, error) { 700 b, done, err := p.stepKind(b, []byte("false"), errExpectedFalse) 701 if done { 702 err = p.visitor.OnBool(false) 703 } 704 return b, done, err 705 } 706 707 func (p *Parser) stepKind(b []byte, kind []byte, err error) ([]byte, bool, error) { 708 n := p.required 709 s := kind[len(kind)-n:] 710 done := true 711 if L := len(b); L < n { 712 done = false 713 p.required = n - L 714 n = L 715 s = s[:L] 716 } 717 718 if !bytes.HasPrefix(b, s) { 719 return b, false, err 720 } 721 722 if done { 723 p.popState() 724 } 725 return b[n:], done, nil 726 } 727 728 func isDigit(c byte) bool { 729 return '0' <= c && c <= '9' 730 } 731 732 func trimLeft(b []byte) []byte { 733 for i, c := range b { 734 if !unicode.IsSpace(rune(c)) { 735 return b[i:] 736 } 737 } 738 return nil 739 } 740 741 var whitespace = " \t\r\n"