github.com/urso/go-structform@v0.0.2/json/parse.go (about)

     1  package json
     2  
     3  import (
     4  	"bytes"
     5  	"errors"
     6  	"io"
     7  	"strconv"
     8  	"unicode"
     9  	"unicode/utf16"
    10  	"unicode/utf8"
    11  
    12  	structform "github.com/urso/go-structform"
    13  )
    14  
// Parser is an incremental JSON parser that reports everything it reads to a
// structform.Visitor. Input can be handed over in one piece (Parse,
// ParseString) or in arbitrary chunks through Write; literals that are cut off
// at a chunk boundary are carried over in literalBuffer.
type Parser struct {
	// visitor receives the parsed values; strVisitor is derived from it via
	// structform.MakeStringRefVisitor and is used to report strings and keys
	// as byte slice references.
	visitor    structform.Visitor
	strVisitor structform.StringRefVisitor

	// last fail state
	err error

	// parser state machine
	states       []state // state stack for nested arrays/objects
	currentState state

	// preallocate stack memory for up to 32 nested arrays/objects
	// (init points states at this array; append grows past it when needed)
	statesBuf [32]state

	// literalBuffer accumulates a string or number literal that spans several
	// input chunks; literalBuffer0 is its initial backing storage.
	// isDouble is set while scanning a number once '.', 'e' or 'E' was seen.
	// inEscape is set while scanning a string when the previous byte was a
	// backslash.
	// required is the number of bytes still missing from a null/true/false
	// literal.
	literalBuffer  []byte
	literalBuffer0 [64]byte
	isDouble       bool
	inEscape       bool
	required       int
}
    35  
// Errors reported by the parser. The values are created once so callers can
// compare a returned error against them by identity.
// NOTE(review): some of these are not referenced in the visible part of this
// file; presumably they are used elsewhere in the package — confirm.
var (
	errFailing               = errors.New("JSON parser failed")
	errIncomplete            = errors.New("Incomplete JSON input")
	errUnknownChar           = errors.New("unknown character")
	errQuoteMissing          = errors.New("missing closing quote")
	errExpectColon           = errors.New("expected ':' after map key")
	errUnexpectedDictClose   = errors.New("unexpected '}'")
	errUnexpectedArrClose    = errors.New("unexpected ']'")
	errExpectedDigit         = errors.New("expected a digit")
	errExpectedObject        = errors.New("expected JSON object")
	errExpectedArray         = errors.New("expected JSON array")
	errExpectedFieldName     = errors.New("expected JSON object field name")
	errExpectedInteger       = errors.New("expected integer value")
	errExpectedNull          = errors.New("expected null value")
	errExpectedFalse         = errors.New("expected false value")
	errExpectedTrue          = errors.New("expected true value")
	errExpectedArrayField    = errors.New("expected ']' or ','")
	errUnquoteInEscape       = errors.New("incomplete escape at end of string")
	errUnquoteInvalidChar    = errors.New("invalid character found in string")
	errUnquoteInvalidUnicode = errors.New("unicode escape is no hex number")
	errUnquoteUnknownEscape  = errors.New("unknown escape sequence")
)
    58  
// state identifies what the parser expects next. The current state lives in
// Parser.currentState; states to return to are kept on the Parser.states stack.
type state uint8

//go:generate stringer -type=state
const (
	// failedState is the zero value; popState enters it when the stack is
	// empty and feedUntil refuses to continue from it.
	failedState state = iota
	// startState: a top-level value is expected.
	startState

	// arrState: directly after '[' - a value or ']' is expected.
	arrState
	// arrStateValue: an array element is expected.
	arrStateValue
	// arrStateNext: after an element - ',' or ']' is expected.
	arrStateNext

	// dictState: directly after '{' - a field name or '}' is expected.
	dictState
	// dictFieldState: a field name string is being read.
	dictFieldState
	// dictNextFieldState: after ',' - a field name is expected, '}' is not
	// allowed.
	dictNextFieldState
	// dictFieldValue: a field value is expected.
	dictFieldValue
	// dictFieldValueSep: the ':' between field name and value is expected.
	dictFieldValueSep
	// dictFieldStateEnd: after a field value - ',' or '}' is expected.
	dictFieldStateEnd

	// nullState, trueState, falseState: inside the respective literal.
	nullState
	trueState
	falseState
	// stringState: inside a string value.
	stringState
	// numberState: inside a number literal.
	numberState
)
    83  
    84  func ParseReader(in io.Reader, vs structform.Visitor) (int64, error) {
    85  	p := NewParser(vs)
    86  	i, err := io.Copy(p, in)
    87  	if err == nil {
    88  		err = p.finalize()
    89  	}
    90  	return i, err
    91  }
    92  
    93  func Parse(b []byte, vs structform.Visitor) error {
    94  	return NewParser(vs).Parse(b)
    95  }
    96  
    97  func ParseString(str string, vs structform.Visitor) error {
    98  	return NewParser(vs).ParseString(str)
    99  }
   100  
   101  func NewParser(vs structform.Visitor) *Parser {
   102  	p := &Parser{}
   103  	p.init(vs)
   104  	return p
   105  }
   106  
   107  func (p *Parser) init(vs structform.Visitor) {
   108  	*p = Parser{
   109  		visitor:      vs,
   110  		strVisitor:   structform.MakeStringRefVisitor(vs),
   111  		currentState: startState,
   112  	}
   113  	p.states = p.statesBuf[:0]
   114  	p.literalBuffer = p.literalBuffer0[:0]
   115  }
   116  
   117  func (p *Parser) Parse(b []byte) error {
   118  	p.states = p.states[:0]
   119  	p.literalBuffer = p.literalBuffer[:0]
   120  	p.currentState = startState
   121  
   122  	p.err = p.feed(b)
   123  	if p.err == nil {
   124  		p.err = p.finalize()
   125  	}
   126  	return p.err
   127  }
   128  
   129  func (p *Parser) ParseString(str string) error {
   130  	return p.Parse(str2Bytes(str))
   131  }
   132  
   133  func (p *Parser) Write(b []byte) (int, error) {
   134  	p.err = p.feed(b)
   135  	if p.err != nil {
   136  		return 0, p.err
   137  	}
   138  	return len(b), nil
   139  }
   140  
   141  func (p *Parser) feed(b []byte) error {
   142  	for len(b) > 0 {
   143  		n, _, err := p.feedUntil(b)
   144  		if err != nil {
   145  			return err
   146  		}
   147  
   148  		b = b[n:]
   149  	}
   150  
   151  	return nil
   152  }
   153  
   154  func (p *Parser) feedUntil(b []byte) (int, bool, error) {
   155  	var (
   156  		err      error
   157  		reported bool
   158  		orig     = b
   159  	)
   160  
   161  	for !reported && len(b) > 0 {
   162  		switch p.currentState {
   163  		case failedState:
   164  			if p.err == nil {
   165  				p.err = errors.New("invalid parser state")
   166  			}
   167  			return 0, false, p.err
   168  		case startState:
   169  			b, reported, err = p.stepStart(b)
   170  
   171  		case dictState:
   172  			b, reported, err = p.stepDict(b, true)
   173  
   174  		case dictNextFieldState:
   175  			b, reported, err = p.stepDict(b, false)
   176  
   177  		case dictFieldState:
   178  			b, err = p.stepDictKey(b)
   179  
   180  		case dictFieldValueSep:
   181  			if b = trimLeft(b); len(b) > 0 {
   182  				if b[0] != ':' {
   183  					err = errExpectColon
   184  				}
   185  				b = b[1:]
   186  				p.currentState = dictFieldValue
   187  			}
   188  
   189  		case dictFieldValue:
   190  			b, reported, err = p.stepValue(b, dictFieldStateEnd)
   191  
   192  		case dictFieldStateEnd:
   193  			b, reported, err = p.stepDictValueEnd(b)
   194  
   195  		case arrState:
   196  			b, reported, err = p.stepArray(b, true)
   197  
   198  		case arrStateValue:
   199  			b, _, err = p.stepValue(b, arrStateNext)
   200  
   201  		case arrStateNext:
   202  			b, reported, err = p.stepArrValueEnd(b)
   203  
   204  		case nullState:
   205  			b, reported, err = p.stepNULL(b)
   206  
   207  		case trueState:
   208  			b, reported, err = p.stepTRUE(b)
   209  
   210  		case falseState:
   211  			b, reported, err = p.stepFALSE(b)
   212  
   213  		case stringState:
   214  			b, reported, err = p.stepString(b)
   215  
   216  		case numberState:
   217  			b, reported, err = p.stepNumber(b)
   218  
   219  		default:
   220  			return 0, false, errFailing
   221  		}
   222  
   223  		reported = reported && len(p.states) == 0
   224  	}
   225  
   226  	consumed := len(orig) - len(b)
   227  	return consumed, reported, err
   228  }
   229  
   230  func (p *Parser) finalize() error {
   231  	if p.currentState == numberState {
   232  		err := p.reportNumber(p.literalBuffer, p.isDouble)
   233  		if err != nil {
   234  			return err
   235  		}
   236  		p.popState()
   237  	}
   238  
   239  	if len(p.states) > 0 && p.currentState != startState {
   240  		return errIncomplete
   241  	}
   242  
   243  	return nil
   244  }
   245  
   246  func (p *Parser) pushState(next state) {
   247  	if p.currentState != failedState {
   248  		p.states = append(p.states, p.currentState)
   249  	}
   250  	p.currentState = next
   251  }
   252  
   253  func (p *Parser) popState() {
   254  	if len(p.states) == 0 {
   255  		p.currentState = failedState
   256  	} else {
   257  		last := len(p.states) - 1
   258  		p.currentState = p.states[last]
   259  		p.states = p.states[:last]
   260  	}
   261  }
   262  
   263  func (p *Parser) stepStart(b []byte) ([]byte, bool, error) {
   264  	return p.stepValue(b, p.currentState)
   265  }
   266  
   267  func (p *Parser) stepValue(b []byte, retState state) ([]byte, bool, error) {
   268  	b = trimLeft(b)
   269  	if len(b) == 0 {
   270  		return b, false, nil
   271  	}
   272  
   273  	p.currentState = retState
   274  	c := b[0]
   275  	switch c {
   276  	case '{': // start dictionary
   277  		p.pushState(dictState)
   278  		return b[1:], false, p.visitor.OnObjectStart(-1, structform.AnyType)
   279  
   280  	case '[': // start array
   281  		p.pushState(arrState)
   282  		return b[1:], false, p.visitor.OnArrayStart(-1, structform.AnyType)
   283  
   284  	case 'n': // parse "null"
   285  		p.pushState(nullState)
   286  		p.required = 3
   287  		return p.stepNULL(b[1:])
   288  
   289  	case 'f': // parse "false"
   290  		p.pushState(falseState)
   291  		p.required = 4
   292  		return p.stepFALSE(b[1:])
   293  
   294  	case 't': // parse "true"
   295  		p.pushState(trueState)
   296  		p.required = 3
   297  		return p.stepTRUE(b[1:])
   298  
   299  	case '"': // parse string
   300  		p.literalBuffer = p.literalBuffer[:0]
   301  		p.pushState(stringState)
   302  		p.inEscape = false
   303  		return p.stepString(b[:])
   304  
   305  	default:
   306  		// parse number?
   307  		isNumber := c == '-' || c == '+' || c == '.' || isDigit(c)
   308  		if !isNumber {
   309  			return b, false, errUnknownChar
   310  		}
   311  
   312  		p.literalBuffer = p.literalBuffer0[:0]
   313  		p.pushState(numberState)
   314  		p.isDouble = false
   315  		return p.stepNumber(b)
   316  	}
   317  }
   318  
   319  func (p *Parser) stepDict(b []byte, allowEnd bool) ([]byte, bool, error) {
   320  	b = trimLeft(b)
   321  	if len(b) == 0 {
   322  		return b, false, nil
   323  	}
   324  
   325  	c := b[0]
   326  	switch c {
   327  	case '}':
   328  		if !allowEnd {
   329  			return nil, false, errUnexpectedDictClose
   330  		}
   331  		return p.endDict(b)
   332  
   333  	case '"':
   334  		p.currentState = dictFieldState
   335  		return b, false, nil
   336  
   337  	default:
   338  		return nil, false, errExpectedFieldName
   339  	}
   340  }
   341  
   342  func (p *Parser) stepDictKey(b []byte) ([]byte, error) {
   343  	ref, allocated, done, b, err := p.doString(b)
   344  	if done && err == nil {
   345  		p.currentState = dictFieldValueSep
   346  
   347  		if !allocated {
   348  			err = p.strVisitor.OnKeyRef(ref)
   349  		} else {
   350  			err = p.visitor.OnKey(bytes2Str(ref))
   351  		}
   352  	}
   353  	return b, err
   354  }
   355  
   356  func (p *Parser) stepDictValueEnd(b []byte) ([]byte, bool, error) {
   357  	b = trimLeft(b)
   358  	if len(b) == 0 {
   359  		return b, false, nil
   360  	}
   361  
   362  	c := b[0]
   363  	switch c {
   364  	case '}':
   365  		return p.endDict(b)
   366  	case ',':
   367  		p.currentState = dictNextFieldState
   368  		return b[1:], false, nil
   369  	default:
   370  		return nil, false, errUnknownChar
   371  	}
   372  }
   373  
   374  func (p *Parser) endDict(b []byte) ([]byte, bool, error) {
   375  	p.popState()
   376  	return b[1:], true, p.visitor.OnObjectFinished()
   377  }
   378  
   379  func (p *Parser) stepArray(b []byte, allowEnd bool) ([]byte, bool, error) {
   380  	b = trimLeft(b)
   381  	if len(b) == 0 {
   382  		return b, false, nil
   383  	}
   384  
   385  	c := b[0]
   386  	switch c {
   387  	case ']':
   388  		if !allowEnd {
   389  			return nil, false, errUnexpectedArrClose
   390  		}
   391  		return p.endArray(b)
   392  	}
   393  
   394  	p.currentState = arrStateValue
   395  	return b, false, nil
   396  }
   397  
   398  func (p *Parser) stepArrValueEnd(b []byte) ([]byte, bool, error) {
   399  	b = trimLeft(b)
   400  	if len(b) == 0 {
   401  		return b, false, nil
   402  	}
   403  
   404  	c := b[0]
   405  	switch c {
   406  	case ']':
   407  		return p.endArray(b)
   408  	case ',':
   409  		p.currentState = arrStateValue
   410  		return b[1:], false, nil
   411  	default:
   412  		return nil, false, errUnknownChar
   413  	}
   414  }
   415  
   416  func (p *Parser) endArray(b []byte) ([]byte, bool, error) {
   417  	p.popState()
   418  	return b[1:], true, p.visitor.OnArrayFinished()
   419  }
   420  
   421  func (p *Parser) stepString(b []byte) ([]byte, bool, error) {
   422  	ref, allocated, done, b, err := p.doString(b)
   423  	if done && err == nil {
   424  		p.popState()
   425  
   426  		if !allocated {
   427  			err = p.strVisitor.OnStringRef(ref)
   428  		} else {
   429  			err = p.visitor.OnString(bytes2Str(ref))
   430  		}
   431  	}
   432  	return b, done, err
   433  }
   434  
   435  func (p *Parser) doString(b []byte) ([]byte, bool, bool, []byte, error) {
   436  	stop := -1
   437  	done := false
   438  
   439  	delta := 1
   440  	buf := b
   441  	atStart := len(p.literalBuffer) == 0
   442  	if atStart {
   443  		delta = 2
   444  		buf = b[1:]
   445  	}
   446  
   447  	for i, c := range buf {
   448  		if p.inEscape {
   449  			p.inEscape = false
   450  			continue
   451  		}
   452  
   453  		if c == '"' {
   454  			done = true
   455  			stop = i + delta
   456  			break
   457  		}
   458  		if c == '\\' {
   459  			p.inEscape = true
   460  		}
   461  	}
   462  
   463  	if !done {
   464  		p.literalBuffer = append(p.literalBuffer, b...)
   465  		return nil, false, false, nil, nil
   466  	}
   467  
   468  	rest := b[stop:]
   469  	b = b[:stop]
   470  	if len(p.literalBuffer) > 0 {
   471  		b = append(p.literalBuffer, b...)
   472  		p.literalBuffer = b[:0] // reset buffer
   473  	}
   474  
   475  	var err error
   476  	var allocated bool
   477  	b = b[1 : len(b)-1]
   478  	b, allocated, err = p.unquote(b)
   479  	if err != nil {
   480  		return nil, false, false, nil, err
   481  	}
   482  
   483  	return b, allocated, done, rest, nil
   484  }
   485  
   486  func (p *Parser) unquote(in []byte) ([]byte, bool, error) {
   487  	if len(in) == 0 {
   488  		return in, false, nil
   489  	}
   490  
   491  	// Check for unusual characters and escape sequence. If none is found,
   492  	// return slice as is:
   493  	i := 0
   494  	for i < len(in) {
   495  		c := in[i]
   496  		if c == '\\' || c == '"' || c < ' ' {
   497  			break
   498  		}
   499  
   500  		if c < utf8.RuneSelf {
   501  			i++
   502  			continue
   503  		}
   504  
   505  		r, sz := utf8.DecodeRune(in[i:])
   506  		if r == utf8.RuneError && sz == 1 {
   507  			break
   508  		}
   509  
   510  		i += sz
   511  	}
   512  
   513  	// no special character found -> return as is
   514  	if i == len(in) {
   515  		return in, false, nil
   516  	}
   517  
   518  	// found escape character (or other unusual character) ->
   519  	// allocate output buffer (try to use literalBuffer)
   520  	out := p.literalBuffer[:0]
   521  	allocated := false
   522  	utf8Delta := 2 * utf8.UTFMax
   523  	minLen := len(in) + utf8Delta
   524  	if cap(out) < minLen {
   525  		// TODO: is minLen < some upper bound, store in literalBuffer
   526  		out = make([]byte, minLen)
   527  		allocated = true
   528  	} else {
   529  		out = out[:minLen]
   530  	}
   531  
   532  	// init output buffer
   533  	written := copy(out, in[:i])
   534  
   535  	for i < len(in) {
   536  		if written > len(out)-utf8Delta {
   537  			// out of room -> increase write buffer
   538  			newLen := len(out) * 2
   539  			if cap(out) < newLen {
   540  				tmp := make([]byte, len(out)*2)
   541  				copy(tmp, out[:written])
   542  				out = tmp
   543  				allocated = true
   544  			} else {
   545  				out = out[:newLen]
   546  			}
   547  		}
   548  
   549  		c := in[i]
   550  		switch {
   551  		case c == '\\':
   552  			i++
   553  			if i >= len(in) {
   554  				return nil, false, errUnquoteInEscape
   555  			}
   556  
   557  			switch in[i] {
   558  			default:
   559  				return nil, false, errUnquoteUnknownEscape
   560  			case '"', '\\', '/', '\'':
   561  				out[written] = in[i]
   562  				i++
   563  				written++
   564  			case 'b':
   565  				out[written] = '\b'
   566  				i++
   567  				written++
   568  			case 'f':
   569  				out[written] = '\f'
   570  				i++
   571  				written++
   572  			case 'n':
   573  				out[written] = '\n'
   574  				i++
   575  				written++
   576  			case 'r':
   577  				out[written] = '\r'
   578  				i++
   579  				written++
   580  			case 't':
   581  				out[written] = '\t'
   582  				i++
   583  				written++
   584  			case 'u':
   585  				i++
   586  				code, err := strconv.ParseUint(string(in[i:i+4]), 16, 64)
   587  				if err != nil {
   588  					return nil, false, errUnquoteInvalidUnicode
   589  				}
   590  
   591  				i += 4
   592  				r := rune(code)
   593  				if utf16.IsSurrogate(r) {
   594  					var dec rune = unicode.ReplacementChar
   595  
   596  					valid := in[i] == '\\' && in[i+1] == 'u'
   597  					if valid {
   598  						code, err := strconv.ParseUint(string(in[i+2:i+6]), 16, 64)
   599  						if err == nil {
   600  							dec = utf16.DecodeRune(r, rune(code))
   601  							if dec != unicode.ReplacementChar {
   602  								i += 6
   603  							}
   604  						}
   605  					}
   606  
   607  					r = dec
   608  				}
   609  				written += utf8.EncodeRune(out[written:], r)
   610  			}
   611  
   612  		case c == '"', c < ' ':
   613  			return nil, false, errUnquoteInvalidChar
   614  
   615  		case c < utf8.RuneSelf:
   616  			out[written] = c
   617  			i++
   618  			written++
   619  
   620  		default:
   621  			_, sz := utf8.DecodeRune(in[i:])
   622  			i += sz
   623  			written += copy(out[written:], in[i:i+sz])
   624  		}
   625  	}
   626  
   627  	return out[:written], allocated, nil
   628  }
   629  
   630  func (p *Parser) stepNumber(b []byte) ([]byte, bool, error) {
   631  	// search for char in stop-set
   632  	stop := -1
   633  	done := false
   634  	for i, c := range b {
   635  		isStopChar := c == ' ' || c == '\t' || c == '\f' || c == '\n' || c == '\r' ||
   636  			c == ',' ||
   637  			c == ']' ||
   638  			c == '}'
   639  		if isStopChar {
   640  			stop = i
   641  			done = true
   642  			break
   643  		}
   644  
   645  		p.isDouble = p.isDouble || c == '.' || c == 'e' || c == 'E'
   646  	}
   647  
   648  	if !done {
   649  		p.literalBuffer = append(p.literalBuffer, b...)
   650  		return nil, false, nil
   651  	}
   652  
   653  	rest := b[stop:]
   654  	b = b[:stop]
   655  	if len(p.literalBuffer) > 0 {
   656  		b = append(p.literalBuffer, b...)
   657  		p.literalBuffer = b[:0] // reset buffer
   658  	}
   659  
   660  	err := p.reportNumber(b, p.isDouble)
   661  	p.popState()
   662  	return rest, true, err
   663  }
   664  
   665  func (p *Parser) reportNumber(b []byte, isDouble bool) error {
   666  	// parse number
   667  	var err error
   668  	if isDouble {
   669  		var f float64
   670  		if f, err = strconv.ParseFloat(bytes2Str(b), 64); err == nil {
   671  			err = p.visitor.OnFloat64(f)
   672  		}
   673  	} else {
   674  		var i int64
   675  		if i, err = strconv.ParseInt(bytes2Str(b), 10, 64); err == nil {
   676  			err = p.visitor.OnInt64(i)
   677  		}
   678  	}
   679  
   680  	return err
   681  }
   682  
   683  func (p *Parser) stepNULL(b []byte) ([]byte, bool, error) {
   684  	b, done, err := p.stepKind(b, []byte("null"), errExpectedNull)
   685  	if done {
   686  		err = p.visitor.OnNil()
   687  	}
   688  	return b, done, err
   689  }
   690  
   691  func (p *Parser) stepTRUE(b []byte) ([]byte, bool, error) {
   692  	b, done, err := p.stepKind(b, []byte("true"), errExpectedTrue)
   693  	if done {
   694  		err = p.visitor.OnBool(true)
   695  	}
   696  	return b, done, err
   697  }
   698  
   699  func (p *Parser) stepFALSE(b []byte) ([]byte, bool, error) {
   700  	b, done, err := p.stepKind(b, []byte("false"), errExpectedFalse)
   701  	if done {
   702  		err = p.visitor.OnBool(false)
   703  	}
   704  	return b, done, err
   705  }
   706  
   707  func (p *Parser) stepKind(b []byte, kind []byte, err error) ([]byte, bool, error) {
   708  	n := p.required
   709  	s := kind[len(kind)-n:]
   710  	done := true
   711  	if L := len(b); L < n {
   712  		done = false
   713  		p.required = n - L
   714  		n = L
   715  		s = s[:L]
   716  	}
   717  
   718  	if !bytes.HasPrefix(b, s) {
   719  		return b, false, err
   720  	}
   721  
   722  	if done {
   723  		p.popState()
   724  	}
   725  	return b[n:], done, nil
   726  }
   727  
   728  func isDigit(c byte) bool {
   729  	return '0' <= c && c <= '9'
   730  }
   731  
   732  func trimLeft(b []byte) []byte {
   733  	for i, c := range b {
   734  		if !unicode.IsSpace(rune(c)) {
   735  			return b[i:]
   736  		}
   737  	}
   738  	return nil
   739  }
   740  
// whitespace holds the space, tab, carriage return and line feed characters.
// NOTE(review): nothing in the visible part of this file references this
// variable - confirm whether it is still needed.
var whitespace = " \t\r\n"