github.com/neilotoole/jsoncolor@v0.6.0/parse.go

github.com/neilotoole/jsoncolor@v0.6.0/parse.go (about)

     1  package jsoncolor
     2  
     3  import (
     4  	"bytes"
     5  	"math"
     6  	"reflect"
     7  	"unicode"
     8  	"unicode/utf16"
     9  	"unicode/utf8"
    10  
    11  	"github.com/segmentio/encoding/ascii"
    12  )
    13  
    14  // All spaces characters defined in the json specification.
    15  const (
    16  	sp = ' '
    17  	ht = '\t'
    18  	nl = '\n'
    19  	cr = '\r'
    20  )
    21  
    22  const (
    23  	escape = '\\'
    24  	quote  = '"' //nolint:varcheck // from original code
    25  )
    26  
    27  func skipSpaces(b []byte) []byte {
    28  	b, _ = skipSpacesN(b)
    29  	return b
    30  }
    31  
    32  func skipSpacesN(b []byte) ([]byte, int) {
    33  	for i := range b {
    34  		switch b[i] {
    35  		case sp, ht, nl, cr:
    36  		default:
    37  			return b[i:], i
    38  		}
    39  	}
    40  	return nil, 0
    41  }
    42  
    43  // parseInt parses a decimanl representation of an int64 from b.
    44  //
    45  // The function is equivalent to calling strconv.ParseInt(string(b), 10, 64) but
    46  // it prevents Go from making a memory allocation for converting a byte slice to
    47  // a string (escape analysis fails due to the error returned by strconv.ParseInt).
    48  //
    49  // Because it only works with base 10 the function is also significantly faster
    50  // than strconv.ParseInt.
    51  func parseInt(b []byte, t reflect.Type) (int64, []byte, error) {
    52  	var value int64
    53  	var count int
    54  
    55  	if len(b) == 0 {
    56  		return 0, b, syntaxError(b, "cannot decode integer from an empty input")
    57  	}
    58  
    59  	if b[0] == '-' {
    60  		const max = math.MinInt64
    61  		const lim = max / 10
    62  
    63  		if len(b) == 1 {
    64  			return 0, b, syntaxError(b, "cannot decode integer from '-'")
    65  		}
    66  
    67  		if len(b) > 2 && b[1] == '0' && '0' <= b[2] && b[2] <= '9' {
    68  			return 0, b, syntaxError(b, "invalid leading character '0' in integer")
    69  		}
    70  
    71  		for _, d := range b[1:] {
    72  			if !(d >= '0' && d <= '9') {
    73  				if count == 0 {
    74  					b, err := inputError(b, t)
    75  					return 0, b, err
    76  				}
    77  				break
    78  			}
    79  
    80  			if value < lim {
    81  				return 0, b, unmarshalOverflow(b, t)
    82  			}
    83  
    84  			value *= 10
    85  			x := int64(d - '0')
    86  
    87  			if value < (max + x) {
    88  				return 0, b, unmarshalOverflow(b, t)
    89  			}
    90  
    91  			value -= x
    92  			count++
    93  		}
    94  
    95  		count++
    96  	} else {
    97  		const max = math.MaxInt64
    98  		const lim = max / 10
    99  
   100  		if len(b) > 1 && b[0] == '0' && '0' <= b[1] && b[1] <= '9' {
   101  			return 0, b, syntaxError(b, "invalid leading character '0' in integer")
   102  		}
   103  
   104  		for _, d := range b {
   105  			if !(d >= '0' && d <= '9') {
   106  				if count == 0 {
   107  					b, err := inputError(b, t)
   108  					return 0, b, err
   109  				}
   110  				break
   111  			}
   112  			x := int64(d - '0')
   113  
   114  			if value > lim {
   115  				return 0, b, unmarshalOverflow(b, t)
   116  			}
   117  
   118  			if value *= 10; value > (max - x) {
   119  				return 0, b, unmarshalOverflow(b, t)
   120  			}
   121  
   122  			value += x
   123  			count++
   124  		}
   125  	}
   126  
   127  	if count < len(b) {
   128  		switch b[count] {
   129  		case '.', 'e', 'E': // was this actually a float?
   130  			v, r, err := parseNumber(b)
   131  			if err != nil {
   132  				v, r = b[:count+1], b[count+1:]
   133  			}
   134  			return 0, r, unmarshalTypeError(v, t)
   135  		}
   136  	}
   137  
   138  	return value, b[count:], nil
   139  }
   140  
   141  // parseUint is like parseInt but for unsigned integers.
   142  func parseUint(b []byte, t reflect.Type) (uint64, []byte, error) {
   143  	const max = math.MaxUint64
   144  	const lim = max / 10
   145  
   146  	var value uint64
   147  	var count int
   148  
   149  	if len(b) == 0 {
   150  		return 0, b, syntaxError(b, "cannot decode integer value from an empty input")
   151  	}
   152  
   153  	if len(b) > 1 && b[0] == '0' && '0' <= b[1] && b[1] <= '9' {
   154  		return 0, b, syntaxError(b, "invalid leading character '0' in integer")
   155  	}
   156  
   157  	for _, d := range b {
   158  		if !(d >= '0' && d <= '9') {
   159  			if count == 0 {
   160  				b, err := inputError(b, t)
   161  				return 0, b, err
   162  			}
   163  			break
   164  		}
   165  		x := uint64(d - '0')
   166  
   167  		if value > lim {
   168  			return 0, b, unmarshalOverflow(b, t)
   169  		}
   170  
   171  		if value *= 10; value > (max - x) {
   172  			return 0, b, unmarshalOverflow(b, t)
   173  		}
   174  
   175  		value += x
   176  		count++
   177  	}
   178  
   179  	if count < len(b) {
   180  		switch b[count] {
   181  		case '.', 'e', 'E': // was this actually a float?
   182  			v, r, err := parseNumber(b)
   183  			if err != nil {
   184  				v, r = b[:count+1], b[count+1:]
   185  			}
   186  			return 0, r, unmarshalTypeError(v, t)
   187  		}
   188  	}
   189  
   190  	return value, b[count:], nil
   191  }
   192  
   193  // parseUintHex parses a hexadecimanl representation of a uint64 from b.
   194  //
   195  // The function is equivalent to calling strconv.ParseUint(string(b), 16, 64) but
   196  // it prevents Go from making a memory allocation for converting a byte slice to
   197  // a string (escape analysis fails due to the error returned by strconv.ParseUint).
   198  //
   199  // Because it only works with base 16 the function is also significantly faster
   200  // than strconv.ParseUint.
   201  func parseUintHex(b []byte) (uint64, []byte, error) {
   202  	const max = math.MaxUint64
   203  	const lim = max / 0x10
   204  
   205  	var value uint64
   206  	var count int
   207  
   208  	if len(b) == 0 {
   209  		return 0, b, syntaxError(b, "cannot decode hexadecimal value from an empty input")
   210  	}
   211  
   212  parseLoop:
   213  	for i, d := range b {
   214  		var x uint64
   215  
   216  		switch {
   217  		case d >= '0' && d <= '9':
   218  			x = uint64(d - '0')
   219  
   220  		case d >= 'A' && d <= 'F':
   221  			x = uint64(d-'A') + 0xA
   222  
   223  		case d >= 'a' && d <= 'f':
   224  			x = uint64(d-'a') + 0xA
   225  
   226  		default:
   227  			if i == 0 {
   228  				return 0, b, syntaxError(b, "expected hexadecimal digit but found '%c'", d)
   229  			}
   230  			break parseLoop
   231  		}
   232  
   233  		if value > lim {
   234  			return 0, b, syntaxError(b, "hexadecimal value out of range")
   235  		}
   236  
   237  		if value *= 0x10; value > (max - x) {
   238  			return 0, b, syntaxError(b, "hexadecimal value out of range")
   239  		}
   240  
   241  		value += x
   242  		count++
   243  	}
   244  
   245  	return value, b[count:], nil
   246  }
   247  
   248  func parseNull(b []byte) ([]byte, []byte, error) {
   249  	if hasNullPrefix(b) {
   250  		return b[:4], b[4:], nil
   251  	}
   252  	if len(b) < 4 {
   253  		return nil, b[len(b):], unexpectedEOF(b)
   254  	}
   255  	return nil, b, syntaxError(b, "expected 'null' but found invalid token")
   256  }
   257  
   258  func parseTrue(b []byte) ([]byte, []byte, error) {
   259  	if hasTruePrefix(b) {
   260  		return b[:4], b[4:], nil
   261  	}
   262  	if len(b) < 4 {
   263  		return nil, b[len(b):], unexpectedEOF(b)
   264  	}
   265  	return nil, b, syntaxError(b, "expected 'true' but found invalid token")
   266  }
   267  
   268  func parseFalse(b []byte) ([]byte, []byte, error) {
   269  	if hasFalsePrefix(b) {
   270  		return b[:5], b[5:], nil
   271  	}
   272  	if len(b) < 5 {
   273  		return nil, b[len(b):], unexpectedEOF(b)
   274  	}
   275  	return nil, b, syntaxError(b, "expected 'false' but found invalid token")
   276  }
   277  
   278  func parseNumber(b []byte) (v, r []byte, err error) {
   279  	if len(b) == 0 {
   280  		r, err = b, unexpectedEOF(b)
   281  		return
   282  	}
   283  
   284  	i := 0
   285  	// sign
   286  	if b[i] == '-' {
   287  		i++
   288  	}
   289  
   290  	if i == len(b) {
   291  		r, err = b[i:], syntaxError(b, "missing number value after sign")
   292  		return
   293  	}
   294  
   295  	if b[i] < '0' || b[i] > '9' {
   296  		r, err = b[i:], syntaxError(b, "expected digit but got '%c'", b[i])
   297  		return
   298  	}
   299  
   300  	// integer part
   301  	if b[i] == '0' {
   302  		i++
   303  		if i == len(b) || (b[i] != '.' && b[i] != 'e' && b[i] != 'E') {
   304  			v, r = b[:i], b[i:]
   305  			return
   306  		}
   307  		if '0' <= b[i] && b[i] <= '9' {
   308  			r, err = b[i:], syntaxError(b, "cannot decode number with leading '0' character")
   309  			return
   310  		}
   311  	}
   312  
   313  	for i < len(b) && '0' <= b[i] && b[i] <= '9' {
   314  		i++
   315  	}
   316  
   317  	// decimal part
   318  	if i < len(b) && b[i] == '.' {
   319  		i++
   320  		decimalStart := i
   321  
   322  		for i < len(b) {
   323  			if c := b[i]; !('0' <= c && c <= '9') {
   324  				if i == decimalStart {
   325  					r, err = b[i:], syntaxError(b, "expected digit but found '%c'", c)
   326  					return
   327  				}
   328  				break
   329  			}
   330  			i++
   331  		}
   332  
   333  		if i == decimalStart {
   334  			r, err = b[i:], syntaxError(b, "expected decimal part after '.'")
   335  			return
   336  		}
   337  	}
   338  
   339  	// exponent part
   340  	if i < len(b) && (b[i] == 'e' || b[i] == 'E') {
   341  		i++
   342  
   343  		if i < len(b) {
   344  			if c := b[i]; c == '+' || c == '-' {
   345  				i++
   346  			}
   347  		}
   348  
   349  		if i == len(b) {
   350  			r, err = b[i:], syntaxError(b, "missing exponent in number")
   351  			return
   352  		}
   353  
   354  		exponentStart := i
   355  
   356  		for i < len(b) {
   357  			if c := b[i]; !('0' <= c && c <= '9') {
   358  				if i == exponentStart {
   359  					err = syntaxError(b, "expected digit but found '%c'", c)
   360  					return
   361  				}
   362  				break
   363  			}
   364  			i++
   365  		}
   366  	}
   367  
   368  	v, r = b[:i], b[i:]
   369  	return
   370  }
   371  
   372  func parseUnicode(b []byte) (rune, int, error) {
   373  	if len(b) < 4 {
   374  		return 0, 0, syntaxError(b, "unicode code point must have at least 4 characters")
   375  	}
   376  
   377  	u, r, err := parseUintHex(b[:4])
   378  	if err != nil {
   379  		return 0, 0, syntaxError(b, "parsing unicode code point: %s", err)
   380  	}
   381  
   382  	if len(r) != 0 {
   383  		return 0, 0, syntaxError(b, "invalid unicode code point")
   384  	}
   385  
   386  	return rune(u), 4, nil
   387  }
   388  
   389  func parseStringFast(b []byte) ([]byte, []byte, bool, error) {
   390  	if len(b) < 2 {
   391  		return nil, b[len(b):], false, unexpectedEOF(b)
   392  	}
   393  	if b[0] != '"' {
   394  		return nil, b, false, syntaxError(b, "expected '\"' at the beginning of a string value")
   395  	}
   396  
   397  	n := bytes.IndexByte(b[1:], '"') + 2
   398  	if n <= 1 {
   399  		return nil, b[len(b):], false, syntaxError(b, "missing '\"' at the end of a string value")
   400  	}
   401  	if bytes.IndexByte(b[1:n], '\\') < 0 && ascii.ValidPrint(b[1:n]) {
   402  		return b[:n], b[n:], false, nil
   403  	}
   404  
   405  	for i := 1; i < len(b); i++ {
   406  		switch b[i] {
   407  		case '\\':
   408  			if i++; i < len(b) {
   409  				switch b[i] {
   410  				case '"', '\\', '/', 'n', 'r', 't', 'f', 'b':
   411  				case 'u':
   412  					_, n, err := parseUnicode(b[i+1:])
   413  					if err != nil {
   414  						return nil, b, false, err
   415  					}
   416  					i += n
   417  				default:
   418  					return nil, b, false, syntaxError(b, "invalid character '%c' in string escape code", b[i])
   419  				}
   420  			}
   421  
   422  		case '"':
   423  			return b[:i+1], b[i+1:], true, nil
   424  
   425  		default:
   426  			if b[i] < 0x20 {
   427  				return nil, b, false, syntaxError(b, "invalid character '%c' in string escape code", b[i])
   428  			}
   429  		}
   430  	}
   431  
   432  	return nil, b[len(b):], false, syntaxError(b, "missing '\"' at the end of a string value")
   433  }
   434  
   435  func parseString(b []byte) ([]byte, []byte, error) {
   436  	s, b, _, err := parseStringFast(b)
   437  	return s, b, err
   438  }
   439  
   440  func parseStringUnquote(b []byte, r []byte) ([]byte, []byte, bool, error) {
   441  	s, b, escaped, err := parseStringFast(b)
   442  	if err != nil {
   443  		return s, b, false, err
   444  	}
   445  
   446  	s = s[1 : len(s)-1] // trim the quotes
   447  
   448  	if !escaped {
   449  		return s, b, false, nil
   450  	}
   451  
   452  	if r == nil {
   453  		r = make([]byte, 0, len(s))
   454  	}
   455  
   456  	for len(s) != 0 {
   457  		i := bytes.IndexByte(s, '\\')
   458  
   459  		if i < 0 {
   460  			r = appendCoerceInvalidUTF8(r, s)
   461  			break
   462  		}
   463  
   464  		r = appendCoerceInvalidUTF8(r, s[:i])
   465  		s = s[i+1:]
   466  
   467  		c := s[0]
   468  		switch c {
   469  		case '"', '\\', '/':
   470  			// simple escaped character
   471  		case 'n':
   472  			c = '\n'
   473  
   474  		case 'r':
   475  			c = '\r'
   476  
   477  		case 't':
   478  			c = '\t'
   479  
   480  		case 'b':
   481  			c = '\b'
   482  
   483  		case 'f':
   484  			c = '\f'
   485  
   486  		case 'u':
   487  			s = s[1:]
   488  
   489  			r1, n1, err := parseUnicode(s)
   490  			if err != nil {
   491  				return r, b, true, err
   492  			}
   493  			s = s[n1:]
   494  
   495  			if utf16.IsSurrogate(r1) {
   496  				if !hasPrefix(s, `\u`) {
   497  					r1 = unicode.ReplacementChar
   498  				} else {
   499  					r2, n2, err := parseUnicode(s[2:])
   500  					if err != nil {
   501  						return r, b, true, err
   502  					}
   503  					if r1 = utf16.DecodeRune(r1, r2); r1 != unicode.ReplacementChar {
   504  						s = s[2+n2:]
   505  					}
   506  				}
   507  			}
   508  
   509  			r = appendRune(r, r1)
   510  			continue
   511  
   512  		default: // not sure what this escape sequence is
   513  			return r, b, false, syntaxError(s, "invalid character '%c' in string escape code", c)
   514  		}
   515  
   516  		r = append(r, c)
   517  		s = s[1:]
   518  	}
   519  
   520  	return r, b, true, nil
   521  }
   522  
   523  func appendRune(b []byte, r rune) []byte {
   524  	n := len(b)
   525  	b = append(b, 0, 0, 0, 0)
   526  	return b[:n+utf8.EncodeRune(b[n:], r)]
   527  }
   528  
   529  func appendCoerceInvalidUTF8(b []byte, s []byte) []byte {
   530  	c := [4]byte{}
   531  
   532  	for _, r := range string(s) {
   533  		b = append(b, c[:utf8.EncodeRune(c[:], r)]...)
   534  	}
   535  
   536  	return b
   537  }
   538  
   539  func parseObject(b []byte) ([]byte, []byte, error) {
   540  	if len(b) < 2 {
   541  		return nil, b[len(b):], unexpectedEOF(b)
   542  	}
   543  
   544  	if b[0] != '{' {
   545  		return nil, b, syntaxError(b, "expected '{' at the beginning of an object value")
   546  	}
   547  
   548  	var err error
   549  	var a = b
   550  	var n = len(b)
   551  	var i = 0
   552  
   553  	b = b[1:]
   554  	for {
   555  		b = skipSpaces(b)
   556  
   557  		if len(b) == 0 {
   558  			return nil, b, syntaxError(b, "cannot decode object from empty input")
   559  		}
   560  
   561  		if b[0] == '}' {
   562  			j := (n - len(b)) + 1
   563  			return a[:j], a[j:], nil
   564  		}
   565  
   566  		if i != 0 {
   567  			if len(b) == 0 {
   568  				return nil, b, syntaxError(b, "unexpected EOF after object field value")
   569  			}
   570  			if b[0] != ',' {
   571  				return nil, b, syntaxError(b, "expected ',' after object field value but found '%c'", b[0])
   572  			}
   573  			b = skipSpaces(b[1:])
   574  			if len(b) == 0 {
   575  				return nil, b, unexpectedEOF(b)
   576  			}
   577  			if b[0] == '}' {
   578  				return nil, b, syntaxError(b, "unexpected trailing comma after object field")
   579  			}
   580  		}
   581  
   582  		_, b, err = parseString(b)
   583  		if err != nil {
   584  			return nil, b, err
   585  		}
   586  		b = skipSpaces(b)
   587  
   588  		if len(b) == 0 {
   589  			return nil, b, syntaxError(b, "unexpected EOF after object field key")
   590  		}
   591  		if b[0] != ':' {
   592  			return nil, b, syntaxError(b, "expected ':' after object field key but found '%c'", b[0])
   593  		}
   594  		b = skipSpaces(b[1:])
   595  
   596  		_, b, err = parseValue(b)
   597  		if err != nil {
   598  			return nil, b, err
   599  		}
   600  
   601  		i++
   602  	}
   603  }
   604  
   605  func parseArray(b []byte) ([]byte, []byte, error) {
   606  	if len(b) < 2 {
   607  		return nil, b[len(b):], unexpectedEOF(b)
   608  	}
   609  
   610  	if b[0] != '[' {
   611  		return nil, b, syntaxError(b, "expected '[' at the beginning of array value")
   612  	}
   613  
   614  	var err error
   615  	var a = b
   616  	var n = len(b)
   617  	var i = 0
   618  
   619  	b = b[1:]
   620  	for {
   621  		b = skipSpaces(b)
   622  
   623  		if len(b) == 0 {
   624  			return nil, b, syntaxError(b, "missing closing ']' after array value")
   625  		}
   626  
   627  		if b[0] == ']' {
   628  			j := (n - len(b)) + 1
   629  			return a[:j], a[j:], nil
   630  		}
   631  
   632  		if i != 0 {
   633  			if len(b) == 0 {
   634  				return nil, b, syntaxError(b, "unexpected EOF after array element")
   635  			}
   636  			if b[0] != ',' {
   637  				return nil, b, syntaxError(b, "expected ',' after array element but found '%c'", b[0])
   638  			}
   639  			b = skipSpaces(b[1:])
   640  			if len(b) == 0 {
   641  				return nil, b, unexpectedEOF(b)
   642  			}
   643  			if b[0] == ']' {
   644  				return nil, b, syntaxError(b, "unexpected trailing comma after object field")
   645  			}
   646  		}
   647  
   648  		_, b, err = parseValue(b)
   649  		if err != nil {
   650  			return nil, b, err
   651  		}
   652  
   653  		i++
   654  	}
   655  }
   656  
   657  func parseValue(b []byte) ([]byte, []byte, error) {
   658  	if len(b) != 0 {
   659  		switch b[0] {
   660  		case '{':
   661  			return parseObject(b)
   662  		case '[':
   663  			return parseArray(b)
   664  		case '"':
   665  			return parseString(b)
   666  		case 'n':
   667  			return parseNull(b)
   668  		case 't':
   669  			return parseTrue(b)
   670  		case 'f':
   671  			return parseFalse(b)
   672  		case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   673  			return parseNumber(b)
   674  		default:
   675  			return nil, b, syntaxError(b, "invalid character '%c' looking for beginning of value", b[0])
   676  		}
   677  	}
   678  	return nil, b, syntaxError(b, "unexpected end of JSON input")
   679  }
   680  
   681  func hasNullPrefix(b []byte) bool {
   682  	return len(b) >= 4 && string(b[:4]) == "null"
   683  }
   684  
   685  func hasTruePrefix(b []byte) bool {
   686  	return len(b) >= 4 && string(b[:4]) == "true"
   687  }
   688  
   689  func hasFalsePrefix(b []byte) bool {
   690  	return len(b) >= 5 && string(b[:5]) == "false"
   691  }
   692  
   693  func hasPrefix(b []byte, s string) bool {
   694  	return len(b) >= len(s) && s == string(b[:len(s)])
   695  }
   696  
   697  func hasLeadingSign(b []byte) bool {
   698  	return len(b) > 0 && (b[0] == '+' || b[0] == '-')
   699  }
   700  
   701  func hasLeadingZeroes(b []byte) bool {
   702  	if hasLeadingSign(b) {
   703  		b = b[1:]
   704  	}
   705  	return len(b) > 1 && b[0] == '0' && '0' <= b[1] && b[1] <= '9'
   706  }
   707  
   708  func appendToLower(b, s []byte) []byte {
   709  	if ascii.Valid(s) { // fast path for ascii strings
   710  		i := 0
   711  
   712  		for j := range s {
   713  			c := s[j]
   714  
   715  			if 'A' <= c && c <= 'Z' {
   716  				b = append(b, s[i:j]...)
   717  				b = append(b, c+('a'-'A'))
   718  				i = j + 1
   719  			}
   720  		}
   721  
   722  		return append(b, s[i:]...)
   723  	}
   724  
   725  	for _, r := range string(s) {
   726  		b = appendRune(b, foldRune(r))
   727  	}
   728  
   729  	return b
   730  }
   731  
   732  func foldRune(r rune) rune {
   733  	if r = unicode.SimpleFold(r); 'A' <= r && r <= 'Z' {
   734  		r = r + ('a' - 'A')
   735  	}
   736  	return r
   737  }