github.com/3JoB/go-json@v0.10.4/internal/decoder/string.go (about)

     1  package decoder
     2  
     3  import (
     4  	"fmt"
     5  	"unicode/utf8"
     6  	"unsafe"
     7  
     8  	"github.com/3JoB/go-reflect"
     9  
    10  	"github.com/3JoB/go-json/internal/errors"
    11  )
    12  
    13  type stringDecoder struct {
    14  	structName string
    15  	fieldName  string
    16  }
    17  
    18  func newStringDecoder(structName, fieldName string) *stringDecoder {
    19  	return &stringDecoder{
    20  		structName: structName,
    21  		fieldName:  fieldName,
    22  	}
    23  }
    24  
    25  func (d *stringDecoder) errUnmarshalType(typeName string, offset int64) *errors.UnmarshalTypeError {
    26  	return &errors.UnmarshalTypeError{
    27  		Value:  typeName,
    28  		Type:   reflect.TypeOf(""),
    29  		Offset: offset,
    30  		Struct: d.structName,
    31  		Field:  d.fieldName,
    32  	}
    33  }
    34  
    35  func (d *stringDecoder) DecodeStream(s *Stream, depth int64, p unsafe.Pointer) error {
    36  	bytes, err := d.decodeStreamByte(s)
    37  	if err != nil {
    38  		return err
    39  	}
    40  	if bytes == nil {
    41  		return nil
    42  	}
    43  	**(**string)(unsafe.Pointer(&p)) = *(*string)(unsafe.Pointer(&bytes))
    44  	s.reset()
    45  	return nil
    46  }
    47  
    48  func (d *stringDecoder) Decode(ctx *RuntimeContext, cursor, depth int64, p unsafe.Pointer) (int64, error) {
    49  	bytes, c, err := d.decodeByte(ctx.Buf, cursor)
    50  	if err != nil {
    51  		return 0, err
    52  	}
    53  	if bytes == nil {
    54  		return c, nil
    55  	}
    56  	cursor = c
    57  	**(**string)(unsafe.Pointer(&p)) = *(*string)(unsafe.Pointer(&bytes))
    58  	return cursor, nil
    59  }
    60  
    61  func (d *stringDecoder) DecodePath(ctx *RuntimeContext, cursor, depth int64) ([][]byte, int64, error) {
    62  	bytes, c, err := d.decodeByte(ctx.Buf, cursor)
    63  	if err != nil {
    64  		return nil, 0, err
    65  	}
    66  	if bytes == nil {
    67  		return [][]byte{nullbytes}, c, nil
    68  	}
    69  	return [][]byte{bytes}, c, nil
    70  }
    71  
    72  var (
    73  	hexToInt = [256]int{
    74  		'0': 0,
    75  		'1': 1,
    76  		'2': 2,
    77  		'3': 3,
    78  		'4': 4,
    79  		'5': 5,
    80  		'6': 6,
    81  		'7': 7,
    82  		'8': 8,
    83  		'9': 9,
    84  		'A': 10,
    85  		'B': 11,
    86  		'C': 12,
    87  		'D': 13,
    88  		'E': 14,
    89  		'F': 15,
    90  		'a': 10,
    91  		'b': 11,
    92  		'c': 12,
    93  		'd': 13,
    94  		'e': 14,
    95  		'f': 15,
    96  	}
    97  )
    98  
    99  func unicodeToRune(code []byte) rune {
   100  	var r rune
   101  	for i := 0; i < len(code); i++ {
   102  		r = r*16 + rune(hexToInt[code[i]])
   103  	}
   104  	return r
   105  }
   106  
   107  var isHex = [256]int8{
   108  	'0': 1,
   109  	'1': 1,
   110  	'2': 2,
   111  	'3': 3,
   112  	'4': 4,
   113  	'5': 5,
   114  	'6': 6,
   115  	'7': 7,
   116  	'8': 8,
   117  	'9': 9,
   118  	'A': 10,
   119  	'B': 11,
   120  	'C': 12,
   121  	'D': 13,
   122  	'E': 14,
   123  	'F': 15,
   124  	'a': 10,
   125  	'b': 11,
   126  	'c': 12,
   127  	'd': 13,
   128  	'e': 14,
   129  	'f': 15,
   130  }
   131  
   132  var utf8First = [256]uint8{
   133  	0xC2: 0x02, 0xC3: 0x02, 0xC4: 0x02, 0xC5: 0x02, 0xC6: 0x02, 0xC7: 0x02, 0xC8: 0x02, 0xC9: 0x02, 0xCA: 0x02, 0xCB: 0x02, 0xCC: 0x02, 0xCD: 0x02, 0xCE: 0x02, 0xCF: 0x02, 0xD0: 0x02, 0xD1: 0x02, 0xD2: 0x02, 0xD3: 0x02, 0xD4: 0x02, 0xD5: 0x02, 0xD6: 0x02, 0xD7: 0x02, 0xD8: 0x02, 0xD9: 0x02, 0xDA: 0x02, 0xDB: 0x02, 0xDC: 0x02, 0xDD: 0x02, 0xDE: 0x02, 0xDF: 0x02,
   134  	0xE0: 0x13,
   135  	0xE1: 0x03, 0xE2: 0x03, 0xE3: 0x03, 0xE4: 0x03, 0xE5: 0x03, 0xE6: 0x03, 0xE7: 0x03, 0xE8: 0x03, 0xE9: 0x03, 0xEA: 0x03, 0xEB: 0x03, 0xEC: 0x03, 0xEE: 0x03, 0xEF: 0x3,
   136  	0xED: 0x23,
   137  	0xF0: 0x34,
   138  	0xF1: 0x04, 0xF2: 0x04, 0xF3: 0x04,
   139  	0xF4: 0x44,
   140  }
   141  
   142  var utf8AcceptRanges = [16]struct{ lo, hi uint8 }{
   143  	0: {lo: 0x80, hi: 0xBF},
   144  	1: {lo: 0xA0, hi: 0xBF},
   145  	2: {lo: 0x80, hi: 0x9F},
   146  	3: {lo: 0x90, hi: 0xBF},
   147  	4: {lo: 0x80, hi: 0x8F},
   148  }
   149  
   150  var unescapeMap = [256]byte{
   151  	'"':  '"',
   152  	'\\': '\\',
   153  	'/':  '/',
   154  	'b':  '\b',
   155  	'f':  '\f',
   156  	'n':  '\n',
   157  	'r':  '\r',
   158  	't':  '\t',
   159  	'u':  'u',
   160  }
   161  
   162  const (
   163  	inStringInvalidUTF8 = 0
   164  	inStringASCII       = 1
   165  	inStringSentinel    = 2
   166  	inStringStartEscape = 3
   167  	inStringEnd         = 4
   168  	inStringStartMB     = 5
   169  )
   170  
   171  var inStringTypes [256]uint8
   172  
   173  func init() {
   174  	for i := range inStringTypes {
   175  		inStringTypes[i] = inStringInvalidUTF8
   176  	}
   177  	for i := 0; i < 0x80; i++ {
   178  		inStringTypes[i] = inStringASCII
   179  	}
   180  	inStringTypes[nul] = inStringSentinel
   181  	inStringTypes['\\'] = inStringStartEscape
   182  	inStringTypes['"'] = inStringEnd
   183  	for i := 0xC2; i <= 0xF4; i++ {
   184  		inStringTypes[i] = inStringStartMB
   185  	}
   186  }
   187  
   188  func stringBytes(s *Stream) ([]byte, int64, error) {
   189  	_, cursor, p := s.stat()
   190  	cursor++ // skip double quote char
   191  
   192  	start := cursor
   193  	dst := cursor
   194  	inplace := true
   195  	first := int64(-1)
   196  	for {
   197  		c := char(p, cursor)
   198  		if t := inStringTypes[c]; t == inStringASCII {
   199  			cursor++
   200  			dst++
   201  			continue
   202  		} else if t == inStringStartMB {
   203  			x := utf8First[c]
   204  			sz := int64(x & 7)
   205  			if s.syncBufptr(s.requires(cursor, sz), &p) < 0 {
   206  				goto RuneError
   207  			}
   208  			accept := utf8AcceptRanges[x>>4]
   209  			c1 := char(p, cursor+1)
   210  			if c1 < accept.lo || accept.hi < c1 {
   211  				goto RuneError
   212  			}
   213  			if sz > 2 {
   214  				c2 := char(p, cursor+2)
   215  				if c2 < 0x80 || c2 > 0xBF {
   216  					goto RuneError
   217  				}
   218  			}
   219  			if sz > 3 {
   220  				c3 := char(p, cursor+3)
   221  				if c3 < 0x80 || c3 > 0xBF {
   222  					goto RuneError
   223  				}
   224  			}
   225  			cursor += sz
   226  			dst += sz
   227  			continue
   228  		} else if t == inStringStartEscape {
   229  			if first < 0 {
   230  				first = cursor
   231  			}
   232  			cursor++
   233  			if s.syncBufptr(s.requires(cursor, 1), &p) < 0 {
   234  				goto ERROR
   235  			}
   236  			ec := char(p, cursor)
   237  			if unescapeMap[ec] == 0 {
   238  				return nil, cursor, errors.ErrInvalidCharacter(char(p, cursor), "in string escape code", cursor)
   239  			}
   240  			if ec != 'u' {
   241  				cursor++
   242  				dst++
   243  				continue
   244  			}
   245  			if s.syncBufptr(s.requires(cursor, 5), &p) < 0 {
   246  				goto ERROR
   247  			}
   248  			c1, c2, c3, c4 := char4(p, cursor+1)
   249  			if o := checkHex(c1, c2, c3, c4); o > 0 {
   250  				return nil, cursor + o, errors.ErrSyntax(fmt.Sprintf("json: invalid character %c in \\u hexadecimal character escape", char(p, cursor+o)), cursor+o)
   251  			}
   252  			r := decodeHexRune(c1, c2, c3, c4)
   253  			*ptrUint16(p, cursor+1) = uint16(r)
   254  		NextUnicode:
   255  			if r >= 0xD800 && r < 0xE000 {
   256  				const runeError = 65533
   257  				if s.syncBufptr(s.requires(cursor, 5+6), &p) >= 0 && char(p, cursor+5) == '\\' && char(p, cursor+6) == 'u' {
   258  					cursor2 := cursor + 6
   259  					c1, c2, c3, c4 := char4(p, cursor2+1)
   260  					if o := checkHex(c1, c2, c3, c4); o > 0 {
   261  						return nil, cursor2 + o, errors.ErrSyntax(fmt.Sprintf("json: invalid character %c in \\u hexadecimal character escape", char(p, cursor2+o)), cursor2+o)
   262  					}
   263  					r2 := decodeHexRune(c1, c2, c3, c4)
   264  					*ptrUint16(p, cursor2+1) = uint16(r2)
   265  					if r2 < 0xDC00 || r2 >= 0xE000 {
   266  						*ptrUint16(p, cursor+1) = runeError
   267  						dst += 3
   268  						cursor = cursor2
   269  						r = r2
   270  						goto NextUnicode
   271  					}
   272  					dst += 4
   273  					cursor = cursor2 + 5
   274  				} else {
   275  					*ptrUint16(p, cursor+1) = runeError
   276  					dst += 3
   277  					cursor += 5
   278  				}
   279  			} else {
   280  				cursor += 5
   281  				dst += runeLen(r)
   282  			}
   283  			continue
   284  		} else if t == inStringEnd {
   285  			if first < 0 {
   286  				return s.buf[start:cursor], cursor + 1, nil
   287  			}
   288  			if inplace {
   289  				src := unsafeAdd(p, int(first))
   290  				unescapeString(src, src)
   291  				return s.buf[start:dst], cursor + 1, nil
   292  			}
   293  			src := unsafeAdd(p, int(start))
   294  			b := make([]byte, dst-start+1)
   295  			data := (*sliceHeader)(unsafe.Pointer(&b)).data
   296  			unescapeString(src, data)
   297  			return b[:len(b)-1], cursor + 1, nil
   298  		} else if t == inStringSentinel {
   299  			if s.read() {
   300  				p = s.bufptr()
   301  				continue
   302  			}
   303  			goto ERROR
   304  		}
   305  	RuneError:
   306  		if first < 0 {
   307  			first = cursor
   308  		}
   309  		*(*byte)(unsafeAdd(p, int(cursor))) = nul
   310  		cursor++
   311  		dst += 3
   312  		if cursor < dst {
   313  			inplace = false
   314  		}
   315  	}
   316  ERROR:
   317  	return nil, s.length, errors.ErrUnexpectedEndOfJSON("string", s.offset+s.length)
   318  }
   319  
   320  func (d *stringDecoder) decodeStreamByte(s *Stream) ([]byte, error) {
   321  	for {
   322  		switch s.char() {
   323  		case ' ', '\n', '\t', '\r':
   324  			s.cursor++
   325  			continue
   326  		case '[':
   327  			return nil, d.errUnmarshalType("array", s.totalOffset())
   328  		case '{':
   329  			return nil, d.errUnmarshalType("object", s.totalOffset())
   330  		case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   331  			return nil, d.errUnmarshalType("number", s.totalOffset())
   332  		case '"':
   333  			b, cursor, err := stringBytes(s)
   334  			s.cursor = cursor
   335  			if err != nil {
   336  				return nil, err
   337  			}
   338  			return b, nil
   339  		case 'n':
   340  			if err := nullBytes(s); err != nil {
   341  				return nil, err
   342  			}
   343  			return nil, nil
   344  		case nul:
   345  			if s.read() {
   346  				continue
   347  			}
   348  		}
   349  		break
   350  	}
   351  	return nil, errors.ErrInvalidBeginningOfValue(s.char(), s.totalOffset())
   352  }
   353  
   354  func (d *stringDecoder) decodeByte(buf []byte, cursor int64) ([]byte, int64, error) {
   355  	for {
   356  		switch buf[cursor] {
   357  		case ' ', '\n', '\t', '\r':
   358  			cursor++
   359  		case '[':
   360  			return nil, cursor, d.errUnmarshalType("array", cursor)
   361  		case '{':
   362  			return nil, cursor, d.errUnmarshalType("object", cursor)
   363  		case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   364  			return nil, cursor, d.errUnmarshalType("number", cursor)
   365  		case '"':
   366  			s := bytesStream{buf: buf, length: int64(len(buf))}
   367  			cursor++
   368  			p := (*sliceHeader)(unsafe.Pointer(&buf)).data
   369  
   370  			start := cursor
   371  			dst := cursor
   372  			inplace := true
   373  			first := int64(-1)
   374  			for {
   375  				c := char(p, cursor)
   376  				if t := inStringTypes[c]; t == inStringASCII {
   377  					cursor++
   378  					dst++
   379  					continue
   380  				} else if t == inStringStartMB {
   381  					x := utf8First[c]
   382  					sz := int64(x & 7)
   383  					if s.syncBufptr(s.requires(cursor, sz), &p) < 0 {
   384  						goto RuneError
   385  					}
   386  					accept := utf8AcceptRanges[x>>4]
   387  					c1 := char(p, cursor+1)
   388  					if c1 < accept.lo || accept.hi < c1 {
   389  						goto RuneError
   390  					}
   391  					if sz > 2 {
   392  						c2 := char(p, cursor+2)
   393  						if c2 < 0x80 || c2 > 0xBF {
   394  							goto RuneError
   395  						}
   396  					}
   397  					if sz > 3 {
   398  						c3 := char(p, cursor+3)
   399  						if c3 < 0x80 || c3 > 0xBF {
   400  							goto RuneError
   401  						}
   402  					}
   403  					cursor += sz
   404  					dst += sz
   405  					continue
   406  				} else if t == inStringStartEscape {
   407  					if first < 0 {
   408  						first = cursor
   409  					}
   410  					cursor++
   411  					if s.syncBufptr(s.requires(cursor, 1), &p) < 0 {
   412  						goto ERROR
   413  					}
   414  					ec := char(p, cursor)
   415  					if unescapeMap[ec] == 0 {
   416  						return nil, cursor, errors.ErrInvalidCharacter(char(p, cursor), "in string escape code", cursor)
   417  					}
   418  					if ec != 'u' {
   419  						cursor++
   420  						dst++
   421  						continue
   422  					}
   423  					if s.syncBufptr(s.requires(cursor, 5), &p) < 0 {
   424  						goto ERROR
   425  					}
   426  					c1, c2, c3, c4 := char4(p, cursor+1)
   427  					if o := checkHex(c1, c2, c3, c4); o > 0 {
   428  						return nil, cursor + o, errors.ErrSyntax(fmt.Sprintf("json: invalid character %c in \\u hexadecimal character escape", char(p, cursor+o)), cursor+o)
   429  					}
   430  					r := decodeHexRune(c1, c2, c3, c4)
   431  					*ptrUint16(p, cursor+1) = uint16(r)
   432  				NextUnicode:
   433  					if r >= 0xD800 && r < 0xE000 {
   434  						const runeError = 65533
   435  						if s.syncBufptr(s.requires(cursor, 5+6), &p) >= 0 && char(p, cursor+5) == '\\' && char(p, cursor+6) == 'u' {
   436  							cursor2 := cursor + 6
   437  							c1, c2, c3, c4 := char4(p, cursor2+1)
   438  							if o := checkHex(c1, c2, c3, c4); o > 0 {
   439  								return nil, cursor2 + o, errors.ErrSyntax(fmt.Sprintf("json: invalid character %c in \\u hexadecimal character escape", char(p, cursor2+o)), cursor2+o)
   440  							}
   441  							r2 := decodeHexRune(c1, c2, c3, c4)
   442  							*ptrUint16(p, cursor2+1) = uint16(r2)
   443  							if r2 < 0xDC00 || r2 >= 0xE000 {
   444  								*ptrUint16(p, cursor+1) = runeError
   445  								dst += 3
   446  								cursor = cursor2
   447  								r = r2
   448  								goto NextUnicode
   449  							}
   450  							dst += 4
   451  							cursor = cursor2 + 5
   452  						} else {
   453  							*ptrUint16(p, cursor+1) = runeError
   454  							dst += 3
   455  							cursor += 5
   456  						}
   457  					} else {
   458  						cursor += 5
   459  						dst += runeLen(r)
   460  					}
   461  					continue
   462  				} else if t == inStringEnd {
   463  					if first < 0 {
   464  						return s.buf[start:cursor], cursor + 1, nil
   465  					}
   466  					if inplace {
   467  						src := unsafeAdd(p, int(first))
   468  						unescapeString(src, src)
   469  						return s.buf[start:dst], cursor + 1, nil
   470  					}
   471  					src := unsafeAdd(p, int(start))
   472  					b := make([]byte, dst-start+1)
   473  					data := (*sliceHeader)(unsafe.Pointer(&b)).data
   474  					unescapeString(src, data)
   475  					return b[:len(b)-1], cursor + 1, nil
   476  				} else if t == inStringSentinel {
   477  					if s.read() {
   478  						p = s.bufptr()
   479  						continue
   480  					}
   481  					goto ERROR
   482  				}
   483  			RuneError:
   484  				if first < 0 {
   485  					first = cursor
   486  				}
   487  				*(*byte)(unsafeAdd(p, int(cursor))) = nul
   488  				cursor++
   489  				dst += 3
   490  				if cursor < dst {
   491  					inplace = false
   492  				}
   493  			}
   494  		ERROR:
   495  			return nil, s.length, errors.ErrUnexpectedEndOfJSON("string", s.offset+s.length)
   496  		case nul:
   497  			return nil, cursor, errors.ErrUnexpectedEndOfJSON("string", cursor)
   498  		case 'n':
   499  			if err := validateNull(buf, cursor); err != nil {
   500  				return nil, cursor, err
   501  			}
   502  			return nil, cursor + 4, nil
   503  		default:
   504  			return nil, cursor, errors.ErrInvalidBeginningOfValue(buf[cursor], cursor)
   505  		}
   506  	}
   507  }
   508  
// unsafeAdd returns ptr advanced by offset bytes.
//
// The uintptr arithmetic and the conversion back to unsafe.Pointer are kept
// in a single expression, as the unsafe.Pointer rules require.
func unsafeAdd(ptr unsafe.Pointer, offset int) unsafe.Pointer {
	return unsafe.Pointer(uintptr(ptr) + uintptr(offset))
}
   512  
   513  func unescapeString(src, dst unsafe.Pointer) {
   514  	for {
   515  		c := char(src, 0)
   516  		switch c {
   517  		case '"':
   518  			return
   519  		case '\\':
   520  			escapeChar := char(src, 1)
   521  			if escapeChar != 'u' {
   522  				*(*byte)(dst) = unescapeMap[escapeChar]
   523  				src = unsafeAdd(src, 2)
   524  				dst = unsafeAdd(dst, 1)
   525  			} else {
   526  				code := rune(*ptrUint16(src, 2))
   527  				if code >= 0xD800 && code < 0xDC00 {
   528  					lo := rune(*ptrUint16(src, 8))
   529  					code = (code-0xD800)<<10 | (lo - 0xDC00) + 0x10000
   530  					src = unsafeAdd(src, 6)
   531  				}
   532  				var b [utf8.UTFMax]byte
   533  				n := utf8.EncodeRune(b[:], code)
   534  				switch n {
   535  				case 4:
   536  					*(*byte)(unsafeAdd(dst, 3)) = b[3]
   537  					fallthrough
   538  				case 3:
   539  					*(*byte)(unsafeAdd(dst, 2)) = b[2]
   540  					fallthrough
   541  				case 2:
   542  					*(*byte)(unsafeAdd(dst, 1)) = b[1]
   543  					fallthrough
   544  				case 1:
   545  					*(*byte)(unsafeAdd(dst, 0)) = b[0]
   546  				}
   547  				src = unsafeAdd(src, 6)
   548  				dst = unsafeAdd(dst, n)
   549  			}
   550  		case nul:
   551  			*(*byte)(unsafeAdd(dst, 0)) = 0xEF
   552  			*(*byte)(unsafeAdd(dst, 1)) = 0xBF
   553  			*(*byte)(unsafeAdd(dst, 2)) = 0xBD
   554  			src = unsafeAdd(src, 1)
   555  			dst = unsafeAdd(dst, 3)
   556  		default:
   557  			*(*byte)(dst) = c
   558  			src = unsafeAdd(src, 1)
   559  			dst = unsafeAdd(dst, 1)
   560  		}
   561  	}
   562  }
   563  
   564  func char4(p unsafe.Pointer, offset int64) (byte, byte, byte, byte) {
   565  	return char(p, offset), char(p, offset+1), char(p, offset+2), char(p, offset+3)
   566  }
   567  
   568  func checkHex(v1, v2, v3, v4 byte) int64 {
   569  	if isHex[v1] == 0 {
   570  		return 1
   571  	}
   572  	if isHex[v2] == 0 {
   573  		return 2
   574  	}
   575  	if isHex[v3] == 0 {
   576  		return 3
   577  	}
   578  	if isHex[v4] == 0 {
   579  		return 4
   580  	}
   581  	return 0
   582  }
   583  
   584  func decodeHexRune(v1, v2, v3, v4 byte) rune {
   585  	return rune(hexToInt[v1]<<12 | hexToInt[v2]<<8 | hexToInt[v3]<<4 | hexToInt[v4])
   586  }
   587  
   588  func runeLen(r rune) int64 {
   589  	if r <= 127 {
   590  		return 1
   591  	} else if r <= 2047 {
   592  		return 2
   593  	} else {
   594  		return 3
   595  	}
   596  }
   597  
   598  type bytesStream struct {
   599  	buf    []byte
   600  	length int64
   601  	offset int64
   602  }
   603  
   604  func (b *bytesStream) read() bool {
   605  	return false
   606  }
   607  
   608  func (b *bytesStream) requires(cursor, n int64) int {
   609  	if cursor+n >= b.length {
   610  		return -1
   611  	}
   612  	return 0
   613  }
   614  
   615  func (b *bytesStream) syncBufptr(r int, p *unsafe.Pointer) int {
   616  	return r
   617  }
   618  
   619  func (b *bytesStream) bufptr() unsafe.Pointer {
   620  	panic("unreachable")
   621  }