github.com/grbit/go-json@v0.11.0/internal/decoder/string.go (about)

     1  package decoder
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"reflect"
     7  	"unicode"
     8  	"unicode/utf16"
     9  	"unicode/utf8"
    10  	"unsafe"
    11  
    12  	"github.com/grbit/go-json/internal/errors"
    13  )
    14  
    15  type stringDecoder struct {
    16  	structName string
    17  	fieldName  string
    18  }
    19  
    20  func newStringDecoder(structName, fieldName string) *stringDecoder {
    21  	return &stringDecoder{
    22  		structName: structName,
    23  		fieldName:  fieldName,
    24  	}
    25  }
    26  
    27  func (d *stringDecoder) errUnmarshalType(typeName string, offset int64) *errors.UnmarshalTypeError {
    28  	return &errors.UnmarshalTypeError{
    29  		Value:  typeName,
    30  		Type:   reflect.TypeOf(""),
    31  		Offset: offset,
    32  		Struct: d.structName,
    33  		Field:  d.fieldName,
    34  	}
    35  }
    36  
    37  func (d *stringDecoder) DecodeStream(s *Stream, depth int64, p unsafe.Pointer) error {
    38  	bytes, err := d.decodeStreamByte(s)
    39  	if err != nil {
    40  		return err
    41  	}
    42  	if bytes == nil {
    43  		return nil
    44  	}
    45  	**(**string)(unsafe.Pointer(&p)) = *(*string)(unsafe.Pointer(&bytes))
    46  	s.reset()
    47  	return nil
    48  }
    49  
    50  func (d *stringDecoder) Decode(ctx *RuntimeContext, cursor, depth int64, p unsafe.Pointer) (int64, error) {
    51  	bytes, c, err := d.decodeByte(ctx.Buf, cursor)
    52  	if err != nil {
    53  		return 0, err
    54  	}
    55  	if bytes == nil {
    56  		return c, nil
    57  	}
    58  	cursor = c
    59  	**(**string)(unsafe.Pointer(&p)) = *(*string)(unsafe.Pointer(&bytes))
    60  	return cursor, nil
    61  }
    62  
    63  func (d *stringDecoder) DecodePath(ctx *RuntimeContext, cursor, depth int64) ([][]byte, int64, error) {
    64  	bytes, c, err := d.decodeByte(ctx.Buf, cursor)
    65  	if err != nil {
    66  		return nil, 0, err
    67  	}
    68  	if bytes == nil {
    69  		return [][]byte{nullbytes}, c, nil
    70  	}
    71  	return [][]byte{bytes}, c, nil
    72  }
    73  
    74  var (
    75  	hexToInt = [256]int{
    76  		'0': 0,
    77  		'1': 1,
    78  		'2': 2,
    79  		'3': 3,
    80  		'4': 4,
    81  		'5': 5,
    82  		'6': 6,
    83  		'7': 7,
    84  		'8': 8,
    85  		'9': 9,
    86  		'A': 10,
    87  		'B': 11,
    88  		'C': 12,
    89  		'D': 13,
    90  		'E': 14,
    91  		'F': 15,
    92  		'a': 10,
    93  		'b': 11,
    94  		'c': 12,
    95  		'd': 13,
    96  		'e': 14,
    97  		'f': 15,
    98  	}
    99  )
   100  
   101  func unicodeToRune(code []byte) rune {
   102  	var r rune
   103  	for i := 0; i < len(code); i++ {
   104  		r = r*16 + rune(hexToInt[code[i]])
   105  	}
   106  	return r
   107  }
   108  
   109  func readAtLeast(s *Stream, n int64, p *unsafe.Pointer) bool {
   110  	for s.cursor+n >= s.length {
   111  		if !s.read() {
   112  			return false
   113  		}
   114  		*p = s.bufptr()
   115  	}
   116  	return true
   117  }
   118  
   119  func decodeUnicodeRune(s *Stream, p unsafe.Pointer) (rune, int64, unsafe.Pointer, error) {
   120  	const defaultOffset = 5
   121  	const surrogateOffset = 11
   122  
   123  	if !readAtLeast(s, defaultOffset, &p) {
   124  		return rune(0), 0, nil, errors.ErrInvalidCharacter(s.char(), "escaped string", s.totalOffset())
   125  	}
   126  
   127  	r := unicodeToRune(s.buf[s.cursor+1 : s.cursor+defaultOffset])
   128  	if utf16.IsSurrogate(r) {
   129  		if !readAtLeast(s, surrogateOffset, &p) {
   130  			return unicode.ReplacementChar, defaultOffset, p, nil
   131  		}
   132  		if s.buf[s.cursor+defaultOffset] != '\\' || s.buf[s.cursor+defaultOffset+1] != 'u' {
   133  			return unicode.ReplacementChar, defaultOffset, p, nil
   134  		}
   135  		r2 := unicodeToRune(s.buf[s.cursor+defaultOffset+2 : s.cursor+surrogateOffset])
   136  		if r := utf16.DecodeRune(r, r2); r != unicode.ReplacementChar {
   137  			return r, surrogateOffset, p, nil
   138  		}
   139  	}
   140  	return r, defaultOffset, p, nil
   141  }
   142  
   143  func decodeUnicode(s *Stream, p unsafe.Pointer) (unsafe.Pointer, error) {
   144  	const backSlashAndULen = 2 // length of \u
   145  
   146  	r, offset, pp, err := decodeUnicodeRune(s, p)
   147  	if err != nil {
   148  		return nil, err
   149  	}
   150  	unicode := []byte(string(r))
   151  	unicodeLen := int64(len(unicode))
   152  	s.buf = append(append(s.buf[:s.cursor-1], unicode...), s.buf[s.cursor+offset:]...)
   153  	unicodeOrgLen := offset - 1
   154  	s.length = s.length - (backSlashAndULen + (unicodeOrgLen - unicodeLen))
   155  	s.cursor = s.cursor - backSlashAndULen + unicodeLen
   156  	return pp, nil
   157  }
   158  
   159  func decodeEscapeString(s *Stream, p unsafe.Pointer) (unsafe.Pointer, error) {
   160  	s.cursor++
   161  RETRY:
   162  	switch s.buf[s.cursor] {
   163  	case '"':
   164  		s.buf[s.cursor] = '"'
   165  	case '\\':
   166  		s.buf[s.cursor] = '\\'
   167  	case '/':
   168  		s.buf[s.cursor] = '/'
   169  	case 'b':
   170  		s.buf[s.cursor] = '\b'
   171  	case 'f':
   172  		s.buf[s.cursor] = '\f'
   173  	case 'n':
   174  		s.buf[s.cursor] = '\n'
   175  	case 'r':
   176  		s.buf[s.cursor] = '\r'
   177  	case 't':
   178  		s.buf[s.cursor] = '\t'
   179  	case 'u':
   180  		return decodeUnicode(s, p)
   181  	case nul:
   182  		if !s.read() {
   183  			return nil, errors.ErrInvalidCharacter(s.char(), "escaped string", s.totalOffset())
   184  		}
   185  		p = s.bufptr()
   186  		goto RETRY
   187  	default:
   188  		return nil, errors.ErrUnexpectedEndOfJSON("string", s.totalOffset())
   189  	}
   190  	s.buf = append(s.buf[:s.cursor-1], s.buf[s.cursor:]...)
   191  	s.length--
   192  	s.cursor--
   193  	p = s.bufptr()
   194  	return p, nil
   195  }
   196  
   197  var (
   198  	runeErrBytes    = []byte(string(utf8.RuneError))
   199  	runeErrBytesLen = int64(len(runeErrBytes))
   200  )
   201  
// stringBytes scans a JSON string literal from the stream. The stream cursor
// is expected to be on the opening double quote. On success it returns the
// bytes between the quotes as a sub-slice of s.buf and leaves s.cursor just
// past the closing quote.
//
// While scanning, the buffer is edited in place: escape sequences are decoded
// by decodeEscapeString, and bytes that cannot start a valid UTF-8 sequence
// (or sequences that fail to decode) are replaced with runeErrBytes. When a
// nul byte or an incomplete multi-byte sequence is met, more data is read from
// the stream; if none is available an "unexpected end of JSON" error is
// returned.
func stringBytes(s *Stream) ([]byte, error) {
	_, cursor, p := s.stat()
	cursor++ // skip double quote char
	start := cursor
	for {
		switch char(p, cursor) {
		case '\\':
			// decodeEscapeString works on s.cursor and may rewrite s.buf, so
			// sync the local cursor in and back out around the call.
			s.cursor = cursor
			pp, err := decodeEscapeString(s, p)
			if err != nil {
				return nil, err
			}
			p = pp
			cursor = s.cursor
		case '"':
			// Closing quote: the literal is everything since the opening quote.
			literal := s.buf[start:cursor]
			cursor++
			s.cursor = cursor
			return literal, nil
		case
			// 0x00 is nul, 0x5c is '\\', 0x22 is '"' .
			0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, // 0x00-0x0F
			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, // 0x10-0x1F
			0x20, 0x21 /*0x22,*/, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, // 0x20-0x2F
			0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, // 0x30-0x3F
			0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, // 0x40-0x4F
			0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B /*0x5C,*/, 0x5D, 0x5E, 0x5F, // 0x50-0x5F
			0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
			0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F: // 0x70-0x7F
			// character is ASCII. skip to next char
		case
			0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, // 0x80-0x8F
			0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, // 0x90-0x9F
			0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, // 0xA0-0xAF
			0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, // 0xB0-0xBF
			0xC0, 0xC1, // 0xC0-0xC1
			0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF: // 0xF5-0xFF
			// character is invalid: these bytes can never begin a UTF-8
			// sequence, so rebuild the buffer with runeErrBytes in place of
			// this single byte and step past the replacement.
			// NOTE(review): one byte is swapped for runeErrBytesLen bytes, so
			// the buffer grows by runeErrBytesLen-1, yet s.length is advanced
			// by runeErrBytesLen — confirm this is what Stream expects.
			s.buf = append(append(append([]byte{}, s.buf[:cursor]...), runeErrBytes...), s.buf[cursor+1:]...)
			_, _, p = s.stat()
			cursor += runeErrBytesLen
			s.length += runeErrBytesLen
			continue
		case nul:
			// End of buffered data: try to read more, then resume from the
			// stream's own cursor and buffer pointer.
			s.cursor = cursor
			if s.read() {
				_, cursor, p = s.stat()
				continue
			}
			goto ERROR
		case 0xEF:
			// RuneError is {0xEF, 0xBF, 0xBD}
			if s.buf[cursor+1] == 0xBF && s.buf[cursor+2] == 0xBD {
				// found RuneError: skip
				// (an already-encoded U+FFFD must not be mistaken for a
				// decode failure in the default branch below; the trailing
				// cursor++ accounts for the third byte)
				cursor += 2
				break
			}
			fallthrough
		default:
			// multi bytes character
			// The last byte of s.buf is excluded from the completeness check.
			if !utf8.FullRune(s.buf[cursor : len(s.buf)-1]) {
				// Sequence is cut off at the end of the buffer: read more.
				s.cursor = cursor
				if s.read() {
					_, cursor, p = s.stat()
					continue
				}
				goto ERROR
			}
			r, size := utf8.DecodeRune(s.buf[cursor:])
			if r == utf8.RuneError {
				// Undecodable sequence: replace its first byte with
				// runeErrBytes (same buffer rebuild as the invalid-byte case).
				s.buf = append(append(append([]byte{}, s.buf[:cursor]...), runeErrBytes...), s.buf[cursor+1:]...)
				cursor += runeErrBytesLen
				s.length += runeErrBytesLen
				_, _, p = s.stat()
			} else {
				cursor += int64(size)
			}
			continue
		}
		cursor++
	}
ERROR:
	return nil, errors.ErrUnexpectedEndOfJSON("string", s.totalOffset())
}
   286  
   287  func (d *stringDecoder) decodeStreamByte(s *Stream) ([]byte, error) {
   288  	for {
   289  		switch s.char() {
   290  		case ' ', '\n', '\t', '\r':
   291  			s.cursor++
   292  			continue
   293  		case '[':
   294  			return nil, d.errUnmarshalType("array", s.totalOffset())
   295  		case '{':
   296  			return nil, d.errUnmarshalType("object", s.totalOffset())
   297  		case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   298  			return nil, d.errUnmarshalType("number", s.totalOffset())
   299  		case '"':
   300  			return stringBytes(s)
   301  		case 'n':
   302  			if err := nullBytes(s); err != nil {
   303  				return nil, err
   304  			}
   305  			return nil, nil
   306  		case nul:
   307  			if s.read() {
   308  				continue
   309  			}
   310  		}
   311  		break
   312  	}
   313  	return nil, errors.ErrInvalidBeginningOfValue(s.char(), s.totalOffset())
   314  }
   315  
   316  func (d *stringDecoder) decodeByte(buf []byte, cursor int64) ([]byte, int64, error) {
   317  	for {
   318  		switch buf[cursor] {
   319  		case ' ', '\n', '\t', '\r':
   320  			cursor++
   321  		case '[':
   322  			return nil, 0, d.errUnmarshalType("array", cursor)
   323  		case '{':
   324  			return nil, 0, d.errUnmarshalType("object", cursor)
   325  		case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   326  			return nil, 0, d.errUnmarshalType("number", cursor)
   327  		case '"':
   328  			cursor++
   329  			start := cursor
   330  			b := (*sliceHeader)(unsafe.Pointer(&buf)).data
   331  			escaped := 0
   332  			for {
   333  				switch char(b, cursor) {
   334  				case '\\':
   335  					escaped++
   336  					cursor++
   337  					switch char(b, cursor) {
   338  					case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
   339  						cursor++
   340  					case 'u':
   341  						buflen := int64(len(buf))
   342  						if cursor+5 >= buflen {
   343  							return nil, 0, errors.ErrUnexpectedEndOfJSON("escaped string", cursor)
   344  						}
   345  						for i := int64(1); i <= 4; i++ {
   346  							c := char(b, cursor+i)
   347  							if !(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')) {
   348  								return nil, 0, errors.ErrSyntax(fmt.Sprintf("json: invalid character %c in \\u hexadecimal character escape", c), cursor+i)
   349  							}
   350  						}
   351  						cursor += 5
   352  					default:
   353  						return nil, 0, errors.ErrUnexpectedEndOfJSON("escaped string", cursor)
   354  					}
   355  					continue
   356  				case '"':
   357  					literal := buf[start:cursor]
   358  					if escaped > 0 {
   359  						literal = literal[:unescapeString(literal)]
   360  					}
   361  					cursor++
   362  					return literal, cursor, nil
   363  				case nul:
   364  					return nil, 0, errors.ErrUnexpectedEndOfJSON("string", cursor)
   365  				}
   366  				cursor++
   367  			}
   368  		case 'n':
   369  			if err := validateNull(buf, cursor); err != nil {
   370  				return nil, 0, err
   371  			}
   372  			cursor += 4
   373  			return nil, cursor, nil
   374  		default:
   375  			return nil, 0, errors.ErrInvalidBeginningOfValue(buf[cursor], cursor)
   376  		}
   377  	}
   378  }
   379  
   380  var unescapeMap = [256]byte{
   381  	'"':  '"',
   382  	'\\': '\\',
   383  	'/':  '/',
   384  	'b':  '\b',
   385  	'f':  '\f',
   386  	'n':  '\n',
   387  	'r':  '\r',
   388  	't':  '\t',
   389  }
   390  
// unsafeAdd returns ptr advanced by offset bytes. No bounds checking is
// performed; the caller is responsible for keeping the result inside the
// object that ptr refers to.
func unsafeAdd(ptr unsafe.Pointer, offset int) unsafe.Pointer {
	// The uintptr conversion, the addition and the conversion back are kept
	// in a single expression.
	return unsafe.Pointer(uintptr(ptr) + uintptr(offset))
}
   394  
   395  func unescapeString(buf []byte) int {
   396  	p := (*sliceHeader)(unsafe.Pointer(&buf)).data
   397  	end := unsafeAdd(p, len(buf))
   398  	src := unsafeAdd(p, bytes.IndexByte(buf, '\\'))
   399  	dst := src
   400  	for src != end {
   401  		c := char(src, 0)
   402  		if c == '\\' {
   403  			escapeChar := char(src, 1)
   404  			if escapeChar != 'u' {
   405  				*(*byte)(dst) = unescapeMap[escapeChar]
   406  				src = unsafeAdd(src, 2)
   407  				dst = unsafeAdd(dst, 1)
   408  			} else {
   409  				v1 := hexToInt[char(src, 2)]
   410  				v2 := hexToInt[char(src, 3)]
   411  				v3 := hexToInt[char(src, 4)]
   412  				v4 := hexToInt[char(src, 5)]
   413  				code := rune((v1 << 12) | (v2 << 8) | (v3 << 4) | v4)
   414  				if code >= 0xd800 && code < 0xdc00 && uintptr(unsafeAdd(src, 11)) < uintptr(end) {
   415  					if char(src, 6) == '\\' && char(src, 7) == 'u' {
   416  						v1 := hexToInt[char(src, 8)]
   417  						v2 := hexToInt[char(src, 9)]
   418  						v3 := hexToInt[char(src, 10)]
   419  						v4 := hexToInt[char(src, 11)]
   420  						lo := rune((v1 << 12) | (v2 << 8) | (v3 << 4) | v4)
   421  						if lo >= 0xdc00 && lo < 0xe000 {
   422  							code = (code-0xd800)<<10 | (lo - 0xdc00) + 0x10000
   423  							src = unsafeAdd(src, 6)
   424  						}
   425  					}
   426  				}
   427  				var b [utf8.UTFMax]byte
   428  				n := utf8.EncodeRune(b[:], code)
   429  				switch n {
   430  				case 4:
   431  					*(*byte)(unsafeAdd(dst, 3)) = b[3]
   432  					fallthrough
   433  				case 3:
   434  					*(*byte)(unsafeAdd(dst, 2)) = b[2]
   435  					fallthrough
   436  				case 2:
   437  					*(*byte)(unsafeAdd(dst, 1)) = b[1]
   438  					fallthrough
   439  				case 1:
   440  					*(*byte)(unsafeAdd(dst, 0)) = b[0]
   441  				}
   442  				src = unsafeAdd(src, 6)
   443  				dst = unsafeAdd(dst, n)
   444  			}
   445  		} else {
   446  			*(*byte)(dst) = c
   447  			src = unsafeAdd(src, 1)
   448  			dst = unsafeAdd(dst, 1)
   449  		}
   450  	}
   451  	return int(uintptr(dst) - uintptr(p))
   452  }