github.com/night-codes/go-json@v0.9.15/internal/decoder/string.go (about)

     1  package decoder
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"reflect"
     7  	"unicode"
     8  	"unicode/utf16"
     9  	"unicode/utf8"
    10  	"unsafe"
    11  
    12  	"github.com/night-codes/go-json/internal/errors"
    13  )
    14  
    15  type stringDecoder struct {
    16  	structName string
    17  	fieldName  string
    18  }
    19  
    20  func newStringDecoder(structName, fieldName string) *stringDecoder {
    21  	return &stringDecoder{
    22  		structName: structName,
    23  		fieldName:  fieldName,
    24  	}
    25  }
    26  
    27  func (d *stringDecoder) errUnmarshalType(typeName string, offset int64) *errors.UnmarshalTypeError {
    28  	return &errors.UnmarshalTypeError{
    29  		Value:  typeName,
    30  		Type:   reflect.TypeOf(""),
    31  		Offset: offset,
    32  		Struct: d.structName,
    33  		Field:  d.fieldName,
    34  	}
    35  }
    36  
    37  func (d *stringDecoder) DecodeStream(s *Stream, depth int64, p unsafe.Pointer) error {
    38  	bytes, err := d.decodeStreamByte(s)
    39  	if err != nil {
    40  		return err
    41  	}
    42  	if bytes == nil {
    43  		return nil
    44  	}
    45  	**(**string)(unsafe.Pointer(&p)) = *(*string)(unsafe.Pointer(&bytes))
    46  	s.reset()
    47  	return nil
    48  }
    49  
    50  func (d *stringDecoder) Decode(ctx *RuntimeContext, cursor, depth int64, p unsafe.Pointer) (int64, error) {
    51  	bytes, c, err := d.decodeByte(ctx.Buf, cursor)
    52  	if err != nil {
    53  		return 0, err
    54  	}
    55  	if bytes == nil {
    56  		return c, nil
    57  	}
    58  	cursor = c
    59  	**(**string)(unsafe.Pointer(&p)) = *(*string)(unsafe.Pointer(&bytes))
    60  	return cursor, nil
    61  }
    62  
    63  var (
    64  	hexToInt = [256]int{
    65  		'0': 0,
    66  		'1': 1,
    67  		'2': 2,
    68  		'3': 3,
    69  		'4': 4,
    70  		'5': 5,
    71  		'6': 6,
    72  		'7': 7,
    73  		'8': 8,
    74  		'9': 9,
    75  		'A': 10,
    76  		'B': 11,
    77  		'C': 12,
    78  		'D': 13,
    79  		'E': 14,
    80  		'F': 15,
    81  		'a': 10,
    82  		'b': 11,
    83  		'c': 12,
    84  		'd': 13,
    85  		'e': 14,
    86  		'f': 15,
    87  	}
    88  )
    89  
    90  func unicodeToRune(code []byte) rune {
    91  	var r rune
    92  	for i := 0; i < len(code); i++ {
    93  		r = r*16 + rune(hexToInt[code[i]])
    94  	}
    95  	return r
    96  }
    97  
    98  func readAtLeast(s *Stream, n int64, p *unsafe.Pointer) bool {
    99  	for s.cursor+n >= s.length {
   100  		if !s.read() {
   101  			return false
   102  		}
   103  		*p = s.bufptr()
   104  	}
   105  	return true
   106  }
   107  
   108  func decodeUnicodeRune(s *Stream, p unsafe.Pointer) (rune, int64, unsafe.Pointer, error) {
   109  	const defaultOffset = 5
   110  	const surrogateOffset = 11
   111  
   112  	if !readAtLeast(s, defaultOffset, &p) {
   113  		return rune(0), 0, nil, errors.ErrInvalidCharacter(s.char(), "escaped string", s.totalOffset())
   114  	}
   115  
   116  	r := unicodeToRune(s.buf[s.cursor+1 : s.cursor+defaultOffset])
   117  	if utf16.IsSurrogate(r) {
   118  		if !readAtLeast(s, surrogateOffset, &p) {
   119  			return unicode.ReplacementChar, defaultOffset, p, nil
   120  		}
   121  		if s.buf[s.cursor+defaultOffset] != '\\' || s.buf[s.cursor+defaultOffset+1] != 'u' {
   122  			return unicode.ReplacementChar, defaultOffset, p, nil
   123  		}
   124  		r2 := unicodeToRune(s.buf[s.cursor+defaultOffset+2 : s.cursor+surrogateOffset])
   125  		if r := utf16.DecodeRune(r, r2); r != unicode.ReplacementChar {
   126  			return r, surrogateOffset, p, nil
   127  		}
   128  	}
   129  	return r, defaultOffset, p, nil
   130  }
   131  
   132  func decodeUnicode(s *Stream, p unsafe.Pointer) (unsafe.Pointer, error) {
   133  	const backSlashAndULen = 2 // length of \u
   134  
   135  	r, offset, pp, err := decodeUnicodeRune(s, p)
   136  	if err != nil {
   137  		return nil, err
   138  	}
   139  	unicode := []byte(string(r))
   140  	unicodeLen := int64(len(unicode))
   141  	s.buf = append(append(s.buf[:s.cursor-1], unicode...), s.buf[s.cursor+offset:]...)
   142  	unicodeOrgLen := offset - 1
   143  	s.length = s.length - (backSlashAndULen + (unicodeOrgLen - unicodeLen))
   144  	s.cursor = s.cursor - backSlashAndULen + unicodeLen
   145  	return pp, nil
   146  }
   147  
   148  func decodeEscapeString(s *Stream, p unsafe.Pointer) (unsafe.Pointer, error) {
   149  	s.cursor++
   150  RETRY:
   151  	switch s.buf[s.cursor] {
   152  	case '"':
   153  		s.buf[s.cursor] = '"'
   154  	case '\\':
   155  		s.buf[s.cursor] = '\\'
   156  	case '/':
   157  		s.buf[s.cursor] = '/'
   158  	case 'b':
   159  		s.buf[s.cursor] = '\b'
   160  	case 'f':
   161  		s.buf[s.cursor] = '\f'
   162  	case 'n':
   163  		s.buf[s.cursor] = '\n'
   164  	case 'r':
   165  		s.buf[s.cursor] = '\r'
   166  	case 't':
   167  		s.buf[s.cursor] = '\t'
   168  	case 'u':
   169  		return decodeUnicode(s, p)
   170  	case nul:
   171  		if !s.read() {
   172  			return nil, errors.ErrInvalidCharacter(s.char(), "escaped string", s.totalOffset())
   173  		}
   174  		p = s.bufptr()
   175  		goto RETRY
   176  	default:
   177  		return nil, errors.ErrUnexpectedEndOfJSON("string", s.totalOffset())
   178  	}
   179  	s.buf = append(s.buf[:s.cursor-1], s.buf[s.cursor:]...)
   180  	s.length--
   181  	s.cursor--
   182  	p = s.bufptr()
   183  	return p, nil
   184  }
   185  
   186  var (
   187  	runeErrBytes    = []byte(string(utf8.RuneError))
   188  	runeErrBytesLen = int64(len(runeErrBytes))
   189  )
   190  
// stringBytes scans the JSON string literal that starts at the stream's current
// cursor (which is expected to be on the opening double quote) and returns the
// bytes between the quotes as a sub-slice of s.buf.
//
// Escape sequences are decoded in place by decodeEscapeString, and bytes that
// are not valid UTF-8 are replaced with runeErrBytes, so the returned slice
// holds the unescaped contents. On success s.cursor is left just past the
// closing quote. When the input ends before the closing quote,
// errors.ErrUnexpectedEndOfJSON is returned.
func stringBytes(s *Stream) ([]byte, error) {
	_, cursor, p := s.stat()
	cursor++ // skip double quote char
	start := cursor
	for {
		switch char(p, cursor) {
		case '\\':
			// decodeEscapeString works on s.cursor and rewrites s.buf, so sync
			// the local cursor into the stream before the call and back out
			// (together with the buffer pointer) afterwards.
			s.cursor = cursor
			pp, err := decodeEscapeString(s, p)
			if err != nil {
				return nil, err
			}
			p = pp
			cursor = s.cursor
		case '"':
			// Closing quote: the literal is everything since the opening quote.
			literal := s.buf[start:cursor]
			cursor++
			s.cursor = cursor
			return literal, nil
		case
			// 0x00 is nul, 0x5c is '\\', 0x22 is '"' .
			0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, // 0x01-0x0F
			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, // 0x10-0x1F
			0x20, 0x21 /*0x22,*/, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, // 0x20-0x2F
			0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, // 0x30-0x3F
			0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, // 0x40-0x4F
			0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B /*0x5C,*/, 0x5D, 0x5E, 0x5F, // 0x50-0x5F
			0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
			0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F: // 0x70-0x7F
			// character is ASCII. skip to next char
		case
			0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, // 0x80-0x8F
			0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, // 0x90-0x9F
			0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, // 0xA0-0xAF
			0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, // 0xB0-0xBF
			0xC0, 0xC1, // 0xC0-0xC1
			0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF: // 0xF5-0xFF
			// character is invalid: these bytes can never begin a UTF-8
			// sequence (continuation bytes, overlong leads, and leads beyond
			// the Unicode range). Replace the single byte with runeErrBytes in
			// a freshly allocated buffer and refresh p from the stream.
			s.buf = append(append(append([]byte{}, s.buf[:cursor]...), runeErrBytes...), s.buf[cursor+1:]...)
			_, _, p = s.stat()
			cursor += runeErrBytesLen
			// NOTE(review): one byte was replaced by len(runeErrBytes) bytes, so
			// s.buf grew by runeErrBytesLen-1 while s.length grows by the full
			// runeErrBytesLen — confirm against Stream's definition of length.
			s.length += runeErrBytesLen
			continue
		case nul:
			// End of the buffered data: try to read more and rescan from the
			// stream's state, otherwise the string is unterminated.
			s.cursor = cursor
			if s.read() {
				_, cursor, p = s.stat()
				continue
			}
			goto ERROR
		case 0xEF:
			// RuneError is {0xEF, 0xBF, 0xBD}
			// NOTE(review): cursor+1 and cursor+2 are indexed without a bounds
			// check here — presumably the buffer is padded; verify.
			if s.buf[cursor+1] == 0xBF && s.buf[cursor+2] == 0xBD {
				// found RuneError: skip
				// (the cursor++ after the switch steps over the third byte)
				cursor += 2
				break
			}
			fallthrough
		default:
			// multi bytes character
			// The final byte of s.buf is excluded from the completeness check
			// — presumably a trailing nul terminator; verify against Stream.
			if !utf8.FullRune(s.buf[cursor : len(s.buf)-1]) {
				// The sequence is cut off by the end of the buffer: read more
				// input, or fail if there is none.
				s.cursor = cursor
				if s.read() {
					_, cursor, p = s.stat()
					continue
				}
				goto ERROR
			}
			r, size := utf8.DecodeRune(s.buf[cursor:])
			if r == utf8.RuneError {
				// Invalid sequence: replace its first byte with runeErrBytes in
				// a freshly allocated buffer, same as the invalid-byte case.
				s.buf = append(append(append([]byte{}, s.buf[:cursor]...), runeErrBytes...), s.buf[cursor+1:]...)
				cursor += runeErrBytesLen
				s.length += runeErrBytesLen
				_, _, p = s.stat()
			} else {
				cursor += int64(size)
			}
			continue
		}
		cursor++
	}
ERROR:
	return nil, errors.ErrUnexpectedEndOfJSON("string", s.totalOffset())
}
   275  
   276  func (d *stringDecoder) decodeStreamByte(s *Stream) ([]byte, error) {
   277  	for {
   278  		switch s.char() {
   279  		case ' ', '\n', '\t', '\r':
   280  			s.cursor++
   281  			continue
   282  		case '[':
   283  			return nil, d.errUnmarshalType("array", s.totalOffset())
   284  		case '{':
   285  			return nil, d.errUnmarshalType("object", s.totalOffset())
   286  		case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   287  			return nil, d.errUnmarshalType("number", s.totalOffset())
   288  		case '"':
   289  			return stringBytes(s)
   290  		case 'n':
   291  			if err := nullBytes(s); err != nil {
   292  				return nil, err
   293  			}
   294  			return nil, nil
   295  		case nul:
   296  			if s.read() {
   297  				continue
   298  			}
   299  		}
   300  		break
   301  	}
   302  	return nil, errors.ErrInvalidBeginningOfValue(s.char(), s.totalOffset())
   303  }
   304  
   305  func (d *stringDecoder) decodeByte(buf []byte, cursor int64) ([]byte, int64, error) {
   306  	for {
   307  		switch buf[cursor] {
   308  		case ' ', '\n', '\t', '\r':
   309  			cursor++
   310  		case '[':
   311  			return nil, 0, d.errUnmarshalType("array", cursor)
   312  		case '{':
   313  			return nil, 0, d.errUnmarshalType("object", cursor)
   314  		case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   315  			return nil, 0, d.errUnmarshalType("number", cursor)
   316  		case '"':
   317  			cursor++
   318  			start := cursor
   319  			b := (*sliceHeader)(unsafe.Pointer(&buf)).data
   320  			escaped := 0
   321  			for {
   322  				switch char(b, cursor) {
   323  				case '\\':
   324  					escaped++
   325  					cursor++
   326  					switch char(b, cursor) {
   327  					case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
   328  						cursor++
   329  					case 'u':
   330  						buflen := int64(len(buf))
   331  						if cursor+5 >= buflen {
   332  							return nil, 0, errors.ErrUnexpectedEndOfJSON("escaped string", cursor)
   333  						}
   334  						for i := int64(1); i <= 4; i++ {
   335  							c := char(b, cursor+i)
   336  							if !(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')) {
   337  								return nil, 0, errors.ErrSyntax(fmt.Sprintf("json: invalid character %c in \\u hexadecimal character escape", c), cursor+i)
   338  							}
   339  						}
   340  						cursor += 5
   341  					default:
   342  						return nil, 0, errors.ErrUnexpectedEndOfJSON("escaped string", cursor)
   343  					}
   344  					continue
   345  				case '"':
   346  					literal := buf[start:cursor]
   347  					if escaped > 0 {
   348  						literal = literal[:unescapeString(literal)]
   349  					}
   350  					cursor++
   351  					return literal, cursor, nil
   352  				case nul:
   353  					return nil, 0, errors.ErrUnexpectedEndOfJSON("string", cursor)
   354  				}
   355  				cursor++
   356  			}
   357  		case 'n':
   358  			if err := validateNull(buf, cursor); err != nil {
   359  				return nil, 0, err
   360  			}
   361  			cursor += 4
   362  			return nil, cursor, nil
   363  		default:
   364  			return nil, 0, errors.ErrInvalidBeginningOfValue(buf[cursor], cursor)
   365  		}
   366  	}
   367  }
   368  
   369  var unescapeMap = [256]byte{
   370  	'"':  '"',
   371  	'\\': '\\',
   372  	'/':  '/',
   373  	'b':  '\b',
   374  	'f':  '\f',
   375  	'n':  '\n',
   376  	'r':  '\r',
   377  	't':  '\t',
   378  }
   379  
// unsafeAdd returns ptr advanced by offset bytes.
//
// The conversion to uintptr, the addition and the conversion back to
// unsafe.Pointer are deliberately kept in one expression: the unsafe package
// only sanctions pointer arithmetic in that form, so do not split it across
// statements or store the intermediate uintptr in a variable.
func unsafeAdd(ptr unsafe.Pointer, offset int) unsafe.Pointer {
	return unsafe.Pointer(uintptr(ptr) + uintptr(offset))
}
   383  
// unescapeString decodes the JSON escape sequences in buf in place and returns
// the length of the unescaped result, which occupies the front of buf.
//
// buf must contain at least one backslash: bytes.IndexByte returns -1 when
// there is none, which would start the scan one byte before buf. The escapes
// are not validated here; the caller in this file (decodeByte) checks the
// escape characters and \u hex digits before calling.
//
// A \u escape holding a high surrogate that is followed by a \u escape holding
// a low surrogate is combined into one code point. Any other surrogate is
// passed to utf8.EncodeRune unchanged, which encodes invalid runes as U+FFFD.
func unescapeString(buf []byte) int {
	p := (*sliceHeader)(unsafe.Pointer(&buf)).data
	end := unsafeAdd(p, len(buf))
	// Everything before the first backslash is already in its final place, so
	// reading (src) and writing (dst) both start at that backslash.
	src := unsafeAdd(p, bytes.IndexByte(buf, '\\'))
	dst := src
	for src != end {
		c := char(src, 0)
		if c == '\\' {
			escapeChar := char(src, 1)
			if escapeChar != 'u' {
				// Two-byte escape (backslash + character) becomes one byte.
				*(*byte)(dst) = unescapeMap[escapeChar]
				src = unsafeAdd(src, 2)
				dst = unsafeAdd(dst, 1)
			} else {
				// \uXXXX: assemble the 16-bit value from its four hex digits.
				v1 := hexToInt[char(src, 2)]
				v2 := hexToInt[char(src, 3)]
				v3 := hexToInt[char(src, 4)]
				v4 := hexToInt[char(src, 5)]
				code := rune((v1 << 12) | (v2 << 8) | (v3 << 4) | v4)
				// High surrogate: look for a following \uXXXX, but only when
				// byte 11 from src (the last digit of that second escape) is
				// still inside buf.
				if code >= 0xd800 && code < 0xdc00 && uintptr(unsafeAdd(src, 11)) < uintptr(end) {
					if char(src, 6) == '\\' && char(src, 7) == 'u' {
						v1 := hexToInt[char(src, 8)]
						v2 := hexToInt[char(src, 9)]
						v3 := hexToInt[char(src, 10)]
						v4 := hexToInt[char(src, 11)]
						lo := rune((v1 << 12) | (v2 << 8) | (v3 << 4) | v4)
						if lo >= 0xdc00 && lo < 0xe000 {
							// Valid pair: combine into a supplementary code
							// point and consume the second escape here; the
							// first one is consumed below.
							code = (code-0xd800)<<10 | (lo - 0xdc00) + 0x10000
							src = unsafeAdd(src, 6)
						}
					}
				}
				// Encode into a scratch array first, then copy the n bytes to
				// dst. Each escape is longer than its UTF-8 encoding, so dst
				// never moves past src.
				var b [utf8.UTFMax]byte
				n := utf8.EncodeRune(b[:], code)
				switch n {
				case 4:
					*(*byte)(unsafeAdd(dst, 3)) = b[3]
					fallthrough
				case 3:
					*(*byte)(unsafeAdd(dst, 2)) = b[2]
					fallthrough
				case 2:
					*(*byte)(unsafeAdd(dst, 1)) = b[1]
					fallthrough
				case 1:
					*(*byte)(unsafeAdd(dst, 0)) = b[0]
				}
				src = unsafeAdd(src, 6)
				dst = unsafeAdd(dst, n)
			}
		} else {
			// Ordinary byte: move it down to close the gap left by earlier
			// escapes.
			*(*byte)(dst) = c
			src = unsafeAdd(src, 1)
			dst = unsafeAdd(dst, 1)
		}
	}
	return int(uintptr(dst) - uintptr(p))
}
   440  	return int(uintptr(dst) - uintptr(p))
   441  }