github.com/night-codes/go-json@v0.9.15/internal/encoder/string.go (about)

     1  package encoder
     2  
     3  import (
     4  	"math/bits"
     5  	"reflect"
     6  	"unsafe"
     7  )
     8  
const (
	// lsb has the least-significant bit of each of the eight bytes set.
	// Multiplying it by a single byte value replicates that value into every
	// byte of a 64-bit word (e.g. lsb * 0x20 == 0x2020202020202020).
	lsb = 0x0101010101010101
	// msb has the most-significant bit of each of the eight bytes set. It is
	// AND-ed with the combined escape mask to isolate the per-byte flag bits.
	msb = 0x8080808080808080
)
    13  
// hex maps a 4-bit value to its lowercase hexadecimal digit; it is indexed
// with the high and low nibble of a byte when emitting \u00XX escapes.
var hex = "0123456789abcdef"
    15  
    16  //nolint:govet
    17  func stringToUint64Slice(s string) []uint64 {
    18  	return *(*[]uint64)(unsafe.Pointer(&reflect.SliceHeader{
    19  		Data: ((*reflect.StringHeader)(unsafe.Pointer(&s))).Data,
    20  		Len:  len(s) / 8,
    21  		Cap:  len(s) / 8,
    22  	}))
    23  }
    24  
    25  func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte {
    26  	if ctx.Option.Flag&HTMLEscapeOption != 0 {
    27  		if ctx.Option.Flag&NormalizeUTF8Option != 0 {
    28  			return appendNormalizedHTMLString(buf, s)
    29  		}
    30  		return appendHTMLString(buf, s)
    31  	}
    32  	if ctx.Option.Flag&NormalizeUTF8Option != 0 {
    33  		return appendNormalizedString(buf, s)
    34  	}
    35  	return appendString(buf, s)
    36  }
    37  
    38  func appendNormalizedHTMLString(buf []byte, s string) []byte {
    39  	valLen := len(s)
    40  	if valLen == 0 {
    41  		return append(buf, `""`...)
    42  	}
    43  	buf = append(buf, '"')
    44  	var (
    45  		i, j int
    46  	)
    47  	if valLen >= 8 {
    48  		chunks := stringToUint64Slice(s)
    49  		for _, n := range chunks {
    50  			// combine masks before checking for the MSB of each byte. We include
    51  			// `n` in the mask to check whether any of the *input* byte MSBs were
    52  			// set (i.e. the byte was outside the ASCII range).
    53  			mask := n | (n - (lsb * 0x20)) |
    54  				((n ^ (lsb * '"')) - lsb) |
    55  				((n ^ (lsb * '\\')) - lsb) |
    56  				((n ^ (lsb * '<')) - lsb) |
    57  				((n ^ (lsb * '>')) - lsb) |
    58  				((n ^ (lsb * '&')) - lsb)
    59  			if (mask & msb) != 0 {
    60  				j = bits.TrailingZeros64(mask&msb) / 8
    61  				goto ESCAPE_END
    62  			}
    63  		}
    64  		for i := len(chunks) * 8; i < valLen; i++ {
    65  			if needEscapeHTMLNormalizeUTF8[s[i]] {
    66  				j = i
    67  				goto ESCAPE_END
    68  			}
    69  		}
    70  		// no found any escape characters.
    71  		return append(append(buf, s...), '"')
    72  	}
    73  ESCAPE_END:
    74  	for j < valLen {
    75  		c := s[j]
    76  
    77  		if !needEscapeHTMLNormalizeUTF8[c] {
    78  			// fast path: most of the time, printable ascii characters are used
    79  			j++
    80  			continue
    81  		}
    82  
    83  		switch c {
    84  		case '\\', '"':
    85  			buf = append(buf, s[i:j]...)
    86  			buf = append(buf, '\\', c)
    87  			i = j + 1
    88  			j = j + 1
    89  			continue
    90  
    91  		case '\n':
    92  			buf = append(buf, s[i:j]...)
    93  			buf = append(buf, '\\', 'n')
    94  			i = j + 1
    95  			j = j + 1
    96  			continue
    97  
    98  		case '\r':
    99  			buf = append(buf, s[i:j]...)
   100  			buf = append(buf, '\\', 'r')
   101  			i = j + 1
   102  			j = j + 1
   103  			continue
   104  
   105  		case '\t':
   106  			buf = append(buf, s[i:j]...)
   107  			buf = append(buf, '\\', 't')
   108  			i = j + 1
   109  			j = j + 1
   110  			continue
   111  
   112  		case '<', '>', '&':
   113  			buf = append(buf, s[i:j]...)
   114  			buf = append(buf, `\u00`...)
   115  			buf = append(buf, hex[c>>4], hex[c&0xF])
   116  			i = j + 1
   117  			j = j + 1
   118  			continue
   119  
   120  		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
   121  			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
   122  			buf = append(buf, s[i:j]...)
   123  			buf = append(buf, `\u00`...)
   124  			buf = append(buf, hex[c>>4], hex[c&0xF])
   125  			i = j + 1
   126  			j = j + 1
   127  			continue
   128  		}
   129  		state, size := decodeRuneInString(s[j:])
   130  		switch state {
   131  		case runeErrorState:
   132  			buf = append(buf, s[i:j]...)
   133  			buf = append(buf, `\ufffd`...)
   134  			i = j + 1
   135  			j = j + 1
   136  			continue
   137  			// U+2028 is LINE SEPARATOR.
   138  			// U+2029 is PARAGRAPH SEPARATOR.
   139  			// They are both technically valid characters in JSON strings,
   140  			// but don't work in JSONP, which has to be evaluated as JavaScript,
   141  			// and can lead to security holes there. It is valid JSON to
   142  			// escape them, so we do so unconditionally.
   143  			// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
   144  		case lineSepState:
   145  			buf = append(buf, s[i:j]...)
   146  			buf = append(buf, `\u2028`...)
   147  			i = j + 3
   148  			j = j + 3
   149  			continue
   150  		case paragraphSepState:
   151  			buf = append(buf, s[i:j]...)
   152  			buf = append(buf, `\u2029`...)
   153  			i = j + 3
   154  			j = j + 3
   155  			continue
   156  		}
   157  		j += size
   158  	}
   159  
   160  	return append(append(buf, s[i:]...), '"')
   161  }
   162  
   163  func appendHTMLString(buf []byte, s string) []byte {
   164  	valLen := len(s)
   165  	if valLen == 0 {
   166  		return append(buf, `""`...)
   167  	}
   168  	buf = append(buf, '"')
   169  	var (
   170  		i, j int
   171  	)
   172  	if valLen >= 8 {
   173  		chunks := stringToUint64Slice(s)
   174  		for _, n := range chunks {
   175  			// combine masks before checking for the MSB of each byte. We include
   176  			// `n` in the mask to check whether any of the *input* byte MSBs were
   177  			// set (i.e. the byte was outside the ASCII range).
   178  			mask := n | (n - (lsb * 0x20)) |
   179  				((n ^ (lsb * '"')) - lsb) |
   180  				((n ^ (lsb * '\\')) - lsb) |
   181  				((n ^ (lsb * '<')) - lsb) |
   182  				((n ^ (lsb * '>')) - lsb) |
   183  				((n ^ (lsb * '&')) - lsb)
   184  			if (mask & msb) != 0 {
   185  				j = bits.TrailingZeros64(mask&msb) / 8
   186  				goto ESCAPE_END
   187  			}
   188  		}
   189  		for i := len(chunks) * 8; i < valLen; i++ {
   190  			if needEscapeHTML[s[i]] {
   191  				j = i
   192  				goto ESCAPE_END
   193  			}
   194  		}
   195  		// no found any escape characters.
   196  		return append(append(buf, s...), '"')
   197  	}
   198  ESCAPE_END:
   199  	for j < valLen {
   200  		c := s[j]
   201  
   202  		if !needEscapeHTML[c] {
   203  			// fast path: most of the time, printable ascii characters are used
   204  			j++
   205  			continue
   206  		}
   207  
   208  		switch c {
   209  		case '\\', '"':
   210  			buf = append(buf, s[i:j]...)
   211  			buf = append(buf, '\\', c)
   212  			i = j + 1
   213  			j = j + 1
   214  			continue
   215  
   216  		case '\n':
   217  			buf = append(buf, s[i:j]...)
   218  			buf = append(buf, '\\', 'n')
   219  			i = j + 1
   220  			j = j + 1
   221  			continue
   222  
   223  		case '\r':
   224  			buf = append(buf, s[i:j]...)
   225  			buf = append(buf, '\\', 'r')
   226  			i = j + 1
   227  			j = j + 1
   228  			continue
   229  
   230  		case '\t':
   231  			buf = append(buf, s[i:j]...)
   232  			buf = append(buf, '\\', 't')
   233  			i = j + 1
   234  			j = j + 1
   235  			continue
   236  
   237  		case '<', '>', '&':
   238  			buf = append(buf, s[i:j]...)
   239  			buf = append(buf, `\u00`...)
   240  			buf = append(buf, hex[c>>4], hex[c&0xF])
   241  			i = j + 1
   242  			j = j + 1
   243  			continue
   244  
   245  		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
   246  			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
   247  			buf = append(buf, s[i:j]...)
   248  			buf = append(buf, `\u00`...)
   249  			buf = append(buf, hex[c>>4], hex[c&0xF])
   250  			i = j + 1
   251  			j = j + 1
   252  			continue
   253  		}
   254  		j++
   255  	}
   256  
   257  	return append(append(buf, s[i:]...), '"')
   258  }
   259  
   260  func appendNormalizedString(buf []byte, s string) []byte {
   261  	valLen := len(s)
   262  	if valLen == 0 {
   263  		return append(buf, `""`...)
   264  	}
   265  	buf = append(buf, '"')
   266  	var (
   267  		i, j int
   268  	)
   269  	if valLen >= 8 {
   270  		chunks := stringToUint64Slice(s)
   271  		for _, n := range chunks {
   272  			// combine masks before checking for the MSB of each byte. We include
   273  			// `n` in the mask to check whether any of the *input* byte MSBs were
   274  			// set (i.e. the byte was outside the ASCII range).
   275  			mask := n | (n - (lsb * 0x20)) |
   276  				((n ^ (lsb * '"')) - lsb) |
   277  				((n ^ (lsb * '\\')) - lsb)
   278  			if (mask & msb) != 0 {
   279  				j = bits.TrailingZeros64(mask&msb) / 8
   280  				goto ESCAPE_END
   281  			}
   282  		}
   283  		valLen := len(s)
   284  		for i := len(chunks) * 8; i < valLen; i++ {
   285  			if needEscapeNormalizeUTF8[s[i]] {
   286  				j = i
   287  				goto ESCAPE_END
   288  			}
   289  		}
   290  		return append(append(buf, s...), '"')
   291  	}
   292  ESCAPE_END:
   293  	for j < valLen {
   294  		c := s[j]
   295  
   296  		if !needEscapeNormalizeUTF8[c] {
   297  			// fast path: most of the time, printable ascii characters are used
   298  			j++
   299  			continue
   300  		}
   301  
   302  		switch c {
   303  		case '\\', '"':
   304  			buf = append(buf, s[i:j]...)
   305  			buf = append(buf, '\\', c)
   306  			i = j + 1
   307  			j = j + 1
   308  			continue
   309  
   310  		case '\n':
   311  			buf = append(buf, s[i:j]...)
   312  			buf = append(buf, '\\', 'n')
   313  			i = j + 1
   314  			j = j + 1
   315  			continue
   316  
   317  		case '\r':
   318  			buf = append(buf, s[i:j]...)
   319  			buf = append(buf, '\\', 'r')
   320  			i = j + 1
   321  			j = j + 1
   322  			continue
   323  
   324  		case '\t':
   325  			buf = append(buf, s[i:j]...)
   326  			buf = append(buf, '\\', 't')
   327  			i = j + 1
   328  			j = j + 1
   329  			continue
   330  
   331  		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
   332  			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
   333  			buf = append(buf, s[i:j]...)
   334  			buf = append(buf, `\u00`...)
   335  			buf = append(buf, hex[c>>4], hex[c&0xF])
   336  			i = j + 1
   337  			j = j + 1
   338  			continue
   339  		}
   340  
   341  		state, size := decodeRuneInString(s[j:])
   342  		switch state {
   343  		case runeErrorState:
   344  			buf = append(buf, s[i:j]...)
   345  			buf = append(buf, `\ufffd`...)
   346  			i = j + 1
   347  			j = j + 1
   348  			continue
   349  			// U+2028 is LINE SEPARATOR.
   350  			// U+2029 is PARAGRAPH SEPARATOR.
   351  			// They are both technically valid characters in JSON strings,
   352  			// but don't work in JSONP, which has to be evaluated as JavaScript,
   353  			// and can lead to security holes there. It is valid JSON to
   354  			// escape them, so we do so unconditionally.
   355  			// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
   356  		case lineSepState:
   357  			buf = append(buf, s[i:j]...)
   358  			buf = append(buf, `\u2028`...)
   359  			i = j + 3
   360  			j = j + 3
   361  			continue
   362  		case paragraphSepState:
   363  			buf = append(buf, s[i:j]...)
   364  			buf = append(buf, `\u2029`...)
   365  			i = j + 3
   366  			j = j + 3
   367  			continue
   368  		}
   369  		j += size
   370  	}
   371  
   372  	return append(append(buf, s[i:]...), '"')
   373  }
   374  
   375  func appendString(buf []byte, s string) []byte {
   376  	valLen := len(s)
   377  	if valLen == 0 {
   378  		return append(buf, `""`...)
   379  	}
   380  	buf = append(buf, '"')
   381  	var (
   382  		i, j int
   383  	)
   384  	if valLen >= 8 {
   385  		chunks := stringToUint64Slice(s)
   386  		for _, n := range chunks {
   387  			// combine masks before checking for the MSB of each byte. We include
   388  			// `n` in the mask to check whether any of the *input* byte MSBs were
   389  			// set (i.e. the byte was outside the ASCII range).
   390  			mask := n | (n - (lsb * 0x20)) |
   391  				((n ^ (lsb * '"')) - lsb) |
   392  				((n ^ (lsb * '\\')) - lsb)
   393  			if (mask & msb) != 0 {
   394  				j = bits.TrailingZeros64(mask&msb) / 8
   395  				goto ESCAPE_END
   396  			}
   397  		}
   398  		valLen := len(s)
   399  		for i := len(chunks) * 8; i < valLen; i++ {
   400  			if needEscape[s[i]] {
   401  				j = i
   402  				goto ESCAPE_END
   403  			}
   404  		}
   405  		return append(append(buf, s...), '"')
   406  	}
   407  ESCAPE_END:
   408  	for j < valLen {
   409  		c := s[j]
   410  
   411  		if !needEscape[c] {
   412  			// fast path: most of the time, printable ascii characters are used
   413  			j++
   414  			continue
   415  		}
   416  
   417  		switch c {
   418  		case '\\', '"':
   419  			buf = append(buf, s[i:j]...)
   420  			buf = append(buf, '\\', c)
   421  			i = j + 1
   422  			j = j + 1
   423  			continue
   424  
   425  		case '\n':
   426  			buf = append(buf, s[i:j]...)
   427  			buf = append(buf, '\\', 'n')
   428  			i = j + 1
   429  			j = j + 1
   430  			continue
   431  
   432  		case '\r':
   433  			buf = append(buf, s[i:j]...)
   434  			buf = append(buf, '\\', 'r')
   435  			i = j + 1
   436  			j = j + 1
   437  			continue
   438  
   439  		case '\t':
   440  			buf = append(buf, s[i:j]...)
   441  			buf = append(buf, '\\', 't')
   442  			i = j + 1
   443  			j = j + 1
   444  			continue
   445  
   446  		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
   447  			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
   448  			buf = append(buf, s[i:j]...)
   449  			buf = append(buf, `\u00`...)
   450  			buf = append(buf, hex[c>>4], hex[c&0xF])
   451  			i = j + 1
   452  			j = j + 1
   453  			continue
   454  		}
   455  		j++
   456  	}
   457  
   458  	return append(append(buf, s[i:]...), '"')
   459  }