github.com/3JoB/go-json@v0.10.4/internal/encoder/string.go (about)

     1  package encoder
     2  
     3  import (
     4  	"math/bits"
     5  	"unsafe"
     6  
     7  	"github.com/3JoB/go-reflect"
     8  )
     9  
    10  const (
    11  	lsb = 0x0101010101010101
    12  	msb = 0x8080808080808080
    13  )
    14  
    15  var hex = "0123456789abcdef"
    16  
    17  //nolint:govet
    18  func stringToUint64Slice(s string) []uint64 {
    19  	return *(*[]uint64)(unsafe.Pointer(&reflect.SliceHeader{
    20  		Data: ((*reflect.StringHeader)(unsafe.Pointer(&s))).Data,
    21  		Len:  len(s) / 8,
    22  		Cap:  len(s) / 8,
    23  	}))
    24  }
    25  
    26  func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte {
    27  	if ctx.Option.Flag&HTMLEscapeOption != 0 {
    28  		if ctx.Option.Flag&NormalizeUTF8Option != 0 {
    29  			return appendNormalizedHTMLString(buf, s)
    30  		}
    31  		return appendHTMLString(buf, s)
    32  	}
    33  	if ctx.Option.Flag&NormalizeUTF8Option != 0 {
    34  		return appendNormalizedString(buf, s)
    35  	}
    36  	return appendString(buf, s)
    37  }
    38  
    39  func appendNormalizedHTMLString(buf []byte, s string) []byte {
    40  	valLen := len(s)
    41  	if valLen == 0 {
    42  		return append(buf, `""`...)
    43  	}
    44  	buf = append(buf, '"')
    45  	var (
    46  		i, j int
    47  	)
    48  	if valLen >= 8 {
    49  		chunks := stringToUint64Slice(s)
    50  		for _, n := range chunks {
    51  			// combine masks before checking for the MSB of each byte. We include
    52  			// `n` in the mask to check whether any of the *input* byte MSBs were
    53  			// set (i.e. the byte was outside the ASCII range).
    54  			mask := n | (n - (lsb * 0x20)) |
    55  				((n ^ (lsb * '"')) - lsb) |
    56  				((n ^ (lsb * '\\')) - lsb) |
    57  				((n ^ (lsb * '<')) - lsb) |
    58  				((n ^ (lsb * '>')) - lsb) |
    59  				((n ^ (lsb * '&')) - lsb)
    60  			if (mask & msb) != 0 {
    61  				j = bits.TrailingZeros64(mask&msb) / 8
    62  				goto ESCAPE_END
    63  			}
    64  		}
    65  		for i := len(chunks) * 8; i < valLen; i++ {
    66  			if needEscapeHTMLNormalizeUTF8[s[i]] {
    67  				j = i
    68  				goto ESCAPE_END
    69  			}
    70  		}
    71  		// no found any escape characters.
    72  		return append(append(buf, s...), '"')
    73  	}
    74  ESCAPE_END:
    75  	for j < valLen {
    76  		c := s[j]
    77  
    78  		if !needEscapeHTMLNormalizeUTF8[c] {
    79  			// fast path: most of the time, printable ascii characters are used
    80  			j++
    81  			continue
    82  		}
    83  
    84  		switch c {
    85  		case '\\', '"':
    86  			buf = append(buf, s[i:j]...)
    87  			buf = append(buf, '\\', c)
    88  			i = j + 1
    89  			j = j + 1
    90  			continue
    91  
    92  		case '\n':
    93  			buf = append(buf, s[i:j]...)
    94  			buf = append(buf, '\\', 'n')
    95  			i = j + 1
    96  			j = j + 1
    97  			continue
    98  
    99  		case '\r':
   100  			buf = append(buf, s[i:j]...)
   101  			buf = append(buf, '\\', 'r')
   102  			i = j + 1
   103  			j = j + 1
   104  			continue
   105  
   106  		case '\t':
   107  			buf = append(buf, s[i:j]...)
   108  			buf = append(buf, '\\', 't')
   109  			i = j + 1
   110  			j = j + 1
   111  			continue
   112  
   113  		case '<', '>', '&':
   114  			buf = append(buf, s[i:j]...)
   115  			buf = append(buf, `\u00`...)
   116  			buf = append(buf, hex[c>>4], hex[c&0xF])
   117  			i = j + 1
   118  			j = j + 1
   119  			continue
   120  
   121  		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
   122  			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
   123  			buf = append(buf, s[i:j]...)
   124  			buf = append(buf, `\u00`...)
   125  			buf = append(buf, hex[c>>4], hex[c&0xF])
   126  			i = j + 1
   127  			j = j + 1
   128  			continue
   129  		}
   130  		state, size := decodeRuneInString(s[j:])
   131  		switch state {
   132  		case runeErrorState:
   133  			buf = append(buf, s[i:j]...)
   134  			buf = append(buf, `\ufffd`...)
   135  			i = j + 1
   136  			j = j + 1
   137  			continue
   138  			// U+2028 is LINE SEPARATOR.
   139  			// U+2029 is PARAGRAPH SEPARATOR.
   140  			// They are both technically valid characters in JSON strings,
   141  			// but don't work in JSONP, which has to be evaluated as JavaScript,
   142  			// and can lead to security holes there. It is valid JSON to
   143  			// escape them, so we do so unconditionally.
   144  			// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
   145  		case lineSepState:
   146  			buf = append(buf, s[i:j]...)
   147  			buf = append(buf, `\u2028`...)
   148  			i = j + 3
   149  			j = j + 3
   150  			continue
   151  		case paragraphSepState:
   152  			buf = append(buf, s[i:j]...)
   153  			buf = append(buf, `\u2029`...)
   154  			i = j + 3
   155  			j = j + 3
   156  			continue
   157  		}
   158  		j += size
   159  	}
   160  
   161  	return append(append(buf, s[i:]...), '"')
   162  }
   163  
   164  func appendHTMLString(buf []byte, s string) []byte {
   165  	valLen := len(s)
   166  	if valLen == 0 {
   167  		return append(buf, `""`...)
   168  	}
   169  	buf = append(buf, '"')
   170  	var (
   171  		i, j int
   172  	)
   173  	if valLen >= 8 {
   174  		chunks := stringToUint64Slice(s)
   175  		for _, n := range chunks {
   176  			// combine masks before checking for the MSB of each byte. We include
   177  			// `n` in the mask to check whether any of the *input* byte MSBs were
   178  			// set (i.e. the byte was outside the ASCII range).
   179  			mask := n | (n - (lsb * 0x20)) |
   180  				((n ^ (lsb * '"')) - lsb) |
   181  				((n ^ (lsb * '\\')) - lsb) |
   182  				((n ^ (lsb * '<')) - lsb) |
   183  				((n ^ (lsb * '>')) - lsb) |
   184  				((n ^ (lsb * '&')) - lsb)
   185  			if (mask & msb) != 0 {
   186  				j = bits.TrailingZeros64(mask&msb) / 8
   187  				goto ESCAPE_END
   188  			}
   189  		}
   190  		for i := len(chunks) * 8; i < valLen; i++ {
   191  			if needEscapeHTML[s[i]] {
   192  				j = i
   193  				goto ESCAPE_END
   194  			}
   195  		}
   196  		// no found any escape characters.
   197  		return append(append(buf, s...), '"')
   198  	}
   199  ESCAPE_END:
   200  	for j < valLen {
   201  		c := s[j]
   202  
   203  		if !needEscapeHTML[c] {
   204  			// fast path: most of the time, printable ascii characters are used
   205  			j++
   206  			continue
   207  		}
   208  
   209  		switch c {
   210  		case '\\', '"':
   211  			buf = append(buf, s[i:j]...)
   212  			buf = append(buf, '\\', c)
   213  			i = j + 1
   214  			j = j + 1
   215  			continue
   216  
   217  		case '\n':
   218  			buf = append(buf, s[i:j]...)
   219  			buf = append(buf, '\\', 'n')
   220  			i = j + 1
   221  			j = j + 1
   222  			continue
   223  
   224  		case '\r':
   225  			buf = append(buf, s[i:j]...)
   226  			buf = append(buf, '\\', 'r')
   227  			i = j + 1
   228  			j = j + 1
   229  			continue
   230  
   231  		case '\t':
   232  			buf = append(buf, s[i:j]...)
   233  			buf = append(buf, '\\', 't')
   234  			i = j + 1
   235  			j = j + 1
   236  			continue
   237  
   238  		case '<', '>', '&':
   239  			buf = append(buf, s[i:j]...)
   240  			buf = append(buf, `\u00`...)
   241  			buf = append(buf, hex[c>>4], hex[c&0xF])
   242  			i = j + 1
   243  			j = j + 1
   244  			continue
   245  
   246  		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
   247  			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
   248  			buf = append(buf, s[i:j]...)
   249  			buf = append(buf, `\u00`...)
   250  			buf = append(buf, hex[c>>4], hex[c&0xF])
   251  			i = j + 1
   252  			j = j + 1
   253  			continue
   254  		}
   255  		j++
   256  	}
   257  
   258  	return append(append(buf, s[i:]...), '"')
   259  }
   260  
   261  func appendNormalizedString(buf []byte, s string) []byte {
   262  	valLen := len(s)
   263  	if valLen == 0 {
   264  		return append(buf, `""`...)
   265  	}
   266  	buf = append(buf, '"')
   267  	var (
   268  		i, j int
   269  	)
   270  	if valLen >= 8 {
   271  		chunks := stringToUint64Slice(s)
   272  		for _, n := range chunks {
   273  			// combine masks before checking for the MSB of each byte. We include
   274  			// `n` in the mask to check whether any of the *input* byte MSBs were
   275  			// set (i.e. the byte was outside the ASCII range).
   276  			mask := n | (n - (lsb * 0x20)) |
   277  				((n ^ (lsb * '"')) - lsb) |
   278  				((n ^ (lsb * '\\')) - lsb)
   279  			if (mask & msb) != 0 {
   280  				j = bits.TrailingZeros64(mask&msb) / 8
   281  				goto ESCAPE_END
   282  			}
   283  		}
   284  		valLen := len(s)
   285  		for i := len(chunks) * 8; i < valLen; i++ {
   286  			if needEscapeNormalizeUTF8[s[i]] {
   287  				j = i
   288  				goto ESCAPE_END
   289  			}
   290  		}
   291  		return append(append(buf, s...), '"')
   292  	}
   293  ESCAPE_END:
   294  	for j < valLen {
   295  		c := s[j]
   296  
   297  		if !needEscapeNormalizeUTF8[c] {
   298  			// fast path: most of the time, printable ascii characters are used
   299  			j++
   300  			continue
   301  		}
   302  
   303  		switch c {
   304  		case '\\', '"':
   305  			buf = append(buf, s[i:j]...)
   306  			buf = append(buf, '\\', c)
   307  			i = j + 1
   308  			j = j + 1
   309  			continue
   310  
   311  		case '\n':
   312  			buf = append(buf, s[i:j]...)
   313  			buf = append(buf, '\\', 'n')
   314  			i = j + 1
   315  			j = j + 1
   316  			continue
   317  
   318  		case '\r':
   319  			buf = append(buf, s[i:j]...)
   320  			buf = append(buf, '\\', 'r')
   321  			i = j + 1
   322  			j = j + 1
   323  			continue
   324  
   325  		case '\t':
   326  			buf = append(buf, s[i:j]...)
   327  			buf = append(buf, '\\', 't')
   328  			i = j + 1
   329  			j = j + 1
   330  			continue
   331  
   332  		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
   333  			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
   334  			buf = append(buf, s[i:j]...)
   335  			buf = append(buf, `\u00`...)
   336  			buf = append(buf, hex[c>>4], hex[c&0xF])
   337  			i = j + 1
   338  			j = j + 1
   339  			continue
   340  		}
   341  
   342  		state, size := decodeRuneInString(s[j:])
   343  		switch state {
   344  		case runeErrorState:
   345  			buf = append(buf, s[i:j]...)
   346  			buf = append(buf, `\ufffd`...)
   347  			i = j + 1
   348  			j = j + 1
   349  			continue
   350  			// U+2028 is LINE SEPARATOR.
   351  			// U+2029 is PARAGRAPH SEPARATOR.
   352  			// They are both technically valid characters in JSON strings,
   353  			// but don't work in JSONP, which has to be evaluated as JavaScript,
   354  			// and can lead to security holes there. It is valid JSON to
   355  			// escape them, so we do so unconditionally.
   356  			// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
   357  		case lineSepState:
   358  			buf = append(buf, s[i:j]...)
   359  			buf = append(buf, `\u2028`...)
   360  			i = j + 3
   361  			j = j + 3
   362  			continue
   363  		case paragraphSepState:
   364  			buf = append(buf, s[i:j]...)
   365  			buf = append(buf, `\u2029`...)
   366  			i = j + 3
   367  			j = j + 3
   368  			continue
   369  		}
   370  		j += size
   371  	}
   372  
   373  	return append(append(buf, s[i:]...), '"')
   374  }
   375  
   376  func appendString(buf []byte, s string) []byte {
   377  	valLen := len(s)
   378  	if valLen == 0 {
   379  		return append(buf, `""`...)
   380  	}
   381  	buf = append(buf, '"')
   382  	var (
   383  		i, j int
   384  	)
   385  	if valLen >= 8 {
   386  		chunks := stringToUint64Slice(s)
   387  		for _, n := range chunks {
   388  			// combine masks before checking for the MSB of each byte. We include
   389  			// `n` in the mask to check whether any of the *input* byte MSBs were
   390  			// set (i.e. the byte was outside the ASCII range).
   391  			mask := n | (n - (lsb * 0x20)) |
   392  				((n ^ (lsb * '"')) - lsb) |
   393  				((n ^ (lsb * '\\')) - lsb)
   394  			if (mask & msb) != 0 {
   395  				j = bits.TrailingZeros64(mask&msb) / 8
   396  				goto ESCAPE_END
   397  			}
   398  		}
   399  		valLen := len(s)
   400  		for i := len(chunks) * 8; i < valLen; i++ {
   401  			if needEscape[s[i]] {
   402  				j = i
   403  				goto ESCAPE_END
   404  			}
   405  		}
   406  		return append(append(buf, s...), '"')
   407  	}
   408  ESCAPE_END:
   409  	for j < valLen {
   410  		c := s[j]
   411  
   412  		if !needEscape[c] {
   413  			// fast path: most of the time, printable ascii characters are used
   414  			j++
   415  			continue
   416  		}
   417  
   418  		switch c {
   419  		case '\\', '"':
   420  			buf = append(buf, s[i:j]...)
   421  			buf = append(buf, '\\', c)
   422  			i = j + 1
   423  			j = j + 1
   424  			continue
   425  
   426  		case '\n':
   427  			buf = append(buf, s[i:j]...)
   428  			buf = append(buf, '\\', 'n')
   429  			i = j + 1
   430  			j = j + 1
   431  			continue
   432  
   433  		case '\r':
   434  			buf = append(buf, s[i:j]...)
   435  			buf = append(buf, '\\', 'r')
   436  			i = j + 1
   437  			j = j + 1
   438  			continue
   439  
   440  		case '\t':
   441  			buf = append(buf, s[i:j]...)
   442  			buf = append(buf, '\\', 't')
   443  			i = j + 1
   444  			j = j + 1
   445  			continue
   446  
   447  		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
   448  			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
   449  			buf = append(buf, s[i:j]...)
   450  			buf = append(buf, `\u00`...)
   451  			buf = append(buf, hex[c>>4], hex[c&0xF])
   452  			i = j + 1
   453  			j = j + 1
   454  			continue
   455  		}
   456  		j++
   457  	}
   458  
   459  	return append(append(buf, s[i:]...), '"')
   460  }