github.com/goccy/go-json@v0.10.3-0.20240509105655-5e2ae3f23c1d/internal/encoder/string.go

github.com/goccy/go-json@v0.10.3-0.20240509105655-5e2ae3f23c1d/internal/encoder/string.go (about)

     1  // This file's string-processing code is inspired by https://github.com/segmentio/encoding.
     2  // The license notation is as follows.
     3  //
     4  // # MIT License
     5  //
     6  // Copyright (c) 2019 Segment.io, Inc.
     7  //
     8  // Permission is hereby granted, free of charge, to any person obtaining a copy
     9  // of this software and associated documentation files (the "Software"), to deal
    10  // in the Software without restriction, including without limitation the rights
    11  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    12  // copies of the Software, and to permit persons to whom the Software is
    13  // furnished to do so, subject to the following conditions:
    14  //
    15  // The above copyright notice and this permission notice shall be included in all
    16  // copies or substantial portions of the Software.
    17  //
    18  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    19  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    20  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    21  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    22  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    23  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    24  // SOFTWARE.
    25  package encoder
    26  
    27  import (
    28  	"math/bits"
    29  	"reflect"
    30  	"unsafe"
    31  )
    32  
    33  const (
    34  	lsb = 0x0101010101010101
    35  	msb = 0x8080808080808080
    36  )
    37  
    38  var hex = "0123456789abcdef"
    39  
    40  //nolint:govet
    41  func stringToUint64Slice(s string) []uint64 {
    42  	return *(*[]uint64)(unsafe.Pointer(&reflect.SliceHeader{
    43  		Data: ((*reflect.StringHeader)(unsafe.Pointer(&s))).Data,
    44  		Len:  len(s) / 8,
    45  		Cap:  len(s) / 8,
    46  	}))
    47  }
    48  
    49  func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte {
    50  	if ctx.Option.Flag&HTMLEscapeOption != 0 {
    51  		if ctx.Option.Flag&NormalizeUTF8Option != 0 {
    52  			return appendNormalizedHTMLString(buf, s)
    53  		}
    54  		return appendHTMLString(buf, s)
    55  	}
    56  	if ctx.Option.Flag&NormalizeUTF8Option != 0 {
    57  		return appendNormalizedString(buf, s)
    58  	}
    59  	return appendString(buf, s)
    60  }
    61  
    62  func appendNormalizedHTMLString(buf []byte, s string) []byte {
    63  	valLen := len(s)
    64  	if valLen == 0 {
    65  		return append(buf, `""`...)
    66  	}
    67  	buf = append(buf, '"')
    68  	var (
    69  		i, j int
    70  	)
    71  	if valLen >= 8 {
    72  		chunks := stringToUint64Slice(s)
    73  		for _, n := range chunks {
    74  			// combine masks before checking for the MSB of each byte. We include
    75  			// `n` in the mask to check whether any of the *input* byte MSBs were
    76  			// set (i.e. the byte was outside the ASCII range).
    77  			mask := n | (n - (lsb * 0x20)) |
    78  				((n ^ (lsb * '"')) - lsb) |
    79  				((n ^ (lsb * '\\')) - lsb) |
    80  				((n ^ (lsb * '<')) - lsb) |
    81  				((n ^ (lsb * '>')) - lsb) |
    82  				((n ^ (lsb * '&')) - lsb)
    83  			if (mask & msb) != 0 {
    84  				j = bits.TrailingZeros64(mask&msb) / 8
    85  				goto ESCAPE_END
    86  			}
    87  		}
    88  		for i := len(chunks) * 8; i < valLen; i++ {
    89  			if needEscapeHTMLNormalizeUTF8[s[i]] {
    90  				j = i
    91  				goto ESCAPE_END
    92  			}
    93  		}
    94  		// no found any escape characters.
    95  		return append(append(buf, s...), '"')
    96  	}
    97  ESCAPE_END:
    98  	for j < valLen {
    99  		c := s[j]
   100  
   101  		if !needEscapeHTMLNormalizeUTF8[c] {
   102  			// fast path: most of the time, printable ascii characters are used
   103  			j++
   104  			continue
   105  		}
   106  
   107  		switch c {
   108  		case '\\', '"':
   109  			buf = append(buf, s[i:j]...)
   110  			buf = append(buf, '\\', c)
   111  			i = j + 1
   112  			j = j + 1
   113  			continue
   114  
   115  		case '\n':
   116  			buf = append(buf, s[i:j]...)
   117  			buf = append(buf, '\\', 'n')
   118  			i = j + 1
   119  			j = j + 1
   120  			continue
   121  
   122  		case '\r':
   123  			buf = append(buf, s[i:j]...)
   124  			buf = append(buf, '\\', 'r')
   125  			i = j + 1
   126  			j = j + 1
   127  			continue
   128  
   129  		case '\t':
   130  			buf = append(buf, s[i:j]...)
   131  			buf = append(buf, '\\', 't')
   132  			i = j + 1
   133  			j = j + 1
   134  			continue
   135  
   136  		case '<', '>', '&':
   137  			buf = append(buf, s[i:j]...)
   138  			buf = append(buf, `\u00`...)
   139  			buf = append(buf, hex[c>>4], hex[c&0xF])
   140  			i = j + 1
   141  			j = j + 1
   142  			continue
   143  
   144  		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
   145  			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
   146  			buf = append(buf, s[i:j]...)
   147  			buf = append(buf, `\u00`...)
   148  			buf = append(buf, hex[c>>4], hex[c&0xF])
   149  			i = j + 1
   150  			j = j + 1
   151  			continue
   152  		}
   153  		state, size := decodeRuneInString(s[j:])
   154  		switch state {
   155  		case runeErrorState:
   156  			buf = append(buf, s[i:j]...)
   157  			buf = append(buf, `\ufffd`...)
   158  			i = j + 1
   159  			j = j + 1
   160  			continue
   161  			// U+2028 is LINE SEPARATOR.
   162  			// U+2029 is PARAGRAPH SEPARATOR.
   163  			// They are both technically valid characters in JSON strings,
   164  			// but don't work in JSONP, which has to be evaluated as JavaScript,
   165  			// and can lead to security holes there. It is valid JSON to
   166  			// escape them, so we do so unconditionally.
   167  			// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
   168  		case lineSepState:
   169  			buf = append(buf, s[i:j]...)
   170  			buf = append(buf, `\u2028`...)
   171  			i = j + 3
   172  			j = j + 3
   173  			continue
   174  		case paragraphSepState:
   175  			buf = append(buf, s[i:j]...)
   176  			buf = append(buf, `\u2029`...)
   177  			i = j + 3
   178  			j = j + 3
   179  			continue
   180  		}
   181  		j += size
   182  	}
   183  
   184  	return append(append(buf, s[i:]...), '"')
   185  }
   186  
   187  func appendHTMLString(buf []byte, s string) []byte {
   188  	valLen := len(s)
   189  	if valLen == 0 {
   190  		return append(buf, `""`...)
   191  	}
   192  	buf = append(buf, '"')
   193  	var (
   194  		i, j int
   195  	)
   196  	if valLen >= 8 {
   197  		chunks := stringToUint64Slice(s)
   198  		for _, n := range chunks {
   199  			// combine masks before checking for the MSB of each byte. We include
   200  			// `n` in the mask to check whether any of the *input* byte MSBs were
   201  			// set (i.e. the byte was outside the ASCII range).
   202  			mask := n | (n - (lsb * 0x20)) |
   203  				((n ^ (lsb * '"')) - lsb) |
   204  				((n ^ (lsb * '\\')) - lsb) |
   205  				((n ^ (lsb * '<')) - lsb) |
   206  				((n ^ (lsb * '>')) - lsb) |
   207  				((n ^ (lsb * '&')) - lsb)
   208  			if (mask & msb) != 0 {
   209  				j = bits.TrailingZeros64(mask&msb) / 8
   210  				goto ESCAPE_END
   211  			}
   212  		}
   213  		for i := len(chunks) * 8; i < valLen; i++ {
   214  			if needEscapeHTML[s[i]] {
   215  				j = i
   216  				goto ESCAPE_END
   217  			}
   218  		}
   219  		// no found any escape characters.
   220  		return append(append(buf, s...), '"')
   221  	}
   222  ESCAPE_END:
   223  	for j < valLen {
   224  		c := s[j]
   225  
   226  		if !needEscapeHTML[c] {
   227  			// fast path: most of the time, printable ascii characters are used
   228  			j++
   229  			continue
   230  		}
   231  
   232  		switch c {
   233  		case '\\', '"':
   234  			buf = append(buf, s[i:j]...)
   235  			buf = append(buf, '\\', c)
   236  			i = j + 1
   237  			j = j + 1
   238  			continue
   239  
   240  		case '\n':
   241  			buf = append(buf, s[i:j]...)
   242  			buf = append(buf, '\\', 'n')
   243  			i = j + 1
   244  			j = j + 1
   245  			continue
   246  
   247  		case '\r':
   248  			buf = append(buf, s[i:j]...)
   249  			buf = append(buf, '\\', 'r')
   250  			i = j + 1
   251  			j = j + 1
   252  			continue
   253  
   254  		case '\t':
   255  			buf = append(buf, s[i:j]...)
   256  			buf = append(buf, '\\', 't')
   257  			i = j + 1
   258  			j = j + 1
   259  			continue
   260  
   261  		case '<', '>', '&':
   262  			buf = append(buf, s[i:j]...)
   263  			buf = append(buf, `\u00`...)
   264  			buf = append(buf, hex[c>>4], hex[c&0xF])
   265  			i = j + 1
   266  			j = j + 1
   267  			continue
   268  
   269  		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
   270  			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
   271  			buf = append(buf, s[i:j]...)
   272  			buf = append(buf, `\u00`...)
   273  			buf = append(buf, hex[c>>4], hex[c&0xF])
   274  			i = j + 1
   275  			j = j + 1
   276  			continue
   277  		}
   278  		j++
   279  	}
   280  
   281  	return append(append(buf, s[i:]...), '"')
   282  }
   283  
   284  func appendNormalizedString(buf []byte, s string) []byte {
   285  	valLen := len(s)
   286  	if valLen == 0 {
   287  		return append(buf, `""`...)
   288  	}
   289  	buf = append(buf, '"')
   290  	var (
   291  		i, j int
   292  	)
   293  	if valLen >= 8 {
   294  		chunks := stringToUint64Slice(s)
   295  		for _, n := range chunks {
   296  			// combine masks before checking for the MSB of each byte. We include
   297  			// `n` in the mask to check whether any of the *input* byte MSBs were
   298  			// set (i.e. the byte was outside the ASCII range).
   299  			mask := n | (n - (lsb * 0x20)) |
   300  				((n ^ (lsb * '"')) - lsb) |
   301  				((n ^ (lsb * '\\')) - lsb)
   302  			if (mask & msb) != 0 {
   303  				j = bits.TrailingZeros64(mask&msb) / 8
   304  				goto ESCAPE_END
   305  			}
   306  		}
   307  		valLen := len(s)
   308  		for i := len(chunks) * 8; i < valLen; i++ {
   309  			if needEscapeNormalizeUTF8[s[i]] {
   310  				j = i
   311  				goto ESCAPE_END
   312  			}
   313  		}
   314  		return append(append(buf, s...), '"')
   315  	}
   316  ESCAPE_END:
   317  	for j < valLen {
   318  		c := s[j]
   319  
   320  		if !needEscapeNormalizeUTF8[c] {
   321  			// fast path: most of the time, printable ascii characters are used
   322  			j++
   323  			continue
   324  		}
   325  
   326  		switch c {
   327  		case '\\', '"':
   328  			buf = append(buf, s[i:j]...)
   329  			buf = append(buf, '\\', c)
   330  			i = j + 1
   331  			j = j + 1
   332  			continue
   333  
   334  		case '\n':
   335  			buf = append(buf, s[i:j]...)
   336  			buf = append(buf, '\\', 'n')
   337  			i = j + 1
   338  			j = j + 1
   339  			continue
   340  
   341  		case '\r':
   342  			buf = append(buf, s[i:j]...)
   343  			buf = append(buf, '\\', 'r')
   344  			i = j + 1
   345  			j = j + 1
   346  			continue
   347  
   348  		case '\t':
   349  			buf = append(buf, s[i:j]...)
   350  			buf = append(buf, '\\', 't')
   351  			i = j + 1
   352  			j = j + 1
   353  			continue
   354  
   355  		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
   356  			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
   357  			buf = append(buf, s[i:j]...)
   358  			buf = append(buf, `\u00`...)
   359  			buf = append(buf, hex[c>>4], hex[c&0xF])
   360  			i = j + 1
   361  			j = j + 1
   362  			continue
   363  		}
   364  
   365  		state, size := decodeRuneInString(s[j:])
   366  		switch state {
   367  		case runeErrorState:
   368  			buf = append(buf, s[i:j]...)
   369  			buf = append(buf, `\ufffd`...)
   370  			i = j + 1
   371  			j = j + 1
   372  			continue
   373  			// U+2028 is LINE SEPARATOR.
   374  			// U+2029 is PARAGRAPH SEPARATOR.
   375  			// They are both technically valid characters in JSON strings,
   376  			// but don't work in JSONP, which has to be evaluated as JavaScript,
   377  			// and can lead to security holes there. It is valid JSON to
   378  			// escape them, so we do so unconditionally.
   379  			// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
   380  		case lineSepState:
   381  			buf = append(buf, s[i:j]...)
   382  			buf = append(buf, `\u2028`...)
   383  			i = j + 3
   384  			j = j + 3
   385  			continue
   386  		case paragraphSepState:
   387  			buf = append(buf, s[i:j]...)
   388  			buf = append(buf, `\u2029`...)
   389  			i = j + 3
   390  			j = j + 3
   391  			continue
   392  		}
   393  		j += size
   394  	}
   395  
   396  	return append(append(buf, s[i:]...), '"')
   397  }
   398  
   399  func appendString(buf []byte, s string) []byte {
   400  	valLen := len(s)
   401  	if valLen == 0 {
   402  		return append(buf, `""`...)
   403  	}
   404  	buf = append(buf, '"')
   405  	var (
   406  		i, j int
   407  	)
   408  	if valLen >= 8 {
   409  		chunks := stringToUint64Slice(s)
   410  		for _, n := range chunks {
   411  			// combine masks before checking for the MSB of each byte. We include
   412  			// `n` in the mask to check whether any of the *input* byte MSBs were
   413  			// set (i.e. the byte was outside the ASCII range).
   414  			mask := n | (n - (lsb * 0x20)) |
   415  				((n ^ (lsb * '"')) - lsb) |
   416  				((n ^ (lsb * '\\')) - lsb)
   417  			if (mask & msb) != 0 {
   418  				j = bits.TrailingZeros64(mask&msb) / 8
   419  				goto ESCAPE_END
   420  			}
   421  		}
   422  		valLen := len(s)
   423  		for i := len(chunks) * 8; i < valLen; i++ {
   424  			if needEscape[s[i]] {
   425  				j = i
   426  				goto ESCAPE_END
   427  			}
   428  		}
   429  		return append(append(buf, s...), '"')
   430  	}
   431  ESCAPE_END:
   432  	for j < valLen {
   433  		c := s[j]
   434  
   435  		if !needEscape[c] {
   436  			// fast path: most of the time, printable ascii characters are used
   437  			j++
   438  			continue
   439  		}
   440  
   441  		switch c {
   442  		case '\\', '"':
   443  			buf = append(buf, s[i:j]...)
   444  			buf = append(buf, '\\', c)
   445  			i = j + 1
   446  			j = j + 1
   447  			continue
   448  
   449  		case '\n':
   450  			buf = append(buf, s[i:j]...)
   451  			buf = append(buf, '\\', 'n')
   452  			i = j + 1
   453  			j = j + 1
   454  			continue
   455  
   456  		case '\r':
   457  			buf = append(buf, s[i:j]...)
   458  			buf = append(buf, '\\', 'r')
   459  			i = j + 1
   460  			j = j + 1
   461  			continue
   462  
   463  		case '\t':
   464  			buf = append(buf, s[i:j]...)
   465  			buf = append(buf, '\\', 't')
   466  			i = j + 1
   467  			j = j + 1
   468  			continue
   469  
   470  		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
   471  			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
   472  			buf = append(buf, s[i:j]...)
   473  			buf = append(buf, `\u00`...)
   474  			buf = append(buf, hex[c>>4], hex[c&0xF])
   475  			i = j + 1
   476  			j = j + 1
   477  			continue
   478  		}
   479  		j++
   480  	}
   481  
   482  	return append(append(buf, s[i:]...), '"')
   483  }