github.com/3JoB/go-json@v0.10.4/internal/encoder/string.go (about) 1 package encoder 2 3 import ( 4 "math/bits" 5 "unsafe" 6 7 "github.com/3JoB/go-reflect" 8 ) 9 10 const ( 11 lsb = 0x0101010101010101 12 msb = 0x8080808080808080 13 ) 14 15 var hex = "0123456789abcdef" 16 17 //nolint:govet 18 func stringToUint64Slice(s string) []uint64 { 19 return *(*[]uint64)(unsafe.Pointer(&reflect.SliceHeader{ 20 Data: ((*reflect.StringHeader)(unsafe.Pointer(&s))).Data, 21 Len: len(s) / 8, 22 Cap: len(s) / 8, 23 })) 24 } 25 26 func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte { 27 if ctx.Option.Flag&HTMLEscapeOption != 0 { 28 if ctx.Option.Flag&NormalizeUTF8Option != 0 { 29 return appendNormalizedHTMLString(buf, s) 30 } 31 return appendHTMLString(buf, s) 32 } 33 if ctx.Option.Flag&NormalizeUTF8Option != 0 { 34 return appendNormalizedString(buf, s) 35 } 36 return appendString(buf, s) 37 } 38 39 func appendNormalizedHTMLString(buf []byte, s string) []byte { 40 valLen := len(s) 41 if valLen == 0 { 42 return append(buf, `""`...) 43 } 44 buf = append(buf, '"') 45 var ( 46 i, j int 47 ) 48 if valLen >= 8 { 49 chunks := stringToUint64Slice(s) 50 for _, n := range chunks { 51 // combine masks before checking for the MSB of each byte. We include 52 // `n` in the mask to check whether any of the *input* byte MSBs were 53 // set (i.e. the byte was outside the ASCII range). 54 mask := n | (n - (lsb * 0x20)) | 55 ((n ^ (lsb * '"')) - lsb) | 56 ((n ^ (lsb * '\\')) - lsb) | 57 ((n ^ (lsb * '<')) - lsb) | 58 ((n ^ (lsb * '>')) - lsb) | 59 ((n ^ (lsb * '&')) - lsb) 60 if (mask & msb) != 0 { 61 j = bits.TrailingZeros64(mask&msb) / 8 62 goto ESCAPE_END 63 } 64 } 65 for i := len(chunks) * 8; i < valLen; i++ { 66 if needEscapeHTMLNormalizeUTF8[s[i]] { 67 j = i 68 goto ESCAPE_END 69 } 70 } 71 // no found any escape characters. 72 return append(append(buf, s...), '"') 73 } 74 ESCAPE_END: 75 for j < valLen { 76 c := s[j] 77 78 if !needEscapeHTMLNormalizeUTF8[c] { 79 // fast path: most of the time, printable ascii characters are used 80 j++ 81 continue 82 } 83 84 switch c { 85 case '\\', '"': 86 buf = append(buf, s[i:j]...) 87 buf = append(buf, '\\', c) 88 i = j + 1 89 j = j + 1 90 continue 91 92 case '\n': 93 buf = append(buf, s[i:j]...) 94 buf = append(buf, '\\', 'n') 95 i = j + 1 96 j = j + 1 97 continue 98 99 case '\r': 100 buf = append(buf, s[i:j]...) 101 buf = append(buf, '\\', 'r') 102 i = j + 1 103 j = j + 1 104 continue 105 106 case '\t': 107 buf = append(buf, s[i:j]...) 108 buf = append(buf, '\\', 't') 109 i = j + 1 110 j = j + 1 111 continue 112 113 case '<', '>', '&': 114 buf = append(buf, s[i:j]...) 115 buf = append(buf, `\u00`...) 116 buf = append(buf, hex[c>>4], hex[c&0xF]) 117 i = j + 1 118 j = j + 1 119 continue 120 121 case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 122 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F 123 buf = append(buf, s[i:j]...) 124 buf = append(buf, `\u00`...) 125 buf = append(buf, hex[c>>4], hex[c&0xF]) 126 i = j + 1 127 j = j + 1 128 continue 129 } 130 state, size := decodeRuneInString(s[j:]) 131 switch state { 132 case runeErrorState: 133 buf = append(buf, s[i:j]...) 134 buf = append(buf, `\ufffd`...) 135 i = j + 1 136 j = j + 1 137 continue 138 // U+2028 is LINE SEPARATOR. 139 // U+2029 is PARAGRAPH SEPARATOR. 140 // They are both technically valid characters in JSON strings, 141 // but don't work in JSONP, which has to be evaluated as JavaScript, 142 // and can lead to security holes there. It is valid JSON to 143 // escape them, so we do so unconditionally. 144 // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion. 145 case lineSepState: 146 buf = append(buf, s[i:j]...) 147 buf = append(buf, `\u2028`...) 148 i = j + 3 149 j = j + 3 150 continue 151 case paragraphSepState: 152 buf = append(buf, s[i:j]...) 153 buf = append(buf, `\u2029`...) 154 i = j + 3 155 j = j + 3 156 continue 157 } 158 j += size 159 } 160 161 return append(append(buf, s[i:]...), '"') 162 } 163 164 func appendHTMLString(buf []byte, s string) []byte { 165 valLen := len(s) 166 if valLen == 0 { 167 return append(buf, `""`...) 168 } 169 buf = append(buf, '"') 170 var ( 171 i, j int 172 ) 173 if valLen >= 8 { 174 chunks := stringToUint64Slice(s) 175 for _, n := range chunks { 176 // combine masks before checking for the MSB of each byte. We include 177 // `n` in the mask to check whether any of the *input* byte MSBs were 178 // set (i.e. the byte was outside the ASCII range). 179 mask := n | (n - (lsb * 0x20)) | 180 ((n ^ (lsb * '"')) - lsb) | 181 ((n ^ (lsb * '\\')) - lsb) | 182 ((n ^ (lsb * '<')) - lsb) | 183 ((n ^ (lsb * '>')) - lsb) | 184 ((n ^ (lsb * '&')) - lsb) 185 if (mask & msb) != 0 { 186 j = bits.TrailingZeros64(mask&msb) / 8 187 goto ESCAPE_END 188 } 189 } 190 for i := len(chunks) * 8; i < valLen; i++ { 191 if needEscapeHTML[s[i]] { 192 j = i 193 goto ESCAPE_END 194 } 195 } 196 // no found any escape characters. 197 return append(append(buf, s...), '"') 198 } 199 ESCAPE_END: 200 for j < valLen { 201 c := s[j] 202 203 if !needEscapeHTML[c] { 204 // fast path: most of the time, printable ascii characters are used 205 j++ 206 continue 207 } 208 209 switch c { 210 case '\\', '"': 211 buf = append(buf, s[i:j]...) 212 buf = append(buf, '\\', c) 213 i = j + 1 214 j = j + 1 215 continue 216 217 case '\n': 218 buf = append(buf, s[i:j]...) 219 buf = append(buf, '\\', 'n') 220 i = j + 1 221 j = j + 1 222 continue 223 224 case '\r': 225 buf = append(buf, s[i:j]...) 226 buf = append(buf, '\\', 'r') 227 i = j + 1 228 j = j + 1 229 continue 230 231 case '\t': 232 buf = append(buf, s[i:j]...) 233 buf = append(buf, '\\', 't') 234 i = j + 1 235 j = j + 1 236 continue 237 238 case '<', '>', '&': 239 buf = append(buf, s[i:j]...) 240 buf = append(buf, `\u00`...) 241 buf = append(buf, hex[c>>4], hex[c&0xF]) 242 i = j + 1 243 j = j + 1 244 continue 245 246 case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 247 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F 248 buf = append(buf, s[i:j]...) 249 buf = append(buf, `\u00`...) 250 buf = append(buf, hex[c>>4], hex[c&0xF]) 251 i = j + 1 252 j = j + 1 253 continue 254 } 255 j++ 256 } 257 258 return append(append(buf, s[i:]...), '"') 259 } 260 261 func appendNormalizedString(buf []byte, s string) []byte { 262 valLen := len(s) 263 if valLen == 0 { 264 return append(buf, `""`...) 265 } 266 buf = append(buf, '"') 267 var ( 268 i, j int 269 ) 270 if valLen >= 8 { 271 chunks := stringToUint64Slice(s) 272 for _, n := range chunks { 273 // combine masks before checking for the MSB of each byte. We include 274 // `n` in the mask to check whether any of the *input* byte MSBs were 275 // set (i.e. the byte was outside the ASCII range). 276 mask := n | (n - (lsb * 0x20)) | 277 ((n ^ (lsb * '"')) - lsb) | 278 ((n ^ (lsb * '\\')) - lsb) 279 if (mask & msb) != 0 { 280 j = bits.TrailingZeros64(mask&msb) / 8 281 goto ESCAPE_END 282 } 283 } 284 valLen := len(s) 285 for i := len(chunks) * 8; i < valLen; i++ { 286 if needEscapeNormalizeUTF8[s[i]] { 287 j = i 288 goto ESCAPE_END 289 } 290 } 291 return append(append(buf, s...), '"') 292 } 293 ESCAPE_END: 294 for j < valLen { 295 c := s[j] 296 297 if !needEscapeNormalizeUTF8[c] { 298 // fast path: most of the time, printable ascii characters are used 299 j++ 300 continue 301 } 302 303 switch c { 304 case '\\', '"': 305 buf = append(buf, s[i:j]...) 306 buf = append(buf, '\\', c) 307 i = j + 1 308 j = j + 1 309 continue 310 311 case '\n': 312 buf = append(buf, s[i:j]...) 313 buf = append(buf, '\\', 'n') 314 i = j + 1 315 j = j + 1 316 continue 317 318 case '\r': 319 buf = append(buf, s[i:j]...) 320 buf = append(buf, '\\', 'r') 321 i = j + 1 322 j = j + 1 323 continue 324 325 case '\t': 326 buf = append(buf, s[i:j]...) 327 buf = append(buf, '\\', 't') 328 i = j + 1 329 j = j + 1 330 continue 331 332 case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 333 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F 334 buf = append(buf, s[i:j]...) 335 buf = append(buf, `\u00`...) 336 buf = append(buf, hex[c>>4], hex[c&0xF]) 337 i = j + 1 338 j = j + 1 339 continue 340 } 341 342 state, size := decodeRuneInString(s[j:]) 343 switch state { 344 case runeErrorState: 345 buf = append(buf, s[i:j]...) 346 buf = append(buf, `\ufffd`...) 347 i = j + 1 348 j = j + 1 349 continue 350 // U+2028 is LINE SEPARATOR. 351 // U+2029 is PARAGRAPH SEPARATOR. 352 // They are both technically valid characters in JSON strings, 353 // but don't work in JSONP, which has to be evaluated as JavaScript, 354 // and can lead to security holes there. It is valid JSON to 355 // escape them, so we do so unconditionally. 356 // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion. 357 case lineSepState: 358 buf = append(buf, s[i:j]...) 359 buf = append(buf, `\u2028`...) 360 i = j + 3 361 j = j + 3 362 continue 363 case paragraphSepState: 364 buf = append(buf, s[i:j]...) 365 buf = append(buf, `\u2029`...) 366 i = j + 3 367 j = j + 3 368 continue 369 } 370 j += size 371 } 372 373 return append(append(buf, s[i:]...), '"') 374 } 375 376 func appendString(buf []byte, s string) []byte { 377 valLen := len(s) 378 if valLen == 0 { 379 return append(buf, `""`...) 380 } 381 buf = append(buf, '"') 382 var ( 383 i, j int 384 ) 385 if valLen >= 8 { 386 chunks := stringToUint64Slice(s) 387 for _, n := range chunks { 388 // combine masks before checking for the MSB of each byte. We include 389 // `n` in the mask to check whether any of the *input* byte MSBs were 390 // set (i.e. the byte was outside the ASCII range). 391 mask := n | (n - (lsb * 0x20)) | 392 ((n ^ (lsb * '"')) - lsb) | 393 ((n ^ (lsb * '\\')) - lsb) 394 if (mask & msb) != 0 { 395 j = bits.TrailingZeros64(mask&msb) / 8 396 goto ESCAPE_END 397 } 398 } 399 valLen := len(s) 400 for i := len(chunks) * 8; i < valLen; i++ { 401 if needEscape[s[i]] { 402 j = i 403 goto ESCAPE_END 404 } 405 } 406 return append(append(buf, s...), '"') 407 } 408 ESCAPE_END: 409 for j < valLen { 410 c := s[j] 411 412 if !needEscape[c] { 413 // fast path: most of the time, printable ascii characters are used 414 j++ 415 continue 416 } 417 418 switch c { 419 case '\\', '"': 420 buf = append(buf, s[i:j]...) 421 buf = append(buf, '\\', c) 422 i = j + 1 423 j = j + 1 424 continue 425 426 case '\n': 427 buf = append(buf, s[i:j]...) 428 buf = append(buf, '\\', 'n') 429 i = j + 1 430 j = j + 1 431 continue 432 433 case '\r': 434 buf = append(buf, s[i:j]...) 435 buf = append(buf, '\\', 'r') 436 i = j + 1 437 j = j + 1 438 continue 439 440 case '\t': 441 buf = append(buf, s[i:j]...) 442 buf = append(buf, '\\', 't') 443 i = j + 1 444 j = j + 1 445 continue 446 447 case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 448 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F 449 buf = append(buf, s[i:j]...) 450 buf = append(buf, `\u00`...) 451 buf = append(buf, hex[c>>4], hex[c&0xF]) 452 i = j + 1 453 j = j + 1 454 continue 455 } 456 j++ 457 } 458 459 return append(append(buf, s[i:]...), '"') 460 }