// github.com/goccy/go-json@v0.10.3-0.20240509105655-5e2ae3f23c1d/internal/encoder/string.go
//
// This file's string processing code is inspired by https://github.com/segmentio/encoding.
// The license notation is as follows.
//
// # MIT License
//
// Copyright (c) 2019 Segment.io, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
25 package encoder 26 27 import ( 28 "math/bits" 29 "reflect" 30 "unsafe" 31 ) 32 33 const ( 34 lsb = 0x0101010101010101 35 msb = 0x8080808080808080 36 ) 37 38 var hex = "0123456789abcdef" 39 40 //nolint:govet 41 func stringToUint64Slice(s string) []uint64 { 42 return *(*[]uint64)(unsafe.Pointer(&reflect.SliceHeader{ 43 Data: ((*reflect.StringHeader)(unsafe.Pointer(&s))).Data, 44 Len: len(s) / 8, 45 Cap: len(s) / 8, 46 })) 47 } 48 49 func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte { 50 if ctx.Option.Flag&HTMLEscapeOption != 0 { 51 if ctx.Option.Flag&NormalizeUTF8Option != 0 { 52 return appendNormalizedHTMLString(buf, s) 53 } 54 return appendHTMLString(buf, s) 55 } 56 if ctx.Option.Flag&NormalizeUTF8Option != 0 { 57 return appendNormalizedString(buf, s) 58 } 59 return appendString(buf, s) 60 } 61 62 func appendNormalizedHTMLString(buf []byte, s string) []byte { 63 valLen := len(s) 64 if valLen == 0 { 65 return append(buf, `""`...) 66 } 67 buf = append(buf, '"') 68 var ( 69 i, j int 70 ) 71 if valLen >= 8 { 72 chunks := stringToUint64Slice(s) 73 for _, n := range chunks { 74 // combine masks before checking for the MSB of each byte. We include 75 // `n` in the mask to check whether any of the *input* byte MSBs were 76 // set (i.e. the byte was outside the ASCII range). 77 mask := n | (n - (lsb * 0x20)) | 78 ((n ^ (lsb * '"')) - lsb) | 79 ((n ^ (lsb * '\\')) - lsb) | 80 ((n ^ (lsb * '<')) - lsb) | 81 ((n ^ (lsb * '>')) - lsb) | 82 ((n ^ (lsb * '&')) - lsb) 83 if (mask & msb) != 0 { 84 j = bits.TrailingZeros64(mask&msb) / 8 85 goto ESCAPE_END 86 } 87 } 88 for i := len(chunks) * 8; i < valLen; i++ { 89 if needEscapeHTMLNormalizeUTF8[s[i]] { 90 j = i 91 goto ESCAPE_END 92 } 93 } 94 // no found any escape characters. 
95 return append(append(buf, s...), '"') 96 } 97 ESCAPE_END: 98 for j < valLen { 99 c := s[j] 100 101 if !needEscapeHTMLNormalizeUTF8[c] { 102 // fast path: most of the time, printable ascii characters are used 103 j++ 104 continue 105 } 106 107 switch c { 108 case '\\', '"': 109 buf = append(buf, s[i:j]...) 110 buf = append(buf, '\\', c) 111 i = j + 1 112 j = j + 1 113 continue 114 115 case '\n': 116 buf = append(buf, s[i:j]...) 117 buf = append(buf, '\\', 'n') 118 i = j + 1 119 j = j + 1 120 continue 121 122 case '\r': 123 buf = append(buf, s[i:j]...) 124 buf = append(buf, '\\', 'r') 125 i = j + 1 126 j = j + 1 127 continue 128 129 case '\t': 130 buf = append(buf, s[i:j]...) 131 buf = append(buf, '\\', 't') 132 i = j + 1 133 j = j + 1 134 continue 135 136 case '<', '>', '&': 137 buf = append(buf, s[i:j]...) 138 buf = append(buf, `\u00`...) 139 buf = append(buf, hex[c>>4], hex[c&0xF]) 140 i = j + 1 141 j = j + 1 142 continue 143 144 case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 145 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F 146 buf = append(buf, s[i:j]...) 147 buf = append(buf, `\u00`...) 148 buf = append(buf, hex[c>>4], hex[c&0xF]) 149 i = j + 1 150 j = j + 1 151 continue 152 } 153 state, size := decodeRuneInString(s[j:]) 154 switch state { 155 case runeErrorState: 156 buf = append(buf, s[i:j]...) 157 buf = append(buf, `\ufffd`...) 158 i = j + 1 159 j = j + 1 160 continue 161 // U+2028 is LINE SEPARATOR. 162 // U+2029 is PARAGRAPH SEPARATOR. 163 // They are both technically valid characters in JSON strings, 164 // but don't work in JSONP, which has to be evaluated as JavaScript, 165 // and can lead to security holes there. It is valid JSON to 166 // escape them, so we do so unconditionally. 167 // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion. 168 case lineSepState: 169 buf = append(buf, s[i:j]...) 
170 buf = append(buf, `\u2028`...) 171 i = j + 3 172 j = j + 3 173 continue 174 case paragraphSepState: 175 buf = append(buf, s[i:j]...) 176 buf = append(buf, `\u2029`...) 177 i = j + 3 178 j = j + 3 179 continue 180 } 181 j += size 182 } 183 184 return append(append(buf, s[i:]...), '"') 185 } 186 187 func appendHTMLString(buf []byte, s string) []byte { 188 valLen := len(s) 189 if valLen == 0 { 190 return append(buf, `""`...) 191 } 192 buf = append(buf, '"') 193 var ( 194 i, j int 195 ) 196 if valLen >= 8 { 197 chunks := stringToUint64Slice(s) 198 for _, n := range chunks { 199 // combine masks before checking for the MSB of each byte. We include 200 // `n` in the mask to check whether any of the *input* byte MSBs were 201 // set (i.e. the byte was outside the ASCII range). 202 mask := n | (n - (lsb * 0x20)) | 203 ((n ^ (lsb * '"')) - lsb) | 204 ((n ^ (lsb * '\\')) - lsb) | 205 ((n ^ (lsb * '<')) - lsb) | 206 ((n ^ (lsb * '>')) - lsb) | 207 ((n ^ (lsb * '&')) - lsb) 208 if (mask & msb) != 0 { 209 j = bits.TrailingZeros64(mask&msb) / 8 210 goto ESCAPE_END 211 } 212 } 213 for i := len(chunks) * 8; i < valLen; i++ { 214 if needEscapeHTML[s[i]] { 215 j = i 216 goto ESCAPE_END 217 } 218 } 219 // no found any escape characters. 220 return append(append(buf, s...), '"') 221 } 222 ESCAPE_END: 223 for j < valLen { 224 c := s[j] 225 226 if !needEscapeHTML[c] { 227 // fast path: most of the time, printable ascii characters are used 228 j++ 229 continue 230 } 231 232 switch c { 233 case '\\', '"': 234 buf = append(buf, s[i:j]...) 235 buf = append(buf, '\\', c) 236 i = j + 1 237 j = j + 1 238 continue 239 240 case '\n': 241 buf = append(buf, s[i:j]...) 242 buf = append(buf, '\\', 'n') 243 i = j + 1 244 j = j + 1 245 continue 246 247 case '\r': 248 buf = append(buf, s[i:j]...) 249 buf = append(buf, '\\', 'r') 250 i = j + 1 251 j = j + 1 252 continue 253 254 case '\t': 255 buf = append(buf, s[i:j]...) 
256 buf = append(buf, '\\', 't') 257 i = j + 1 258 j = j + 1 259 continue 260 261 case '<', '>', '&': 262 buf = append(buf, s[i:j]...) 263 buf = append(buf, `\u00`...) 264 buf = append(buf, hex[c>>4], hex[c&0xF]) 265 i = j + 1 266 j = j + 1 267 continue 268 269 case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 270 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F 271 buf = append(buf, s[i:j]...) 272 buf = append(buf, `\u00`...) 273 buf = append(buf, hex[c>>4], hex[c&0xF]) 274 i = j + 1 275 j = j + 1 276 continue 277 } 278 j++ 279 } 280 281 return append(append(buf, s[i:]...), '"') 282 } 283 284 func appendNormalizedString(buf []byte, s string) []byte { 285 valLen := len(s) 286 if valLen == 0 { 287 return append(buf, `""`...) 288 } 289 buf = append(buf, '"') 290 var ( 291 i, j int 292 ) 293 if valLen >= 8 { 294 chunks := stringToUint64Slice(s) 295 for _, n := range chunks { 296 // combine masks before checking for the MSB of each byte. We include 297 // `n` in the mask to check whether any of the *input* byte MSBs were 298 // set (i.e. the byte was outside the ASCII range). 299 mask := n | (n - (lsb * 0x20)) | 300 ((n ^ (lsb * '"')) - lsb) | 301 ((n ^ (lsb * '\\')) - lsb) 302 if (mask & msb) != 0 { 303 j = bits.TrailingZeros64(mask&msb) / 8 304 goto ESCAPE_END 305 } 306 } 307 valLen := len(s) 308 for i := len(chunks) * 8; i < valLen; i++ { 309 if needEscapeNormalizeUTF8[s[i]] { 310 j = i 311 goto ESCAPE_END 312 } 313 } 314 return append(append(buf, s...), '"') 315 } 316 ESCAPE_END: 317 for j < valLen { 318 c := s[j] 319 320 if !needEscapeNormalizeUTF8[c] { 321 // fast path: most of the time, printable ascii characters are used 322 j++ 323 continue 324 } 325 326 switch c { 327 case '\\', '"': 328 buf = append(buf, s[i:j]...) 329 buf = append(buf, '\\', c) 330 i = j + 1 331 j = j + 1 332 continue 333 334 case '\n': 335 buf = append(buf, s[i:j]...) 
336 buf = append(buf, '\\', 'n') 337 i = j + 1 338 j = j + 1 339 continue 340 341 case '\r': 342 buf = append(buf, s[i:j]...) 343 buf = append(buf, '\\', 'r') 344 i = j + 1 345 j = j + 1 346 continue 347 348 case '\t': 349 buf = append(buf, s[i:j]...) 350 buf = append(buf, '\\', 't') 351 i = j + 1 352 j = j + 1 353 continue 354 355 case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 356 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F 357 buf = append(buf, s[i:j]...) 358 buf = append(buf, `\u00`...) 359 buf = append(buf, hex[c>>4], hex[c&0xF]) 360 i = j + 1 361 j = j + 1 362 continue 363 } 364 365 state, size := decodeRuneInString(s[j:]) 366 switch state { 367 case runeErrorState: 368 buf = append(buf, s[i:j]...) 369 buf = append(buf, `\ufffd`...) 370 i = j + 1 371 j = j + 1 372 continue 373 // U+2028 is LINE SEPARATOR. 374 // U+2029 is PARAGRAPH SEPARATOR. 375 // They are both technically valid characters in JSON strings, 376 // but don't work in JSONP, which has to be evaluated as JavaScript, 377 // and can lead to security holes there. It is valid JSON to 378 // escape them, so we do so unconditionally. 379 // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion. 380 case lineSepState: 381 buf = append(buf, s[i:j]...) 382 buf = append(buf, `\u2028`...) 383 i = j + 3 384 j = j + 3 385 continue 386 case paragraphSepState: 387 buf = append(buf, s[i:j]...) 388 buf = append(buf, `\u2029`...) 389 i = j + 3 390 j = j + 3 391 continue 392 } 393 j += size 394 } 395 396 return append(append(buf, s[i:]...), '"') 397 } 398 399 func appendString(buf []byte, s string) []byte { 400 valLen := len(s) 401 if valLen == 0 { 402 return append(buf, `""`...) 
403 } 404 buf = append(buf, '"') 405 var ( 406 i, j int 407 ) 408 if valLen >= 8 { 409 chunks := stringToUint64Slice(s) 410 for _, n := range chunks { 411 // combine masks before checking for the MSB of each byte. We include 412 // `n` in the mask to check whether any of the *input* byte MSBs were 413 // set (i.e. the byte was outside the ASCII range). 414 mask := n | (n - (lsb * 0x20)) | 415 ((n ^ (lsb * '"')) - lsb) | 416 ((n ^ (lsb * '\\')) - lsb) 417 if (mask & msb) != 0 { 418 j = bits.TrailingZeros64(mask&msb) / 8 419 goto ESCAPE_END 420 } 421 } 422 valLen := len(s) 423 for i := len(chunks) * 8; i < valLen; i++ { 424 if needEscape[s[i]] { 425 j = i 426 goto ESCAPE_END 427 } 428 } 429 return append(append(buf, s...), '"') 430 } 431 ESCAPE_END: 432 for j < valLen { 433 c := s[j] 434 435 if !needEscape[c] { 436 // fast path: most of the time, printable ascii characters are used 437 j++ 438 continue 439 } 440 441 switch c { 442 case '\\', '"': 443 buf = append(buf, s[i:j]...) 444 buf = append(buf, '\\', c) 445 i = j + 1 446 j = j + 1 447 continue 448 449 case '\n': 450 buf = append(buf, s[i:j]...) 451 buf = append(buf, '\\', 'n') 452 i = j + 1 453 j = j + 1 454 continue 455 456 case '\r': 457 buf = append(buf, s[i:j]...) 458 buf = append(buf, '\\', 'r') 459 i = j + 1 460 j = j + 1 461 continue 462 463 case '\t': 464 buf = append(buf, s[i:j]...) 465 buf = append(buf, '\\', 't') 466 i = j + 1 467 j = j + 1 468 continue 469 470 case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 471 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F 472 buf = append(buf, s[i:j]...) 473 buf = append(buf, `\u00`...) 474 buf = append(buf, hex[c>>4], hex[c&0xF]) 475 i = j + 1 476 j = j + 1 477 continue 478 } 479 j++ 480 } 481 482 return append(append(buf, s[i:]...), '"') 483 }