github.com/cloudwego/dynamicgo@v0.2.6-0.20240519101509-707f41b6b834/native/parsing.c (about) 1 /* 2 * Copyright 2023 CloudWeGo Authors. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "native.h" 18 #include <stdint.h> 19 20 /** String Quoting **/ 21 #define MAX_ESCAPED_BYTES 8 22 typedef struct 23 { 24 const long n; 25 const char s[MAX_ESCAPED_BYTES]; 26 } quoted_t; 27 28 static const quoted_t _SingleQuoteTab[256] = { 29 ['\x00'] = {.n = 6, .s = "\\u0000"}, 30 ['\x01'] = {.n = 6, .s = "\\u0001"}, 31 ['\x02'] = {.n = 6, .s = "\\u0002"}, 32 ['\x03'] = {.n = 6, .s = "\\u0003"}, 33 ['\x04'] = {.n = 6, .s = "\\u0004"}, 34 ['\x05'] = {.n = 6, .s = "\\u0005"}, 35 ['\x06'] = {.n = 6, .s = "\\u0006"}, 36 ['\x07'] = {.n = 6, .s = "\\u0007"}, 37 ['\b'] = {.n = 6, .s = "\\u0008"}, 38 ['\t'] = {.n = 2, .s = "\\t"}, 39 ['\n'] = {.n = 2, .s = "\\n"}, 40 ['\x0b'] = {.n = 6, .s = "\\u000b"}, 41 ['\f'] = {.n = 6, .s = "\\u000c"}, 42 ['\r'] = {.n = 2, .s = "\\r"}, 43 ['\x0e'] = {.n = 6, .s = "\\u000e"}, 44 ['\x0f'] = {.n = 6, .s = "\\u000f"}, 45 ['\x10'] = {.n = 6, .s = "\\u0010"}, 46 ['\x11'] = {.n = 6, .s = "\\u0011"}, 47 ['\x12'] = {.n = 6, .s = "\\u0012"}, 48 ['\x13'] = {.n = 6, .s = "\\u0013"}, 49 ['\x14'] = {.n = 6, .s = "\\u0014"}, 50 ['\x15'] = {.n = 6, .s = "\\u0015"}, 51 ['\x16'] = {.n = 6, .s = "\\u0016"}, 52 ['\x17'] = {.n = 6, .s = "\\u0017"}, 53 ['\x18'] = {.n = 6, .s = "\\u0018"}, 54 ['\x19'] = {.n = 6, .s = "\\u0019"}, 55 ['\x1a'] = {.n = 6, .s = "\\u001a"}, 56 ['\x1b'] = {.n = 6, .s = "\\u001b"}, 57 ['\x1c'] = {.n = 6, .s = "\\u001c"}, 58 ['\x1d'] = {.n = 6, .s = "\\u001d"}, 59 ['\x1e'] = {.n = 6, .s = "\\u001e"}, 60 ['\x1f'] = {.n = 6, .s = "\\u001f"}, 61 ['"'] = {.n = 2, .s = "\\\""}, 62 ['\\'] = {.n = 2, .s = "\\\\"}, 63 }; 64 65 static const quoted_t _DoubleQuoteTab[256] = { 66 ['\x00'] = {.n = 7, .s = "\\\\u0000"}, 67 ['\x01'] = {.n = 7, .s = "\\\\u0001"}, 68 ['\x02'] = {.n = 7, .s = "\\\\u0002"}, 69 ['\x03'] = {.n = 7, .s = "\\\\u0003"}, 70 ['\x04'] = {.n = 7, .s = "\\\\u0004"}, 71 ['\x05'] = {.n = 7, .s = "\\\\u0005"}, 72 ['\x06'] = {.n = 7, .s = "\\\\u0006"}, 73 ['\x07'] = {.n = 7, .s = "\\\\u0007"}, 74 ['\b'] = {.n = 7, .s = "\\\\u0008"}, 75 ['\t'] = {.n = 3, .s = "\\\\t"}, 76 ['\n'] = {.n = 3, .s = "\\\\n"}, 77 ['\x0b'] = {.n = 7, .s = "\\\\u000b"}, 78 ['\f'] = {.n = 7, .s = "\\\\u000c"}, 79 ['\r'] = {.n = 3, .s = "\\\\r"}, 80 ['\x0e'] = {.n = 7, .s = "\\\\u000e"}, 81 ['\x0f'] = {.n = 7, .s = "\\\\u000f"}, 82 ['\x10'] = {.n = 7, .s = "\\\\u0010"}, 83 ['\x11'] = {.n = 7, .s = "\\\\u0011"}, 84 ['\x12'] = {.n = 7, .s = "\\\\u0012"}, 85 ['\x13'] = {.n = 7, .s = "\\\\u0013"}, 86 ['\x14'] = {.n = 7, .s = "\\\\u0014"}, 87 ['\x15'] = {.n = 7, .s = "\\\\u0015"}, 88 ['\x16'] = {.n = 7, .s = "\\\\u0016"}, 89 ['\x17'] = {.n = 7, .s = "\\\\u0017"}, 90 ['\x18'] = {.n = 7, .s = "\\\\u0018"}, 91 ['\x19'] = {.n = 7, .s = "\\\\u0019"}, 92 ['\x1a'] = {.n = 7, .s = "\\\\u001a"}, 93 ['\x1b'] = {.n = 7, .s = "\\\\u001b"}, 94 ['\x1c'] = {.n = 7, .s = "\\\\u001c"}, 95 ['\x1d'] = {.n = 7, .s = "\\\\u001d"}, 96 ['\x1e'] = {.n = 7, .s = "\\\\u001e"}, 97 ['\x1f'] = {.n = 7, .s = "\\\\u001f"}, 98 ['"'] = {.n = 4, .s = "\\\\\\\""}, 99 ['\\'] = {.n = 4, .s = "\\\\\\\\"}, 100 }; 101 102 static const quoted_t _HtmlQuoteTab[256] = { 103 ['<'] = {.n = 6, .s = "\\u003c"}, 104 ['>'] = {.n = 6, .s = "\\u003e"}, 105 ['&'] = {.n = 6, .s = "\\u0026"}, 106 // \u2028 and \u2029 is [E2 80 A8] and [E2 80 A9] 107 [0xe2] = {.n = 0, .s = {0}}, 108 [0xa8] = {.n = 6, .s = "\\u2028"}, 109 [0xa9] = {.n = 6, .s = "\\u2029"}, 110 }; 111 112 static inline void memcpy_p8(char *dp, const char *sp, ssize_t nb) 113 { 114 if (nb >= 4) 115 { 116 *(uint32_t *)dp = *(const uint32_t *)sp; 117 sp += 4, dp += 4, nb -= 4; 118 } 119 if (nb >= 2) 120 { 121 *(uint16_t *)dp = *(const uint16_t *)sp; 122 sp += 2, dp += 2, nb -= 2; 123 } 124 if (nb >= 1) 125 { 126 *dp = *sp; 127 } 128 } 129 130 static inline void memcpy_p16(char *dp, const char *sp, size_t nb) 131 { 132 if (nb >= 8) 133 { 134 *(uint64_t *)dp = *(const uint64_t *)sp; 135 sp += 8, dp += 8, nb -= 8; 136 } 137 if (nb >= 4) 138 { 139 *(uint32_t *)dp = *(const uint32_t *)sp; 140 sp += 4, dp += 4, nb -= 4; 141 } 142 if (nb >= 2) 143 { 144 *(uint16_t *)dp = *(const uint16_t *)sp; 145 sp += 2, dp += 2, nb -= 2; 146 } 147 if (nb >= 1) 148 { 149 *dp = *sp; 150 } 151 } 152 153 static inline void memcpy_p32(char *dp, const char *sp, size_t nb) 154 { 155 if (nb >= 16) 156 { 157 _mm_storeu_si128((void *)dp, _mm_loadu_si128((const void *)sp)); 158 sp += 16, dp += 16, nb -= 16; 159 } 160 if (nb >= 8) 161 { 162 *(uint64_t *)dp = *(const uint64_t *)sp; 163 sp += 8, dp += 8, nb -= 8; 164 } 165 if (nb >= 4) 166 { 167 *(uint32_t *)dp = *(const uint32_t *)sp; 168 sp += 4, dp += 4, nb -= 4; 169 } 170 if (nb >= 2) 171 { 172 *(uint16_t *)dp = *(const uint16_t *)sp; 173 sp += 2, dp += 2, nb -= 2; 174 } 175 if (nb >= 1) 176 { 177 *dp = *sp; 178 } 179 } 180 181 static inline __m128i _mm_find_quote(__m128i vv) 182 { 183 __m128i e1 = _mm_cmpgt_epi8(vv, _mm_set1_epi8(-1)); 184 __m128i e2 = _mm_cmpgt_epi8(vv, _mm_set1_epi8(31)); 185 __m128i e3 = _mm_cmpeq_epi8(vv, _mm_set1_epi8('"')); 186 __m128i e4 = _mm_cmpeq_epi8(vv, _mm_set1_epi8('\\')); 187 __m128i r1 = _mm_andnot_si128(e2, e1); 188 __m128i r2 = _mm_or_si128(e3, e4); 189 __m128i rv = _mm_or_si128(r1, r2); 190 return rv; 191 } 192 193 #if USE_AVX2 194 static inline __m256i _mm256_find_quote(__m256i vv) 195 { 196 __m256i e1 = _mm256_cmpgt_epi8(vv, _mm256_set1_epi8(-1)); 197 __m256i e2 = _mm256_cmpgt_epi8(vv, _mm256_set1_epi8(31)); 198 __m256i e3 = _mm256_cmpeq_epi8(vv, _mm256_set1_epi8('"')); 199 __m256i e4 = _mm256_cmpeq_epi8(vv, _mm256_set1_epi8('\\')); 200 __m256i r1 = _mm256_andnot_si256(e2, e1); 201 __m256i r2 = _mm256_or_si256(e3, e4); 202 __m256i rv = _mm256_or_si256(r1, r2); 203 return rv; 204 } 205 #endif 206 207 static inline ssize_t memcchr_quote(const char *sp, ssize_t nb, char *dp, ssize_t dn) 208 { 209 uint32_t mm; 210 const char *ss = sp; 211 212 #if USE_AVX2 213 /* 32-byte loop, full store */ 214 while (nb >= 32 && dn >= 32) 215 { 216 __m256i vv = _mm256_loadu_si256((const void *)sp); 217 __m256i rv = _mm256_find_quote(vv); 218 _mm256_storeu_si256((void *)dp, vv); 219 220 /* check for matches */ 221 if ((mm = _mm256_movemask_epi8(rv)) != 0) 222 { 223 return sp - ss + __builtin_ctz(mm); 224 } 225 226 /* move to next block */ 227 sp += 32; 228 dp += 32; 229 nb -= 32; 230 dn -= 32; 231 } 232 233 /* 32-byte test, partial store */ 234 if (nb >= 32) 235 { 236 __m256i vv = _mm256_loadu_si256((const void *)sp); 237 __m256i rv = _mm256_find_quote(vv); 238 uint32_t mv = _mm256_movemask_epi8(rv); 239 uint32_t fv = __builtin_ctzll((uint64_t)mv | 0x0100000000); 240 241 /* copy at most `dn` characters */ 242 if (fv <= dn) 243 { 244 memcpy_p32(dp, sp, fv); 245 return sp - ss + fv; 246 } 247 else 248 { 249 memcpy_p32(dp, sp, dn); 250 return -(sp - ss + dn) - 1; 251 } 252 } 253 254 /* clear upper half to avoid AVX-SSE transition penalty */ 255 _mm256_zeroupper(); 256 #endif 257 258 /* 16-byte loop, full store */ 259 while (nb >= 16 && dn >= 16) 260 { 261 __m128i vv = _mm_loadu_si128((const void *)sp); 262 __m128i rv = _mm_find_quote(vv); 263 _mm_storeu_si128((void *)dp, vv); 264 265 /* check for matches */ 266 if ((mm = _mm_movemask_epi8(rv)) != 0) 267 { 268 return sp - ss + __builtin_ctz(mm); 269 } 270 271 /* move to next block */ 272 sp += 16; 273 dp += 16; 274 nb -= 16; 275 dn -= 16; 276 } 277 278 /* 16-byte test, partial store */ 279 if (nb >= 16) 280 { 281 __m128i vv = _mm_loadu_si128((const void *)sp); 282 __m128i rv = _mm_find_quote(vv); 283 uint32_t mv = _mm_movemask_epi8(rv); 284 uint32_t fv = __builtin_ctz(mv | 0x010000); 285 286 /* copy at most `dn` characters */ 287 if (fv <= dn) 288 { 289 memcpy_p16(dp, sp, fv); 290 return sp - ss + fv; 291 } 292 else 293 { 294 memcpy_p16(dp, sp, dn); 295 return -(sp - ss + dn) - 1; 296 } 297 } 298 299 /* handle the remaining bytes with scalar code */ 300 while (nb > 0 && dn > 0) 301 { 302 if (_SingleQuoteTab[*(uint8_t *)sp].n) 303 { 304 return sp - ss; 305 } 306 else 307 { 308 dn--, nb--; 309 *dp++ = *sp++; 310 } 311 } 312 313 /* check for dest buffer */ 314 if (nb == 0) 315 { 316 return sp - ss; 317 } 318 else 319 { 320 return -(sp - ss) - 1; 321 } 322 } 323 324 static const bool _EscTab[256] = { 325 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00-0x0F 326 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10-0x1F 327 // '"' 328 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20-0x2F 329 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30-0x3F 330 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40-0x4F 331 // '"" 332 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, // 0x50-0x5F 333 // 0x60-0xFF are zeroes 334 }; 335 336 static inline uint8_t escape_mask4(const char *sp) 337 { 338 return _EscTab[*(uint8_t *)(sp)] | (_EscTab[*(uint8_t *)(sp + 1)] << 1) | (_EscTab[*(uint8_t *)(sp + 2)] << 2) | (_EscTab[*(uint8_t *)(sp + 3)] << 3); 339 } 340 341 static inline ssize_t memcchr_quote_unsafe(const char *sp, ssize_t nb, char *dp, const quoted_t *tab) 342 { 343 uint32_t mm; 344 const char *ds = dp; 345 size_t cn = 0; 346 347 simd_copy: 348 349 if (nb < 16) 350 goto scalar_copy; 351 352 #if USE_AVX2 353 /* 32-byte loop, full store */ 354 while (nb >= 32) 355 { 356 __m256i vv = _mm256_loadu_si256((const void *)sp); 357 __m256i rv = _mm256_find_quote(vv); 358 _mm256_storeu_si256((void *)dp, vv); 359 360 /* check for matches */ 361 if ((mm = _mm256_movemask_epi8(rv)) != 0) 362 { 363 cn = __builtin_ctz(mm); 364 sp += cn; 365 nb -= cn; 366 dp += cn; 367 goto escape; 368 } 369 370 /* move to next block */ 371 sp += 32; 372 dp += 32; 373 nb -= 32; 374 } 375 376 /* clear upper half to avoid AVX-SSE transition penalty */ 377 _mm256_zeroupper(); 378 #endif 379 380 /* 16-byte loop, full store */ 381 while (nb >= 16) 382 { 383 __m128i vv = _mm_loadu_si128((const void *)sp); 384 __m128i rv = _mm_find_quote(vv); 385 _mm_storeu_si128((void *)dp, vv); 386 387 /* check for matches */ 388 if ((mm = _mm_movemask_epi8(rv)) != 0) 389 { 390 cn = __builtin_ctz(mm); 391 sp += cn; 392 nb -= cn; 393 dp += cn; 394 goto escape; 395 } 396 397 /* move to next block */ 398 sp += 16; 399 dp += 16; 400 nb -= 16; 401 } 402 403 /* handle the remaining bytes with scalar code */ 404 // while (nb > 0) { 405 // if (_EscTab[*(uint8_t *)sp]) { 406 // goto escape; 407 // } else { 408 // nb--; 409 // *dp++ = *sp++; 410 // } 411 // } 412 // optimize: loop unrolling here 413 414 scalar_copy: 415 if (nb >= 8) 416 { 417 uint8_t mask1 = escape_mask4(sp); 418 *(uint64_t *)dp = *(const uint64_t *)sp; 419 if (unlikely(mask1)) 420 { 421 cn = __builtin_ctz(mask1); 422 sp += cn; 423 nb -= cn; 424 dp += cn; 425 goto escape; 426 } 427 uint8_t mask2 = escape_mask4(sp + 4); 428 if (unlikely(mask2)) 429 { 430 cn = __builtin_ctz(mask2); 431 sp += cn + 4; 432 nb -= cn + 4; 433 dp += cn + 4; 434 goto escape; 435 } 436 dp += 8, sp += 8, nb -= 8; 437 } 438 439 if (nb >= 4) 440 { 441 uint8_t mask2 = escape_mask4(sp); 442 *(uint32_t *)dp = *(const uint32_t *)sp; 443 if (unlikely(mask2)) 444 { 445 cn = __builtin_ctz(mask2); 446 sp += cn; 447 nb -= cn; 448 dp += cn; 449 goto escape; 450 } 451 dp += 4, sp += 4, nb -= 4; 452 } 453 454 while (nb > 0) 455 { 456 if (unlikely(_EscTab[*(uint8_t *)(sp)])) 457 goto escape; 458 *dp++ = *sp++, nb--; 459 } 460 /* all quote done */ 461 return dp - ds; 462 escape: 463 /* get the escape entry, handle consecutive quotes */ 464 do 465 { 466 uint8_t ch = *(uint8_t *)sp; 467 int nc = tab[ch].n; 468 /* copy the quoted value. 469 * Note: dp always has at least 8 bytes (MAX_ESCAPED_BYTES) here. 470 * so, we not use memcpy_p8(dp, tab[ch].s, nc); 471 */ 472 *(uint64_t *)dp = *(const uint64_t *)tab[ch].s; 473 sp++; 474 nb--; 475 dp += nc; 476 if (nb <= 0) 477 break; 478 /* copy and find escape chars */ 479 if (_EscTab[*(uint8_t *)(sp)] == 0) 480 { 481 goto simd_copy; 482 } 483 } while (true); 484 return dp - ds; 485 } 486 487 ssize_t quote(const char *sp, ssize_t nb, char *dp, ssize_t *dn, uint64_t flags) 488 { 489 ssize_t nd = *dn; 490 const char *ds = dp; 491 const char *ss = sp; 492 const quoted_t *tab; 493 494 /* select quoting table */ 495 if (!(flags & F_DBLUNQ)) 496 { 497 tab = _SingleQuoteTab; 498 } 499 else 500 { 501 tab = _DoubleQuoteTab; 502 } 503 504 if (*dn >= nb * MAX_ESCAPED_BYTES) 505 { 506 *dn = memcchr_quote_unsafe(sp, nb, dp, tab); 507 return nb; 508 } 509 510 /* find the special characters, copy on the fly */ 511 while (nb != 0) 512 { 513 int nc; 514 uint8_t ch; 515 ssize_t rb = memcchr_quote(sp, nb, dp, nd); 516 517 /* not enough buffer space */ 518 if (rb < 0) 519 { 520 *dn = dp - ds - rb - 1; 521 return -(sp - ss - rb - 1) - 1; 522 } 523 524 /* skip already copied bytes */ 525 sp += rb; 526 dp += rb; 527 nb -= rb; 528 nd -= rb; 529 530 /* get the escape entry, handle consecutive quotes */ 531 while (nb != 0) 532 { 533 ch = *(uint8_t *)sp; 534 nc = tab[ch].n; 535 536 /* check for escape character */ 537 if (nc == 0) 538 { 539 break; 540 } 541 542 /* check for buffer space */ 543 if (nc > nd) 544 { 545 *dn = dp - ds; 546 return -(sp - ss) - 1; 547 } 548 549 /* copy the quoted value */ 550 memcpy_p8(dp, tab[ch].s, nc); 551 sp++; 552 nb--; 553 dp += nc; 554 nd -= nc; 555 } 556 } 557 558 /* all done */ 559 *dn = dp - ds; 560 return sp - ss; 561 } 562 563 /** String Unquoting **/ 564 565 static const char _UnquoteTab[256] = { 566 ['/'] = '/', 567 ['"'] = '"', 568 ['b'] = '\b', 569 ['f'] = '\f', 570 ['n'] = '\n', 571 ['r'] = '\r', 572 ['t'] = '\t', 573 ['u'] = -1, 574 ['\\'] = '\\', 575 }; 576 577 static inline ssize_t memcchr_p32(const char *s, ssize_t nb, char *p) 578 { 579 int64_t r; 580 ssize_t n = nb; 581 const char *q = s; 582 583 #if USE_AVX2 584 __m256i u; 585 __m256i v; 586 __m256i b = _mm256_set1_epi8('\\'); 587 588 /* process every 32 bytes */ 589 while (n >= 32) 590 { 591 u = _mm256_loadu_si256((const void *)s); 592 v = _mm256_cmpeq_epi8(u, b); 593 _mm256_storeu_si256((void *)p, u); 594 595 /* check for matches */ 596 if ((r = _mm256_movemask_epi8(v)) != 0) 597 { 598 return s - q + __builtin_ctzll(r); 599 } 600 601 /* move to the next 32 bytes */ 602 s += 32; 603 p += 32; 604 n -= 32; 605 } 606 607 /* clear upper half to avoid AVX-SSE transition penalty */ 608 _mm256_zeroupper(); 609 #endif 610 611 /* initialze with '\\' */ 612 __m128i x; 613 __m128i y; 614 __m128i a = _mm_set1_epi8('\\'); 615 616 /* process every 16 bytes */ 617 while (n >= 16) 618 { 619 x = _mm_loadu_si128((const void *)s); 620 y = _mm_cmpeq_epi8(x, a); 621 _mm_storeu_si128((void *)p, x); 622 623 /* check for matches */ 624 if ((r = _mm_movemask_epi8(y)) != 0) 625 { 626 return s - q + __builtin_ctzll(r); 627 } 628 629 /* move to the next 16 bytes */ 630 s += 16; 631 p += 16; 632 n -= 16; 633 } 634 635 /* remaining bytes, do with scalar code */ 636 while (n--) 637 { 638 if (*s != '\\') 639 { 640 *p++ = *s++; 641 } 642 else 643 { 644 return s - q; 645 } 646 } 647 648 /* nothing found, but everything was copied */ 649 return -1; 650 } 651 652 #define ALL_01h (~0ul / 255) 653 #define ALL_7fh (ALL_01h * 127) 654 #define ALL_80h (ALL_01h * 128) 655 656 static inline uint32_t hasless(uint32_t x, uint8_t n) 657 { 658 return (x - ALL_01h * n) & ~x & ALL_80h; 659 } 660 661 static inline uint32_t hasmore(uint32_t x, uint8_t n) 662 { 663 return (x + ALL_01h * (127 - n) | x) & ALL_80h; 664 } 665 666 static inline uint32_t hasbetween(uint32_t x, uint8_t m, uint8_t n) 667 { 668 return (ALL_01h * (127 + n) - (x & ALL_7fh) & ~x & (x & ALL_7fh) + ALL_01h * (127 - m)) & ALL_80h; 669 } 670 671 #undef ALL_01h 672 #undef ALL_7fh 673 #undef ALL_80h 674 675 static inline char ishex(char c) 676 { 677 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); 678 } 679 680 static inline void unirep(char **dp) 681 { 682 *(*dp)++ = 0xef; 683 *(*dp)++ = 0xbf; 684 *(*dp)++ = 0xbd; 685 } 686 687 static inline char unhex16_is(const char *s) 688 { 689 uint32_t v = *(uint32_t *)s; 690 return !(hasless(v, '0') || hasmore(v, 'f') || hasbetween(v, '9', 'A') || hasbetween(v, 'F', 'a')); 691 } 692 693 static inline uint32_t unhex16_fast(const char *s) 694 { 695 uint32_t a = __builtin_bswap32(*(uint32_t *)s); 696 uint32_t b = 9 * ((~a & 0x10101010) >> 4) + (a & 0x0f0f0f0f); 697 uint32_t c = (b >> 4) | b; 698 uint32_t d = ((c >> 8) & 0xff00) | (c & 0x00ff); 699 return d; 700 } 701 702 ssize_t unquote(const char *sp, ssize_t nb, char *dp, int64_t *ep, uint64_t flags) 703 { 704 ssize_t n; 705 ssize_t x = nb; 706 const char *s = sp; 707 const char *p = dp; 708 709 /* scan & copy all the non-escape characters */ 710 while (nb && (n = (*sp == '\\' ? 0 : memcchr_p32(sp, nb, dp))) != -1) 711 { 712 char cc; 713 uint32_t r0; 714 uint32_t r1; 715 716 /* skip the plain text */ 717 dp += n; 718 sp += n + 2; 719 nb -= n + 2; 720 721 /* check for EOF */ 722 if (nb < 0) 723 { 724 *ep = x; 725 return -ERR_EOF; 726 } 727 728 /* check for double unquote */ 729 if (unlikely(flags & F_DBLUNQ)) 730 { 731 int nr = nb; 732 char c1 = sp[-1]; 733 734 /* must have at least 1 character left */ 735 if (nr == 0) 736 { 737 *ep = x; 738 return -ERR_EOF; 739 } 740 741 /* every quote must be a double quote */ 742 if (c1 != '\\') 743 { 744 *ep = sp - s - 1; 745 return -ERR_INVAL; 746 } 747 748 /* special case of '\\\\' and '\\\"' */ 749 if (*sp == '\\') 750 { 751 if (nr < 2) 752 { 753 *ep = x; 754 return -ERR_EOF; 755 } 756 else if (sp[1] != '"' && sp[1] != '\\') 757 { 758 *ep = sp - s + 1; 759 return -ERR_INVAL; 760 } 761 else 762 { 763 sp++; 764 nb--; 765 } 766 } 767 768 /* skip the second escape */ 769 sp++; 770 nb--; 771 } 772 773 /* check for escape sequence */ 774 if ((cc = _UnquoteTab[(uint8_t)sp[-1]]) == 0) 775 { 776 *ep = sp - s - 1; 777 return -ERR_ESCAPE; 778 } 779 780 /* check for simple escape sequence */ 781 if (cc != -1) 782 { 783 *dp++ = cc; 784 continue; 785 } 786 787 /* must have at least 4 characters */ 788 if (nb < 4) 789 { 790 *ep = x; 791 return -ERR_EOF; 792 } 793 794 /* check for hexadecimal characters */ 795 if (!unhex16_is(sp)) 796 { 797 *ep = sp - s; 798 for (int i = 0; i < 4 && ishex(*sp); i++, sp++) 799 ++*ep; 800 return -ERR_INVAL; 801 } 802 803 /* decode the code-point */ 804 r0 = unhex16_fast(sp); 805 sp += 4; 806 nb -= 4; 807 808 /* from line 598 */ 809 retry_decode: 810 811 /* ASCII characters, unlikely */ 812 if (unlikely(r0 <= 0x7f)) 813 { 814 *dp++ = (char)r0; 815 continue; 816 } 817 818 /* latin-1 characters, unlikely */ 819 if (unlikely(r0 <= 0x07ff)) 820 { 821 *dp++ = (char)(0xc0 | (r0 >> 6)); 822 *dp++ = (char)(0x80 | (r0 & 0x3f)); 823 continue; 824 } 825 826 /* 3-byte characters, likely */ 827 if (likely(r0 < 0xd800 || r0 > 0xdfff)) 828 { 829 *dp++ = (char)(0xe0 | ((r0 >> 12))); 830 *dp++ = (char)(0x80 | ((r0 >> 6) & 0x3f)); 831 *dp++ = (char)(0x80 | ((r0)&0x3f)); 832 continue; 833 } 834 835 /* check for double unquote */ 836 if (unlikely(flags & F_DBLUNQ)) 837 { 838 if (nb < 1) 839 { 840 if (likely(flags & F_UNIREP)) 841 { 842 unirep(&dp); 843 continue; 844 } 845 else 846 { 847 *ep = x; 848 return -ERR_EOF; 849 } 850 } 851 else 852 { 853 if (sp[0] == '\\') 854 { 855 nb--; 856 sp++; 857 } 858 else if (likely(flags & F_UNIREP)) 859 { 860 unirep(&dp); 861 continue; 862 } 863 else 864 { 865 *ep = sp - s - 4; 866 return -ERR_UNICODE; 867 } 868 } 869 } 870 871 /* surrogate half, must follows by the other half */ 872 if (nb < 6 || r0 > 0xdbff || sp[0] != '\\' || sp[1] != 'u') 873 { 874 if (likely(flags & F_UNIREP)) 875 { 876 unirep(&dp); 877 continue; 878 } 879 else 880 { 881 *ep = sp - s - ((flags & F_DBLUNQ) ? 5 : 4); 882 return -ERR_UNICODE; 883 } 884 } 885 886 /* check the hexadecimal escape */ 887 if (!unhex16_is(sp + 2)) 888 { 889 *ep = sp - s + 2; 890 for (int i = 2; i < 6 && ishex(sp[i]); i++) 891 ++*ep; 892 return -ERR_INVAL; 893 } 894 895 /* decode the second code-point */ 896 r1 = unhex16_fast(sp + 2); 897 sp += 6; 898 nb -= 6; 899 900 /* it must be the other half */ 901 if (r1 < 0xdc00 || r1 > 0xdfff) 902 { 903 if (unlikely(!(flags & F_UNIREP))) 904 { 905 *ep = sp - s - 4; 906 return -ERR_UNICODE; 907 } 908 else 909 { 910 r0 = r1; 911 unirep(&dp); 912 goto retry_decode; 913 } 914 } 915 916 /* merge two surrogates */ 917 r0 = (r0 - 0xd800) << 10; 918 r1 = (r1 - 0xdc00) + 0x010000; 919 r0 += r1; 920 921 /* check the code point range */ 922 if (r0 > 0x10ffff) 923 { 924 if (likely(!(flags & F_UNIREP))) 925 { 926 *ep = sp - s - 4; 927 return -ERR_UNICODE; 928 } 929 else 930 { 931 unirep(&dp); 932 continue; 933 } 934 } 935 936 /* encode the character */ 937 *dp++ = (char)(0xf0 | ((r0 >> 18))); 938 *dp++ = (char)(0x80 | ((r0 >> 12) & 0x3f)); 939 *dp++ = (char)(0x80 | ((r0 >> 6) & 0x3f)); 940 *dp++ = (char)(0x80 | ((r0)&0x3f)); 941 } 942 943 /* calculate the result length */ 944 return dp + nb - p; 945 } 946 947 static inline __m128i _mm_find_html(__m128i vv) 948 { 949 __m128i e1 = _mm_cmpeq_epi8(vv, _mm_set1_epi8('<')); 950 __m128i e2 = _mm_cmpeq_epi8(vv, _mm_set1_epi8('>')); 951 __m128i e3 = _mm_cmpeq_epi8(vv, _mm_set1_epi8('&')); 952 __m128i e4 = _mm_cmpeq_epi8(vv, _mm_set1_epi8('\xe2')); 953 __m128i r1 = _mm_or_si128(e1, e2); 954 __m128i r2 = _mm_or_si128(e3, e4); 955 __m128i rv = _mm_or_si128(r1, r2); 956 return rv; 957 } 958 959 #if USE_AVX2 960 static inline __m256i _mm256_find_html(__m256i vv) 961 { 962 __m256i e1 = _mm256_cmpeq_epi8(vv, _mm256_set1_epi8('<')); 963 __m256i e2 = _mm256_cmpeq_epi8(vv, _mm256_set1_epi8('>')); 964 __m256i e3 = _mm256_cmpeq_epi8(vv, _mm256_set1_epi8('&')); 965 __m256i e4 = _mm256_cmpeq_epi8(vv, _mm256_set1_epi8('\xe2')); 966 __m256i r1 = _mm256_or_si256(e1, e2); 967 __m256i r2 = _mm256_or_si256(e3, e4); 968 __m256i rv = _mm256_or_si256(r1, r2); 969 return rv; 970 } 971 #endif 972 973 static inline ssize_t memcchr_html_quote(const char *sp, ssize_t nb, char *dp, ssize_t dn) 974 { 975 uint32_t mm; 976 const char *ss = sp; 977 978 #if USE_AVX2 979 /* 32-byte loop, full store */ 980 while (nb >= 32 && dn >= 32) 981 { 982 __m256i vv = _mm256_loadu_si256((const void *)sp); 983 __m256i rv = _mm256_find_html(vv); 984 _mm256_storeu_si256((void *)dp, vv); 985 986 /* check for matches */ 987 if ((mm = _mm256_movemask_epi8(rv)) != 0) 988 { 989 return sp - ss + __builtin_ctz(mm); 990 } 991 992 /* move to next block */ 993 sp += 32; 994 dp += 32; 995 nb -= 32; 996 dn -= 32; 997 } 998 999 /* 32-byte test, partial store */ 1000 if (nb >= 32) 1001 { 1002 __m256i vv = _mm256_loadu_si256((const void *)sp); 1003 __m256i rv = _mm256_find_html(vv); 1004 uint32_t mv = _mm256_movemask_epi8(rv); 1005 uint32_t fv = __builtin_ctzll((uint64_t)mv | 0x0100000000); 1006 1007 /* copy at most `dn` characters */ 1008 if (fv <= dn) 1009 { 1010 memcpy_p32(dp, sp, fv); 1011 return sp - ss + fv; 1012 } 1013 else 1014 { 1015 memcpy_p32(dp, sp, dn); 1016 return -(sp - ss + dn) - 1; 1017 } 1018 } 1019 1020 /* clear upper half to avoid AVX-SSE transition penalty */ 1021 _mm256_zeroupper(); 1022 #endif 1023 1024 /* 16-byte loop, full store */ 1025 while (nb >= 16 && dn >= 16) 1026 { 1027 __m128i vv = _mm_loadu_si128((const void *)sp); 1028 __m128i rv = _mm_find_html(vv); 1029 _mm_storeu_si128((void *)dp, vv); 1030 1031 /* check for matches */ 1032 if ((mm = _mm_movemask_epi8(rv)) != 0) 1033 { 1034 return sp - ss + __builtin_ctz(mm); 1035 } 1036 1037 /* move to next block */ 1038 sp += 16; 1039 dp += 16; 1040 nb -= 16; 1041 dn -= 16; 1042 } 1043 1044 /* 16-byte test, partial store */ 1045 if (nb >= 16) 1046 { 1047 __m128i vv = _mm_loadu_si128((const void *)sp); 1048 __m128i rv = _mm_find_html(vv); 1049 uint32_t mv = _mm_movemask_epi8(rv); 1050 uint32_t fv = __builtin_ctz(mv | 0x010000); 1051 1052 /* copy at most `dn` characters */ 1053 if (fv <= dn) 1054 { 1055 memcpy_p16(dp, sp, fv); 1056 return sp - ss + fv; 1057 } 1058 else 1059 { 1060 memcpy_p16(dp, sp, dn); 1061 return -(sp - ss + dn) - 1; 1062 } 1063 } 1064 1065 /* handle the remaining bytes with scalar code */ 1066 while (nb > 0 && dn > 0) 1067 { 1068 if (*sp == '<' || *sp == '>' || *sp == '&' || *sp == '\xe2') 1069 { 1070 return sp - ss; 1071 } 1072 else 1073 { 1074 dn--, nb--; 1075 *dp++ = *sp++; 1076 } 1077 } 1078 1079 /* check for dest buffer */ 1080 if (nb == 0) 1081 { 1082 return sp - ss; 1083 } 1084 else 1085 { 1086 return -(sp - ss) - 1; 1087 } 1088 } 1089 1090 ssize_t html_escape(const char *sp, ssize_t nb, char *dp, ssize_t *dn) 1091 { 1092 ssize_t nd = *dn; 1093 const char *ds = dp; 1094 const char *ss = sp; 1095 const quoted_t *tab = _HtmlQuoteTab; 1096 1097 /* find the special characters, copy on the fly */ 1098 while (nb > 0) 1099 { 1100 int nc = 0; 1101 uint8_t ch = 0; 1102 ssize_t rb = 0; 1103 const char *cur = 0; 1104 1105 /* not enough buffer space */ 1106 if (nd <= 0) 1107 { 1108 return -(sp - ss) - 1; 1109 } 1110 1111 /* find and copy */ 1112 if ((rb = memcchr_html_quote(sp, nb, dp, nd)) < 0) 1113 { 1114 *dn = dp - ds - rb - 1; 1115 return -(sp - ss - rb - 1) - 1; 1116 } 1117 1118 /* skip already copied bytes */ 1119 sp += rb; 1120 dp += rb; 1121 nb -= rb; 1122 nd -= rb; 1123 1124 /* stop if already finished */ 1125 if (nb <= 0) 1126 { 1127 break; 1128 } 1129 1130 /* mark cur postion */ 1131 cur = sp; 1132 1133 /* check for \u2028 and \u2029, binary is \xe2\x80\xa8 and \xe2\x80\xa9 */ 1134 if (unlikely(*sp == '\xe2')) 1135 { 1136 if (nb >= 3 && *(sp + 1) == '\x80' && (*(sp + 2) == '\xa8' || *(sp + 2) == '\xa9')) 1137 { 1138 sp += 2, nb -= 2; 1139 } 1140 else if (nd > 0) 1141 { 1142 *dp++ = *sp++; 1143 nb--, nd--; 1144 continue; 1145 } 1146 else 1147 { 1148 return -(sp - ss) - 1; 1149 } 1150 } 1151 1152 /* get the escape entry, handle consecutive quotes */ 1153 ch = *(uint8_t *)sp; 1154 nc = tab[ch].n; 1155 1156 /* check for buffer space */ 1157 if (nd < nc) 1158 { 1159 *dn = dp - ds; 1160 return -(cur - ss) - 1; 1161 } 1162 1163 /* copy the quoted value */ 1164 memcpy_p8(dp, tab[ch].s, nc); 1165 sp++; 1166 nb--; 1167 dp += nc; 1168 nd -= nc; 1169 } 1170 1171 /* all done */ 1172 *dn = dp - ds; 1173 return sp - ss; 1174 } 1175 1176 #undef MAX_ESCAPED_BYTES