github.com/goshafaq/sonic@v0.0.0-20231026082336-871835fb94c6/native/parsing.c (about) 1 /* 2 * Copyright 2021 ByteDance Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "native.h" 18 #include "utils.h" 19 #include <stdint.h> 20 21 /** String Quoting **/ 22 #define MAX_ESCAPED_BYTES 8 23 typedef struct { 24 const long n; 25 const char s[MAX_ESCAPED_BYTES]; 26 } quoted_t; 27 28 static const quoted_t _SingleQuoteTab[256] = { 29 ['\x00'] = { .n = 6, .s = "\\u0000" }, 30 ['\x01'] = { .n = 6, .s = "\\u0001" }, 31 ['\x02'] = { .n = 6, .s = "\\u0002" }, 32 ['\x03'] = { .n = 6, .s = "\\u0003" }, 33 ['\x04'] = { .n = 6, .s = "\\u0004" }, 34 ['\x05'] = { .n = 6, .s = "\\u0005" }, 35 ['\x06'] = { .n = 6, .s = "\\u0006" }, 36 ['\x07'] = { .n = 6, .s = "\\u0007" }, 37 ['\b' ] = { .n = 6, .s = "\\u0008" }, 38 ['\t' ] = { .n = 2, .s = "\\t" }, 39 ['\n' ] = { .n = 2, .s = "\\n" }, 40 ['\x0b'] = { .n = 6, .s = "\\u000b" }, 41 ['\f' ] = { .n = 6, .s = "\\u000c" }, 42 ['\r' ] = { .n = 2, .s = "\\r" }, 43 ['\x0e'] = { .n = 6, .s = "\\u000e" }, 44 ['\x0f'] = { .n = 6, .s = "\\u000f" }, 45 ['\x10'] = { .n = 6, .s = "\\u0010" }, 46 ['\x11'] = { .n = 6, .s = "\\u0011" }, 47 ['\x12'] = { .n = 6, .s = "\\u0012" }, 48 ['\x13'] = { .n = 6, .s = "\\u0013" }, 49 ['\x14'] = { .n = 6, .s = "\\u0014" }, 50 ['\x15'] = { .n = 6, .s = "\\u0015" }, 51 ['\x16'] = { .n = 6, .s = "\\u0016" }, 52 ['\x17'] = { .n = 6, .s = "\\u0017" }, 53 ['\x18'] = { .n = 6, .s = "\\u0018" }, 54 ['\x19'] = { .n = 6, .s = "\\u0019" }, 55 ['\x1a'] = { .n = 6, .s = "\\u001a" }, 56 ['\x1b'] = { .n = 6, .s = "\\u001b" }, 57 ['\x1c'] = { .n = 6, .s = "\\u001c" }, 58 ['\x1d'] = { .n = 6, .s = "\\u001d" }, 59 ['\x1e'] = { .n = 6, .s = "\\u001e" }, 60 ['\x1f'] = { .n = 6, .s = "\\u001f" }, 61 ['"' ] = { .n = 2, .s = "\\\"" }, 62 ['\\' ] = { .n = 2, .s = "\\\\" }, 63 }; 64 65 static const quoted_t _DoubleQuoteTab[256] = { 66 ['\x00'] = { .n = 7, .s = "\\\\u0000" }, 67 ['\x01'] = { .n = 7, .s = "\\\\u0001" }, 68 ['\x02'] = { .n = 7, .s = "\\\\u0002" }, 69 ['\x03'] = { .n = 7, .s = "\\\\u0003" }, 70 ['\x04'] = { .n = 7, .s = "\\\\u0004" }, 71 ['\x05'] = { .n = 7, .s = "\\\\u0005" }, 72 ['\x06'] = { .n = 7, .s = "\\\\u0006" }, 73 ['\x07'] = { .n = 7, .s = "\\\\u0007" }, 74 ['\b' ] = { .n = 7, .s = "\\\\u0008" }, 75 ['\t' ] = { .n = 3, .s = "\\\\t" }, 76 ['\n' ] = { .n = 3, .s = "\\\\n" }, 77 ['\x0b'] = { .n = 7, .s = "\\\\u000b" }, 78 ['\f' ] = { .n = 7, .s = "\\\\u000c" }, 79 ['\r' ] = { .n = 3, .s = "\\\\r" }, 80 ['\x0e'] = { .n = 7, .s = "\\\\u000e" }, 81 ['\x0f'] = { .n = 7, .s = "\\\\u000f" }, 82 ['\x10'] = { .n = 7, .s = "\\\\u0010" }, 83 ['\x11'] = { .n = 7, .s = "\\\\u0011" }, 84 ['\x12'] = { .n = 7, .s = "\\\\u0012" }, 85 ['\x13'] = { .n = 7, .s = "\\\\u0013" }, 86 ['\x14'] = { .n = 7, .s = "\\\\u0014" }, 87 ['\x15'] = { .n = 7, .s = "\\\\u0015" }, 88 ['\x16'] = { .n = 7, .s = "\\\\u0016" }, 89 ['\x17'] = { .n = 7, .s = "\\\\u0017" }, 90 ['\x18'] = { .n = 7, .s = "\\\\u0018" }, 91 ['\x19'] = { .n = 7, .s = "\\\\u0019" }, 92 ['\x1a'] = { .n = 7, .s = "\\\\u001a" }, 93 ['\x1b'] = { .n = 7, .s = "\\\\u001b" }, 94 ['\x1c'] = { .n = 7, .s = "\\\\u001c" }, 95 ['\x1d'] = { .n = 7, .s = "\\\\u001d" }, 96 ['\x1e'] = { .n = 7, .s = "\\\\u001e" }, 97 ['\x1f'] = { .n = 7, .s = "\\\\u001f" }, 98 ['"' ] = { .n = 4, .s = "\\\\\\\"" }, 99 ['\\' ] = { .n = 4, .s = "\\\\\\\\" }, 100 }; 101 102 static const quoted_t _HtmlQuoteTab[256] = { 103 ['<'] = { .n = 6, .s = "\\u003c" }, 104 ['>'] = { .n = 6, .s = "\\u003e" }, 105 ['&'] = { .n = 6, .s = "\\u0026" }, 106 // \u2028 and \u2029 is [E2 80 A8] and [E2 80 A9] 107 [0xe2] = { .n = 0, .s = {0} }, 108 [0xa8] = { .n = 6, .s = "\\u2028" }, 109 [0xa9] = { .n = 6, .s = "\\u2029" }, 110 }; 111 112 static inline __m128i _mm_find_quote(__m128i vv) { 113 __m128i e1 = _mm_cmpgt_epi8 (vv, _mm_set1_epi8(-1)); 114 __m128i e2 = _mm_cmpgt_epi8 (vv, _mm_set1_epi8(31)); 115 __m128i e3 = _mm_cmpeq_epi8 (vv, _mm_set1_epi8('"')); 116 __m128i e4 = _mm_cmpeq_epi8 (vv, _mm_set1_epi8('\\')); 117 __m128i r1 = _mm_andnot_si128 (e2, e1); 118 __m128i r2 = _mm_or_si128 (e3, e4); 119 __m128i rv = _mm_or_si128 (r1, r2); 120 return rv; 121 } 122 123 #if USE_AVX2 124 static inline __m256i _mm256_find_quote(__m256i vv) { 125 __m256i e1 = _mm256_cmpgt_epi8 (vv, _mm256_set1_epi8(-1)); 126 __m256i e2 = _mm256_cmpgt_epi8 (vv, _mm256_set1_epi8(31)); 127 __m256i e3 = _mm256_cmpeq_epi8 (vv, _mm256_set1_epi8('"')); 128 __m256i e4 = _mm256_cmpeq_epi8 (vv, _mm256_set1_epi8('\\')); 129 __m256i r1 = _mm256_andnot_si256 (e2, e1); 130 __m256i r2 = _mm256_or_si256 (e3, e4); 131 __m256i rv = _mm256_or_si256 (r1, r2); 132 return rv; 133 } 134 #endif 135 136 static inline ssize_t memcchr_quote(const char *sp, ssize_t nb, char *dp, ssize_t dn) { 137 uint32_t mm; 138 const char * ss = sp; 139 140 #if USE_AVX2 141 /* 32-byte loop, full store */ 142 while (nb >= 32 && dn >= 32) { 143 __m256i vv = _mm256_loadu_si256 ((const void *)sp); 144 __m256i rv = _mm256_find_quote (vv); 145 _mm256_storeu_si256 ((void *)dp, vv); 146 147 /* check for matches */ 148 if ((mm = _mm256_movemask_epi8(rv)) != 0) { 149 return sp - ss + __builtin_ctz(mm); 150 } 151 152 /* move to next block */ 153 sp += 32; 154 dp += 32; 155 nb -= 32; 156 dn -= 32; 157 } 158 159 /* 32-byte test, partial store */ 160 if (nb >= 32) { 161 __m256i vv = _mm256_loadu_si256 ((const void *)sp); 162 __m256i rv = _mm256_find_quote (vv); 163 uint32_t mv = _mm256_movemask_epi8 (rv); 164 uint32_t fv = __builtin_ctzll ((uint64_t)mv | 0x0100000000); 165 166 /* copy at most `dn` characters */ 167 if (fv <= dn) { 168 memcpy_p32(dp, sp, fv); 169 return sp - ss + fv; 170 } else { 171 memcpy_p32(dp, sp, dn); 172 return -(sp - ss + dn) - 1; 173 } 174 } 175 176 /* clear upper half to avoid AVX-SSE transition penalty */ 177 _mm256_zeroupper(); 178 #endif 179 180 /* 16-byte loop, full store */ 181 while (nb >= 16 && dn >= 16) { 182 __m128i vv = _mm_loadu_si128 ((const void *)sp); 183 __m128i rv = _mm_find_quote (vv); 184 _mm_storeu_si128 ((void *)dp, vv); 185 186 /* check for matches */ 187 if ((mm = _mm_movemask_epi8(rv)) != 0) { 188 return sp - ss + __builtin_ctz(mm); 189 } 190 191 /* move to next block */ 192 sp += 16; 193 dp += 16; 194 nb -= 16; 195 dn -= 16; 196 } 197 198 /* 16-byte test, partial store */ 199 if (nb >= 16) { 200 __m128i vv = _mm_loadu_si128 ((const void *)sp); 201 __m128i rv = _mm_find_quote (vv); 202 uint32_t mv = _mm_movemask_epi8 (rv); 203 uint32_t fv = __builtin_ctz (mv | 0x010000); 204 205 /* copy at most `dn` characters */ 206 if (fv <= dn) { 207 memcpy_p16(dp, sp, fv); 208 return sp - ss + fv; 209 } else { 210 memcpy_p16(dp, sp, dn); 211 return -(sp - ss + dn) - 1; 212 } 213 } 214 215 /* handle the remaining bytes with scalar code */ 216 while (nb > 0 && dn > 0) { 217 if (_SingleQuoteTab[*(uint8_t *)sp].n) { 218 return sp - ss; 219 } else { 220 dn--, nb--; 221 *dp++ = *sp++; 222 } 223 } 224 225 /* check for dest buffer */ 226 if (nb == 0) { 227 return sp - ss; 228 } else { 229 return -(sp - ss) - 1; 230 } 231 } 232 233 static const bool _EscTab[256] = { 234 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00-0x0F 235 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10-0x1F 236 // '"' 237 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20-0x2F 238 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30-0x3F 239 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40-0x4F 240 // '"" 241 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, // 0x50-0x5F 242 // 0x60-0xFF are zeroes 243 }; 244 245 static inline uint8_t escape_mask4(const char *sp) { 246 return _EscTab[*(uint8_t *)(sp)] | (_EscTab[*(uint8_t *)(sp + 1)] << 1) | (_EscTab[*(uint8_t *)(sp + 2)] << 2) | (_EscTab[*(uint8_t *)(sp + 3)] << 3); 247 } 248 249 static inline ssize_t memcchr_quote_unsafe(const char *sp, ssize_t nb, char *dp, const quoted_t * tab) { 250 uint32_t mm; 251 const char * ds = dp; 252 size_t cn = 0; 253 254 simd_copy: 255 256 if (nb < 16) goto scalar_copy; 257 258 #if USE_AVX2 259 /* 32-byte loop, full store */ 260 while (nb >= 32) { 261 __m256i vv = _mm256_loadu_si256 ((const void *)sp); 262 __m256i rv = _mm256_find_quote (vv); 263 _mm256_storeu_si256 ((void *)dp, vv); 264 265 /* check for matches */ 266 if ((mm = _mm256_movemask_epi8(rv)) != 0) { 267 cn = __builtin_ctz(mm); 268 sp += cn; 269 nb -= cn; 270 dp += cn; 271 goto escape; 272 } 273 274 /* move to next block */ 275 sp += 32; 276 dp += 32; 277 nb -= 32; 278 } 279 280 /* clear upper half to avoid AVX-SSE transition penalty */ 281 _mm256_zeroupper(); 282 #endif 283 284 /* 16-byte loop, full store */ 285 while (nb >= 16) { 286 __m128i vv = _mm_loadu_si128 ((const void *)sp); 287 __m128i rv = _mm_find_quote (vv); 288 _mm_storeu_si128 ((void *)dp, vv); 289 290 /* check for matches */ 291 if ((mm = _mm_movemask_epi8(rv)) != 0) { 292 cn = __builtin_ctz(mm); 293 sp += cn; 294 nb -= cn; 295 dp += cn; 296 goto escape; 297 } 298 299 /* move to next block */ 300 sp += 16; 301 dp += 16; 302 nb -= 16; 303 } 304 305 /* handle the remaining bytes with scalar code */ 306 // while (nb > 0) { 307 // if (_EscTab[*(uint8_t *)sp]) { 308 // goto escape; 309 // } else { 310 // nb--; 311 // *dp++ = *sp++; 312 // } 313 // } 314 // optimize: loop unrolling here 315 316 scalar_copy: 317 if (nb >= 8) { 318 uint8_t mask1 = escape_mask4(sp); 319 *(uint64_t *)dp = *(const uint64_t *)sp; 320 if (unlikely(mask1)) { 321 cn = __builtin_ctz(mask1); 322 sp += cn; 323 nb -= cn; 324 dp += cn; 325 goto escape; 326 } 327 uint8_t mask2 = escape_mask4(sp + 4); 328 if (unlikely(mask2)) { 329 cn = __builtin_ctz(mask2); 330 sp += cn + 4; 331 nb -= cn + 4; 332 dp += cn + 4; 333 goto escape; 334 } 335 dp += 8, sp += 8, nb -= 8; 336 } 337 338 if (nb >= 4) { 339 uint8_t mask2 = escape_mask4(sp); 340 *(uint32_t *)dp = *(const uint32_t *)sp; 341 if (unlikely(mask2)) { 342 cn = __builtin_ctz(mask2); 343 sp += cn; 344 nb -= cn; 345 dp += cn; 346 goto escape; 347 } 348 dp += 4, sp += 4, nb -= 4; 349 } 350 351 while (nb > 0) { 352 if (unlikely(_EscTab[*(uint8_t *)(sp)])) goto escape; 353 *dp++ = *sp++, nb--; 354 } 355 /* all quote done */ 356 return dp - ds; 357 escape: 358 /* get the escape entry, handle consecutive quotes */ 359 do { 360 uint8_t ch = *(uint8_t *)sp; 361 int nc = tab[ch].n; 362 /* copy the quoted value. 363 * Note: dp always has at least 8 bytes (MAX_ESCAPED_BYTES) here. 364 * so, we not use memcpy_p8(dp, tab[ch].s, nc); 365 */ 366 *(uint64_t *)dp = *(const uint64_t *)tab[ch].s; 367 sp++; 368 nb--; 369 dp += nc; 370 if (nb <= 0) break; 371 /* copy and find escape chars */ 372 if (_EscTab[*(uint8_t *)(sp)] == 0) { 373 goto simd_copy; 374 } 375 } while (true); 376 return dp - ds; 377 } 378 379 ssize_t quote(const char *sp, ssize_t nb, char *dp, ssize_t *dn, uint64_t flags) { 380 ssize_t nd = *dn; 381 const char * ds = dp; 382 const char * ss = sp; 383 const quoted_t * tab; 384 385 /* select quoting table */ 386 if (!(flags & F_DBLUNQ)) { 387 tab = _SingleQuoteTab; 388 } else { 389 tab = _DoubleQuoteTab; 390 } 391 392 if (*dn >= nb * MAX_ESCAPED_BYTES) { 393 *dn = memcchr_quote_unsafe(sp, nb, dp, tab); 394 return nb; 395 } 396 397 /* find the special characters, copy on the fly */ 398 while (nb != 0) { 399 int nc; 400 uint8_t ch; 401 ssize_t rb = memcchr_quote(sp, nb, dp, nd); 402 403 /* not enough buffer space */ 404 if (rb < 0) { 405 *dn = dp - ds - rb - 1; 406 return -(sp - ss - rb - 1) - 1; 407 } 408 409 /* skip already copied bytes */ 410 sp += rb; 411 dp += rb; 412 nb -= rb; 413 nd -= rb; 414 415 /* get the escape entry, handle consecutive quotes */ 416 while (nb != 0) { 417 ch = *(uint8_t *)sp; 418 nc = tab[ch].n; 419 420 /* check for escape character */ 421 if (nc == 0) { 422 break; 423 } 424 425 /* check for buffer space */ 426 if (nc > nd) { 427 *dn = dp - ds; 428 return -(sp - ss) - 1; 429 } 430 431 /* copy the quoted value */ 432 memcpy_p8(dp, tab[ch].s, nc); 433 sp++; 434 nb--; 435 dp += nc; 436 nd -= nc; 437 } 438 } 439 440 /* all done */ 441 *dn = dp - ds; 442 return sp - ss; 443 } 444 445 /** String Unquoting **/ 446 447 static const char _UnquoteTab[256] = { 448 ['/' ] = '/', 449 ['"' ] = '"', 450 ['b' ] = '\b', 451 ['f' ] = '\f', 452 ['n' ] = '\n', 453 ['r' ] = '\r', 454 ['t' ] = '\t', 455 ['u' ] = -1, 456 ['\\'] = '\\', 457 }; 458 459 static inline ssize_t memcchr_p32(const char *s, ssize_t nb, char *p) { 460 int64_t r; 461 ssize_t n = nb; 462 const char * q = s; 463 464 #if USE_AVX2 465 __m256i u; 466 __m256i v; 467 __m256i b = _mm256_set1_epi8('\\'); 468 469 /* process every 32 bytes */ 470 while (n >= 32) { 471 u = _mm256_loadu_si256 ((const void *)s); 472 v = _mm256_cmpeq_epi8 (u, b); 473 _mm256_storeu_si256 ((void *)p, u); 474 475 /* check for matches */ 476 if ((r = _mm256_movemask_epi8(v)) != 0) { 477 return s - q + __builtin_ctzll(r); 478 } 479 480 /* move to the next 32 bytes */ 481 s += 32; 482 p += 32; 483 n -= 32; 484 } 485 486 /* clear upper half to avoid AVX-SSE transition penalty */ 487 _mm256_zeroupper(); 488 #endif 489 490 /* initialze with '\\' */ 491 __m128i x; 492 __m128i y; 493 __m128i a = _mm_set1_epi8('\\'); 494 495 /* process every 16 bytes */ 496 while (n >= 16) { 497 x = _mm_loadu_si128 ((const void *)s); 498 y = _mm_cmpeq_epi8 (x, a); 499 _mm_storeu_si128 ((void *)p, x); 500 501 /* check for matches */ 502 if ((r = _mm_movemask_epi8(y)) != 0) { 503 return s - q + __builtin_ctzll(r); 504 } 505 506 /* move to the next 16 bytes */ 507 s += 16; 508 p += 16; 509 n -= 16; 510 } 511 512 /* remaining bytes, do with scalar code */ 513 while (n--) { 514 if (*s != '\\') { 515 *p++ = *s++; 516 } else { 517 return s - q; 518 } 519 } 520 521 /* nothing found, but everything was copied */ 522 return -1; 523 } 524 525 #define ALL_01h (~0ul / 255) 526 #define ALL_7fh (ALL_01h * 127) 527 #define ALL_80h (ALL_01h * 128) 528 529 static inline uint32_t hasless(uint32_t x, uint8_t n) { 530 return (x - ALL_01h * n) & ~x & ALL_80h; 531 } 532 533 static inline uint32_t hasmore(uint32_t x, uint8_t n) { 534 return (x + ALL_01h * (127 - n) | x) & ALL_80h; 535 } 536 537 static inline uint32_t hasbetween(uint32_t x, uint8_t m, uint8_t n) { 538 return (ALL_01h * (127 + n) - (x & ALL_7fh) & ~x & (x & ALL_7fh) + ALL_01h * (127 - m)) & ALL_80h; 539 } 540 541 #undef ALL_01h 542 #undef ALL_7fh 543 #undef ALL_80h 544 545 static inline char ishex(char c) { 546 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); 547 } 548 549 static inline void unirep(char **dp) { 550 *(*dp)++ = 0xef; 551 *(*dp)++ = 0xbf; 552 *(*dp)++ = 0xbd; 553 } 554 555 static inline char unhex16_is(const char *s) { 556 uint32_t v = *(uint32_t *)s; 557 return !(hasless(v, '0') || hasmore(v, 'f') || hasbetween(v, '9', 'A') || hasbetween(v, 'F', 'a')); 558 } 559 560 static inline uint32_t unhex16_fast(const char *s) { 561 uint32_t a = __builtin_bswap32(*(uint32_t *)s); 562 uint32_t b = 9 * ((~a & 0x10101010) >> 4) + (a & 0x0f0f0f0f); 563 uint32_t c = (b >> 4) | b; 564 uint32_t d = ((c >> 8) & 0xff00) | (c & 0x00ff); 565 return d; 566 } 567 568 ssize_t unquote(const char *sp, ssize_t nb, char *dp, ssize_t *ep, uint64_t flags) { 569 ssize_t n; 570 ssize_t x = nb; 571 const char * s = sp; 572 const char * p = dp; 573 574 /* scan & copy all the non-escape characters */ 575 while (nb && (n = (*sp == '\\' ? 0 : memcchr_p32(sp, nb, dp))) != -1) { 576 char cc; 577 uint32_t r0; 578 uint32_t r1; 579 580 /* skip the plain text */ 581 dp += n; 582 sp += n + 2; 583 nb -= n + 2; 584 585 /* check for EOF */ 586 if (nb < 0) { 587 *ep = x; 588 return -ERR_EOF; 589 } 590 591 /* check for double unquote */ 592 if (unlikely(flags & F_DBLUNQ)) { 593 int nr = nb; 594 char c1 = sp[-1]; 595 596 /* must have at least 1 character left */ 597 if (nr == 0) { 598 *ep = x; 599 return -ERR_EOF; 600 } 601 602 /* every quote must be a double quote */ 603 if (c1 != '\\') { 604 *ep = sp - s - 1; 605 return -ERR_INVAL; 606 } 607 608 /* special case of '\\\\' and '\\\"' */ 609 if (*sp == '\\') { 610 if (nr < 2) { 611 *ep = x; 612 return -ERR_EOF; 613 } else if (sp[1] != '"' && sp[1] != '\\') { 614 *ep = sp - s + 1; 615 return -ERR_INVAL; 616 } else { 617 sp++; 618 nb--; 619 } 620 } 621 622 /* skip the second escape */ 623 sp++; 624 nb--; 625 } 626 627 /* check for escape sequence */ 628 if ((cc = _UnquoteTab[(uint8_t)sp[-1]]) == 0) { 629 *ep = sp - s - 1; 630 return -ERR_ESCAPE; 631 } 632 633 /* check for simple escape sequence */ 634 if (cc != -1) { 635 *dp++ = cc; 636 continue; 637 } 638 639 /* must have at least 4 characters */ 640 if (nb < 4) { 641 *ep = x; 642 return -ERR_EOF; 643 } 644 645 /* check for hexadecimal characters */ 646 if (!unhex16_is(sp)) { 647 *ep = sp - s; 648 for (int i = 0; i < 4 && ishex(*sp); i++, sp++) ++*ep; 649 return -ERR_INVAL; 650 } 651 652 /* decode the code-point */ 653 r0 = unhex16_fast(sp); 654 sp += 4; 655 nb -= 4; 656 657 /* from line 598 */ 658 retry_decode: 659 660 /* ASCII characters, unlikely */ 661 if (unlikely(r0 <= 0x7f)) { 662 *dp++ = (char)r0; 663 continue; 664 } 665 666 /* latin-1 characters, unlikely */ 667 if (unlikely(r0 <= 0x07ff)) { 668 *dp++ = (char)(0xc0 | (r0 >> 6)); 669 *dp++ = (char)(0x80 | (r0 & 0x3f)); 670 continue; 671 } 672 673 /* 3-byte characters, likely */ 674 if (likely(r0 < 0xd800 || r0 > 0xdfff)) { 675 *dp++ = (char)(0xe0 | ((r0 >> 12) )); 676 *dp++ = (char)(0x80 | ((r0 >> 6) & 0x3f)); 677 *dp++ = (char)(0x80 | ((r0 ) & 0x3f)); 678 continue; 679 } 680 681 /* check for double unquote */ 682 if (unlikely(flags & F_DBLUNQ)) { 683 if (nb < 1) { 684 if (likely(flags & F_UNIREP)) { 685 unirep(&dp); 686 continue; 687 } else { 688 *ep = x; 689 return -ERR_EOF; 690 } 691 } else { 692 if (sp[0] == '\\') { 693 nb--; 694 sp++; 695 } else if (likely(flags & F_UNIREP)) { 696 unirep(&dp); 697 continue; 698 } else { 699 *ep = sp - s - 4; 700 return -ERR_UNICODE; 701 } 702 } 703 } 704 705 /* surrogate half, must follows by the other half */ 706 if (nb < 6 || r0 > 0xdbff || sp[0] != '\\' || sp[1] != 'u') { 707 if (likely(flags & F_UNIREP)) { 708 unirep(&dp); 709 continue; 710 } else { 711 *ep = sp - s - ((flags & F_DBLUNQ) ? 5 : 4); 712 return -ERR_UNICODE; 713 } 714 } 715 716 /* check the hexadecimal escape */ 717 if (!unhex16_is(sp + 2)) { 718 *ep = sp - s + 2; 719 for (int i = 2; i < 6 && ishex(sp[i]); i++) ++*ep; 720 return -ERR_INVAL; 721 } 722 723 /* decode the second code-point */ 724 r1 = unhex16_fast(sp + 2); 725 sp += 6; 726 nb -= 6; 727 728 /* it must be the other half */ 729 if (r1 < 0xdc00 || r1 > 0xdfff) { 730 if (unlikely(!(flags & F_UNIREP))) { 731 *ep = sp - s - 4; 732 return -ERR_UNICODE; 733 } else { 734 r0 = r1; 735 unirep(&dp); 736 goto retry_decode; 737 } 738 } 739 740 /* merge two surrogates */ 741 r0 = (r0 - 0xd800) << 10; 742 r1 = (r1 - 0xdc00) + 0x010000; 743 r0 += r1; 744 745 /* check the code point range */ 746 if (r0 > 0x10ffff) { 747 if (likely(!(flags & F_UNIREP))) { 748 *ep = sp - s - 4; 749 return -ERR_UNICODE; 750 } else { 751 unirep(&dp); 752 continue; 753 } 754 } 755 756 /* encode the character */ 757 *dp++ = (char)(0xf0 | ((r0 >> 18) )); 758 *dp++ = (char)(0x80 | ((r0 >> 12) & 0x3f)); 759 *dp++ = (char)(0x80 | ((r0 >> 6) & 0x3f)); 760 *dp++ = (char)(0x80 | ((r0 ) & 0x3f)); 761 } 762 763 /* calculate the result length */ 764 return dp + nb - p; 765 } 766 767 static inline __m128i _mm_find_html(__m128i vv) { 768 __m128i e1 = _mm_cmpeq_epi8 (vv, _mm_set1_epi8('<')); 769 __m128i e2 = _mm_cmpeq_epi8 (vv, _mm_set1_epi8('>')); 770 __m128i e3 = _mm_cmpeq_epi8 (vv, _mm_set1_epi8('&')); 771 __m128i e4 = _mm_cmpeq_epi8 (vv, _mm_set1_epi8('\xe2')); 772 __m128i r1 = _mm_or_si128 (e1, e2); 773 __m128i r2 = _mm_or_si128 (e3, e4); 774 __m128i rv = _mm_or_si128 (r1, r2); 775 return rv; 776 } 777 778 #if USE_AVX2 779 static inline __m256i _mm256_find_html(__m256i vv) { 780 __m256i e1 = _mm256_cmpeq_epi8 (vv, _mm256_set1_epi8('<')); 781 __m256i e2 = _mm256_cmpeq_epi8 (vv, _mm256_set1_epi8('>')); 782 __m256i e3 = _mm256_cmpeq_epi8 (vv, _mm256_set1_epi8('&')); 783 __m256i e4 = _mm256_cmpeq_epi8 (vv, _mm256_set1_epi8('\xe2')); 784 __m256i r1 = _mm256_or_si256 (e1, e2); 785 __m256i r2 = _mm256_or_si256 (e3, e4); 786 __m256i rv = _mm256_or_si256 (r1, r2); 787 return rv; 788 } 789 #endif 790 791 static inline ssize_t memcchr_html_quote(const char *sp, ssize_t nb, char *dp, ssize_t dn) { 792 uint32_t mm; 793 const char * ss = sp; 794 795 #if USE_AVX2 796 /* 32-byte loop, full store */ 797 while (nb >= 32 && dn >= 32) { 798 __m256i vv = _mm256_loadu_si256 ((const void *)sp); 799 __m256i rv = _mm256_find_html (vv); 800 _mm256_storeu_si256 ((void *)dp, vv); 801 802 /* check for matches */ 803 if ((mm = _mm256_movemask_epi8(rv)) != 0) { 804 return sp - ss + __builtin_ctz(mm); 805 } 806 807 /* move to next block */ 808 sp += 32; 809 dp += 32; 810 nb -= 32; 811 dn -= 32; 812 } 813 814 /* 32-byte test, partial store */ 815 if (nb >= 32) { 816 __m256i vv = _mm256_loadu_si256 ((const void *)sp); 817 __m256i rv = _mm256_find_html (vv); 818 uint32_t mv = _mm256_movemask_epi8 (rv); 819 uint32_t fv = __builtin_ctzll ((uint64_t)mv | 0x0100000000); 820 821 /* copy at most `dn` characters */ 822 if (fv <= dn) { 823 memcpy_p32(dp, sp, fv); 824 return sp - ss + fv; 825 } else { 826 memcpy_p32(dp, sp, dn); 827 return -(sp - ss + dn) - 1; 828 } 829 } 830 831 /* clear upper half to avoid AVX-SSE transition penalty */ 832 _mm256_zeroupper(); 833 #endif 834 835 /* 16-byte loop, full store */ 836 while (nb >= 16 && dn >= 16) { 837 __m128i vv = _mm_loadu_si128 ((const void *)sp); 838 __m128i rv = _mm_find_html (vv); 839 _mm_storeu_si128 ((void *)dp, vv); 840 841 /* check for matches */ 842 if ((mm = _mm_movemask_epi8(rv)) != 0) { 843 return sp - ss + __builtin_ctz(mm); 844 } 845 846 /* move to next block */ 847 sp += 16; 848 dp += 16; 849 nb -= 16; 850 dn -= 16; 851 } 852 853 /* 16-byte test, partial store */ 854 if (nb >= 16) { 855 __m128i vv = _mm_loadu_si128 ((const void *)sp); 856 __m128i rv = _mm_find_html (vv); 857 uint32_t mv = _mm_movemask_epi8 (rv); 858 uint32_t fv = __builtin_ctz (mv | 0x010000); 859 860 /* copy at most `dn` characters */ 861 if (fv <= dn) { 862 memcpy_p16(dp, sp, fv); 863 return sp - ss + fv; 864 } else { 865 memcpy_p16(dp, sp, dn); 866 return -(sp - ss + dn) - 1; 867 } 868 } 869 870 /* handle the remaining bytes with scalar code */ 871 while (nb > 0 && dn > 0) { 872 if (*sp == '<' || *sp == '>' || *sp == '&' || *sp == '\xe2') { 873 return sp - ss; 874 } else { 875 dn--, nb--; 876 *dp++ = *sp++; 877 } 878 } 879 880 /* check for dest buffer */ 881 if (nb == 0) { 882 return sp - ss; 883 } else { 884 return -(sp - ss) - 1; 885 } 886 } 887 888 ssize_t html_escape(const char *sp, ssize_t nb, char *dp, ssize_t *dn) { 889 ssize_t nd = *dn; 890 const char * ds = dp; 891 const char * ss = sp; 892 const quoted_t * tab = _HtmlQuoteTab; 893 894 /* find the special characters, copy on the fly */ 895 while (nb > 0) { 896 int nc = 0; 897 uint8_t ch = 0; 898 ssize_t rb = 0; 899 const char * cur = 0; 900 901 /* not enough buffer space */ 902 if (nd <= 0) { 903 return -(sp - ss) - 1; 904 } 905 906 /* find and copy */ 907 if ((rb = memcchr_html_quote(sp, nb, dp, nd)) < 0) { 908 *dn = dp - ds - rb - 1; 909 return -(sp - ss - rb - 1) - 1; 910 } 911 912 /* skip already copied bytes */ 913 sp += rb; 914 dp += rb; 915 nb -= rb; 916 nd -= rb; 917 918 /* stop if already finished */ 919 if (nb <= 0) { 920 break; 921 } 922 923 /* mark cur postion */ 924 cur = sp; 925 926 /* check for \u2028 and \u2029, binary is \xe2\x80\xa8 and \xe2\x80\xa9 */ 927 if (unlikely(*sp == '\xe2')) { 928 if (nb >= 3 && *(sp+1) == '\x80' && (*(sp+2) == '\xa8' || *(sp+2) == '\xa9')) { 929 sp += 2, nb -= 2; 930 } else if (nd > 0) { 931 *dp++ = *sp++; 932 nb--, nd--; 933 continue; 934 } else { 935 return -(sp - ss) - 1; 936 } 937 } 938 939 /* get the escape entry, handle consecutive quotes */ 940 ch = * (uint8_t*) sp; 941 nc = tab[ch].n; 942 943 944 /* check for buffer space */ 945 if (nd < nc) { 946 *dn = dp - ds; 947 return -(cur - ss) - 1; 948 } 949 950 /* copy the quoted value */ 951 memcpy_p8(dp, tab[ch].s, nc); 952 sp++; 953 nb--; 954 dp += nc; 955 nd -= nc; 956 } 957 958 /* all done */ 959 *dn = dp - ds; 960 return sp - ss; 961 } 962 963 #undef MAX_ESCAPED_BYTES 964 965 static inline long unescape(const char** src, const char* end, char* dp) { 966 const char* sp = *src; 967 long nb = end - sp; 968 char cc = 0; 969 uint32_t r0, r1; 970 971 if (nb <= 0) return -ERR_EOF; 972 973 if ((cc = _UnquoteTab[(uint8_t)sp[1]]) == 0) { 974 *src += 1; 975 return -ERR_ESCAPE; 976 } 977 978 if (cc != -1) { 979 *dp = cc; 980 *src += 2; 981 return 1; 982 } 983 984 if (nb < 4) { 985 *src += 1; 986 return -ERR_EOF; 987 } 988 989 /* check for hexadecimal characters */ 990 if (!unhex16_is(sp + 2)) { 991 *src += 2; 992 return -ERR_INVAL; 993 } 994 995 /* decode the code-point */ 996 r0 = unhex16_fast(sp + 2); 997 sp += 6; 998 *src = sp; 999 1000 /* ASCII characters, unlikely */ 1001 if (unlikely(r0 <= 0x7f)) { 1002 *dp++ = (char)r0; 1003 return 1; 1004 } 1005 1006 /* latin-1 characters, unlikely */ 1007 if (unlikely(r0 <= 0x07ff)) { 1008 *dp++ = (char)(0xc0 | (r0 >> 6)); 1009 *dp++ = (char)(0x80 | (r0 & 0x3f)); 1010 return 2; 1011 } 1012 1013 /* 3-byte characters, likely */ 1014 if (likely(r0 < 0xd800 || r0 > 0xdfff)) { 1015 *dp++ = (char)(0xe0 | ((r0 >> 12) )); 1016 *dp++ = (char)(0x80 | ((r0 >> 6) & 0x3f)); 1017 *dp++ = (char)(0x80 | ((r0 ) & 0x3f)); 1018 return 3; 1019 } 1020 1021 /* surrogate half, must follows by the other half */ 1022 if (nb < 6 || r0 > 0xdbff || sp[0] != '\\' || sp[1] != 'u') { 1023 return -ERR_UNICODE; 1024 } 1025 1026 /* check the hexadecimal escape */ 1027 if (!unhex16_is(sp + 2)) { 1028 *src += 2; 1029 return -ERR_INVAL; 1030 } 1031 1032 /* decode the second code-point */ 1033 r1 = unhex16_fast(sp + 2); 1034 1035 /* it must be the other half */ 1036 if (r1 < 0xdc00 || r1 > 0xdfff) { 1037 *src += 2; 1038 return -ERR_UNICODE; 1039 } 1040 1041 /* merge two surrogates */ 1042 r0 = (r0 - 0xd800) << 10; 1043 r1 = (r1 - 0xdc00) + 0x010000; 1044 r0 += r1; 1045 1046 /* encode the character */ 1047 *dp++ = (char)(0xf0 | ((r0 >> 18) )); 1048 *dp++ = (char)(0x80 | ((r0 >> 12) & 0x3f)); 1049 *dp++ = (char)(0x80 | ((r0 >> 6) & 0x3f)); 1050 *dp++ = (char)(0x80 | ((r0 ) & 0x3f)); 1051 *src = sp + 6; 1052 return 4; 1053 }