github.com/bytedance/sonic@v1.11.7-0.20240517092252-d2edb31b167b/native/unquote.c (about)

     1  #include "parsing.h"
     2  
     3  
     4  ssize_t unquote(const char *sp, ssize_t nb, char *dp, ssize_t *ep, uint64_t flags) {
     5      ssize_t      n;
     6      ssize_t      x = nb;
     7      const char * s = sp;
     8      const char * p = dp;
     9  
    10      /* scan & copy all the non-escape characters */
    11      while (nb && (n = (*sp == '\\' ? 0 : memcchr_p32(sp, nb, dp))) != -1) {
    12          char     cc;
    13          uint32_t r0;
    14          uint32_t r1;
    15  
    16          /* skip the plain text */
    17          dp += n;
    18          sp += n + 2;
    19          nb -= n + 2;
    20  
    21          /* check for EOF */
    22          if (nb < 0) {
    23              *ep = x;
    24              return -ERR_EOF;
    25          }
    26  
    27          /* check for double unquote */
    28          if (unlikely(flags & F_DBLUNQ)) {
    29              int  nr = nb;
    30              char c1 = sp[-1];
    31  
    32              /* must have at least 1 character left */
    33              if (nr == 0) {
    34                  *ep = x;
    35                  return -ERR_EOF;
    36              }
    37  
    38              /* every quote must be a double quote */
    39              if (c1 != '\\') {
    40                  *ep = sp - s - 1;
    41                  return -ERR_INVAL;
    42              }
    43  
    44              /* special case of '\\\\' and '\\\"' */
    45              if (*sp == '\\') {
    46                  if (nr < 2) {
    47                      *ep = x;
    48                      return -ERR_EOF;
    49                  } else if (sp[1] != '"' && sp[1] != '\\') {
    50                      *ep = sp - s + 1;
    51                      return -ERR_INVAL;
    52                  } else {
    53                      sp++;
    54                      nb--;
    55                  }
    56              }
    57  
    58              /* skip the second escape */
    59              sp++;
    60              nb--;
    61          }
    62  
    63          /* check for escape sequence */
    64          if ((cc = _UnquoteTab[(uint8_t)sp[-1]]) == 0) {
    65              *ep = sp - s - 1;
    66              return -ERR_ESCAPE;
    67          }
    68  
    69          /* check for simple escape sequence */
    70          if (cc != -1) {
    71              *dp++ = cc;
    72              continue;
    73          }
    74  
    75          /* must have at least 4 characters */
    76          if (nb < 4) {
    77              *ep = x;
    78              return -ERR_EOF;
    79          }
    80  
    81          /* check for hexadecimal characters */
    82          if (!unhex16_is(sp)) {
    83              *ep = sp - s;
    84              for (int i = 0; i < 4 && ishex(*sp); i++, sp++) ++*ep;
    85              return -ERR_INVAL;
    86          }
    87  
    88          /* decode the code-point */
    89          r0 = unhex16_fast(sp);
    90          sp += 4;
    91          nb -= 4;
    92  
    93      /* from line 598 */
    94      retry_decode:
    95  
    96          /* ASCII characters, unlikely */
    97          if (unlikely(r0 <= 0x7f)) {
    98              *dp++ = (char)r0;
    99              continue;
   100          }
   101  
   102          /* latin-1 characters, unlikely */
   103          if (unlikely(r0 <= 0x07ff)) {
   104              *dp++ = (char)(0xc0 | (r0 >> 6));
   105              *dp++ = (char)(0x80 | (r0 & 0x3f));
   106              continue;
   107          }
   108  
   109          /* 3-byte characters, likely */
   110          if (likely(r0 < 0xd800 || r0 > 0xdfff)) {
   111              *dp++ = (char)(0xe0 | ((r0 >> 12)       ));
   112              *dp++ = (char)(0x80 | ((r0 >>  6) & 0x3f));
   113              *dp++ = (char)(0x80 | ((r0      ) & 0x3f));
   114              continue;
   115          }
   116  
   117          /* check for double unquote */
   118          if (unlikely(flags & F_DBLUNQ)) {
   119              if (nb < 1) {
   120                  if (likely(flags & F_UNIREP)) {
   121                      unirep(&dp);
   122                      continue;
   123                  } else {
   124                      *ep = x;
   125                      return -ERR_EOF;
   126                  }
   127              } else {
   128                  if (sp[0] == '\\') {
   129                      nb--;
   130                      sp++;
   131                  } else if (likely(flags & F_UNIREP)) {
   132                      unirep(&dp);
   133                      continue;
   134                  } else {
   135                      *ep = sp - s - 4;
   136                      return -ERR_UNICODE;
   137                  }
   138              }
   139          }
   140  
   141          /* surrogate half, must follows by the other half */
   142          if (nb < 6 || r0 > 0xdbff || sp[0] != '\\' || sp[1] != 'u') {
   143              if (likely(flags & F_UNIREP)) {
   144                  unirep(&dp);
   145                  continue;
   146              } else {
   147                  *ep = sp - s - ((flags & F_DBLUNQ) ? 5 : 4);
   148                  return -ERR_UNICODE;
   149              }
   150          }
   151  
   152          /* check the hexadecimal escape */
   153          if (!unhex16_is(sp + 2)) {
   154              *ep = sp - s + 2;
   155              for (int i = 2; i < 6 && ishex(sp[i]); i++) ++*ep;
   156              return -ERR_INVAL;
   157          }
   158  
   159          /* decode the second code-point */
   160          r1 = unhex16_fast(sp + 2);
   161          sp += 6;
   162          nb -= 6;
   163  
   164          /* it must be the other half */
   165          if (r1 < 0xdc00 || r1 > 0xdfff) {
   166              if (unlikely(!(flags & F_UNIREP))) {
   167                  *ep = sp - s - 4;
   168                  return -ERR_UNICODE;
   169              } else {
   170                  r0 = r1;
   171                  unirep(&dp);
   172                  goto retry_decode;
   173              }
   174          }
   175  
   176          /* merge two surrogates */
   177          r0 = (r0 - 0xd800) << 10;
   178          r1 = (r1 - 0xdc00) + 0x010000;
   179          r0 += r1;
   180  
   181          /* check the code point range */
   182          if (r0 > 0x10ffff) {
   183              if (likely(!(flags & F_UNIREP))) {
   184                  *ep = sp - s - 4;
   185                  return -ERR_UNICODE;
   186              } else {
   187                  unirep(&dp);
   188                  continue;
   189              }
   190          }
   191  
   192          /* encode the character */
   193          *dp++ = (char)(0xf0 | ((r0 >> 18)       ));
   194          *dp++ = (char)(0x80 | ((r0 >> 12) & 0x3f));
   195          *dp++ = (char)(0x80 | ((r0 >>  6) & 0x3f));
   196          *dp++ = (char)(0x80 | ((r0      ) & 0x3f));
   197      }
   198  
   199      /* calculate the result length */
   200      return dp + nb - p;
   201  }