github.com/bytedance/sonic@v1.11.7-0.20240517092252-d2edb31b167b/native/unquote.c (about) 1 #include "parsing.h" 2 3 4 ssize_t unquote(const char *sp, ssize_t nb, char *dp, ssize_t *ep, uint64_t flags) { 5 ssize_t n; 6 ssize_t x = nb; 7 const char * s = sp; 8 const char * p = dp; 9 10 /* scan & copy all the non-escape characters */ 11 while (nb && (n = (*sp == '\\' ? 0 : memcchr_p32(sp, nb, dp))) != -1) { 12 char cc; 13 uint32_t r0; 14 uint32_t r1; 15 16 /* skip the plain text */ 17 dp += n; 18 sp += n + 2; 19 nb -= n + 2; 20 21 /* check for EOF */ 22 if (nb < 0) { 23 *ep = x; 24 return -ERR_EOF; 25 } 26 27 /* check for double unquote */ 28 if (unlikely(flags & F_DBLUNQ)) { 29 int nr = nb; 30 char c1 = sp[-1]; 31 32 /* must have at least 1 character left */ 33 if (nr == 0) { 34 *ep = x; 35 return -ERR_EOF; 36 } 37 38 /* every quote must be a double quote */ 39 if (c1 != '\\') { 40 *ep = sp - s - 1; 41 return -ERR_INVAL; 42 } 43 44 /* special case of '\\\\' and '\\\"' */ 45 if (*sp == '\\') { 46 if (nr < 2) { 47 *ep = x; 48 return -ERR_EOF; 49 } else if (sp[1] != '"' && sp[1] != '\\') { 50 *ep = sp - s + 1; 51 return -ERR_INVAL; 52 } else { 53 sp++; 54 nb--; 55 } 56 } 57 58 /* skip the second escape */ 59 sp++; 60 nb--; 61 } 62 63 /* check for escape sequence */ 64 if ((cc = _UnquoteTab[(uint8_t)sp[-1]]) == 0) { 65 *ep = sp - s - 1; 66 return -ERR_ESCAPE; 67 } 68 69 /* check for simple escape sequence */ 70 if (cc != -1) { 71 *dp++ = cc; 72 continue; 73 } 74 75 /* must have at least 4 characters */ 76 if (nb < 4) { 77 *ep = x; 78 return -ERR_EOF; 79 } 80 81 /* check for hexadecimal characters */ 82 if (!unhex16_is(sp)) { 83 *ep = sp - s; 84 for (int i = 0; i < 4 && ishex(*sp); i++, sp++) ++*ep; 85 return -ERR_INVAL; 86 } 87 88 /* decode the code-point */ 89 r0 = unhex16_fast(sp); 90 sp += 4; 91 nb -= 4; 92 93 /* from line 598 */ 94 retry_decode: 95 96 /* ASCII characters, unlikely */ 97 if (unlikely(r0 <= 0x7f)) { 98 *dp++ = (char)r0; 99 continue; 100 } 101 102 /* latin-1 characters, unlikely */ 103 if (unlikely(r0 <= 0x07ff)) { 104 *dp++ = (char)(0xc0 | (r0 >> 6)); 105 *dp++ = (char)(0x80 | (r0 & 0x3f)); 106 continue; 107 } 108 109 /* 3-byte characters, likely */ 110 if (likely(r0 < 0xd800 || r0 > 0xdfff)) { 111 *dp++ = (char)(0xe0 | ((r0 >> 12) )); 112 *dp++ = (char)(0x80 | ((r0 >> 6) & 0x3f)); 113 *dp++ = (char)(0x80 | ((r0 ) & 0x3f)); 114 continue; 115 } 116 117 /* check for double unquote */ 118 if (unlikely(flags & F_DBLUNQ)) { 119 if (nb < 1) { 120 if (likely(flags & F_UNIREP)) { 121 unirep(&dp); 122 continue; 123 } else { 124 *ep = x; 125 return -ERR_EOF; 126 } 127 } else { 128 if (sp[0] == '\\') { 129 nb--; 130 sp++; 131 } else if (likely(flags & F_UNIREP)) { 132 unirep(&dp); 133 continue; 134 } else { 135 *ep = sp - s - 4; 136 return -ERR_UNICODE; 137 } 138 } 139 } 140 141 /* surrogate half, must follows by the other half */ 142 if (nb < 6 || r0 > 0xdbff || sp[0] != '\\' || sp[1] != 'u') { 143 if (likely(flags & F_UNIREP)) { 144 unirep(&dp); 145 continue; 146 } else { 147 *ep = sp - s - ((flags & F_DBLUNQ) ? 5 : 4); 148 return -ERR_UNICODE; 149 } 150 } 151 152 /* check the hexadecimal escape */ 153 if (!unhex16_is(sp + 2)) { 154 *ep = sp - s + 2; 155 for (int i = 2; i < 6 && ishex(sp[i]); i++) ++*ep; 156 return -ERR_INVAL; 157 } 158 159 /* decode the second code-point */ 160 r1 = unhex16_fast(sp + 2); 161 sp += 6; 162 nb -= 6; 163 164 /* it must be the other half */ 165 if (r1 < 0xdc00 || r1 > 0xdfff) { 166 if (unlikely(!(flags & F_UNIREP))) { 167 *ep = sp - s - 4; 168 return -ERR_UNICODE; 169 } else { 170 r0 = r1; 171 unirep(&dp); 172 goto retry_decode; 173 } 174 } 175 176 /* merge two surrogates */ 177 r0 = (r0 - 0xd800) << 10; 178 r1 = (r1 - 0xdc00) + 0x010000; 179 r0 += r1; 180 181 /* check the code point range */ 182 if (r0 > 0x10ffff) { 183 if (likely(!(flags & F_UNIREP))) { 184 *ep = sp - s - 4; 185 return -ERR_UNICODE; 186 } else { 187 unirep(&dp); 188 continue; 189 } 190 } 191 192 /* encode the character */ 193 *dp++ = (char)(0xf0 | ((r0 >> 18) )); 194 *dp++ = (char)(0x80 | ((r0 >> 12) & 0x3f)); 195 *dp++ = (char)(0x80 | ((r0 >> 6) & 0x3f)); 196 *dp++ = (char)(0x80 | ((r0 ) & 0x3f)); 197 } 198 199 /* calculate the result length */ 200 return dp + nb - p; 201 }