github.com/cloudwego/dynamicgo@v0.2.6-0.20240519101509-707f41b6b834/native/base64.c (about) 1 /* 2 * Copyright 2023 CloudWeGo Authors. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <stdint.h> 18 #include <immintrin.h> 19 #include <sys/types.h> 20 #include "native.h" 21 22 #define MODE_URL 1 23 #define MODE_RAW 2 24 #define MODE_AVX2 4 25 26 #define as_m32v(v) (*(uint32_t *)(v)) 27 #define as_m64v(v) (*(uint64_t *)(v)) 28 29 #define as_m128p(v) ((__m128i *)(v)) 30 #define as_m256p(v) ((__m256i *)(v)) 31 32 #define as_m8c(v) ((const uint8_t *)(v)) 33 #define as_m128c(v) ((const __m128i *)(v)) 34 #define as_m256c(v) ((const __m256i *)(v)) 35 36 /** Exported Functions **/ 37 38 void b64encode(GoSlice *out, const GoSlice *src, int mode); 39 ssize_t b64decode(GoSlice *out, const char *src, size_t nb, int mode); 40 41 /** Encoder Helper Functions **/ 42 43 static const char TabEncodeCharsetStd[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 44 static const char TabEncodeCharsetURL[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; 45 46 static const uint8_t VecEncodeShuffles[32] = { 47 1, 48 0, 49 2, 50 1, 51 4, 52 3, 53 5, 54 4, 55 7, 56 6, 57 8, 58 7, 59 10, 60 9, 61 11, 62 10, 63 1, 64 0, 65 2, 66 1, 67 4, 68 3, 69 5, 70 4, 71 7, 72 6, 73 8, 74 7, 75 10, 76 9, 77 11, 78 10, 79 }; 80 81 static const uint8_t VecEncodeCharsetStd[32] = { 82 'a' - 26, 83 '0' - 52, 84 '0' - 52, 85 '0' - 52, 86 '0' - 52, 87 '0' - 52, 88 '0' - 52, 89 '0' - 52, 90 '0' - 52, 91 '0' - 52, 92 '0' - 52, 93 '+' - 62, 94 '/' - 63, 95 'A', 96 0, 97 0, 98 'a' - 26, 99 '0' - 52, 100 '0' - 52, 101 '0' - 52, 102 '0' - 52, 103 '0' - 52, 104 '0' - 52, 105 '0' - 52, 106 '0' - 52, 107 '0' - 52, 108 '0' - 52, 109 '+' - 62, 110 '/' - 63, 111 'A', 112 0, 113 0, 114 }; 115 116 static const uint8_t VecEncodeCharsetURL[32] = { 117 'a' - 26, 118 '0' - 52, 119 '0' - 52, 120 '0' - 52, 121 '0' - 52, 122 '0' - 52, 123 '0' - 52, 124 '0' - 52, 125 '0' - 52, 126 '0' - 52, 127 '0' - 52, 128 '-' - 62, 129 '_' - 63, 130 'A', 131 0, 132 0, 133 'a' - 26, 134 '0' - 52, 135 '0' - 52, 136 '0' - 52, 137 '0' - 52, 138 '0' - 52, 139 '0' - 52, 140 '0' - 52, 141 '0' - 52, 142 '0' - 52, 143 '0' - 52, 144 '-' - 62, 145 '_' - 63, 146 'A', 147 0, 148 0, 149 }; 150 151 static inline __m256i encode_avx2(__m128i v0, __m128i v1, const uint8_t *tab) 152 { 153 __m256i vv = _mm256_set_m128i(v1, v0); 154 __m256i sh = _mm256_loadu_si256(as_m256c(VecEncodeShuffles)); 155 __m256i in = _mm256_shuffle_epi8(vv, sh); 156 __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); 157 __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); 158 __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); 159 __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); 160 __m256i vi = _mm256_or_si256(t1, t3); 161 __m256i s0 = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), vi); 162 __m256i s1 = _mm256_and_si256(_mm256_set1_epi8(13), s0); 163 __m256i s2 = _mm256_loadu_si256(as_m256c(tab)); 164 __m256i r0 = _mm256_subs_epu8(vi, _mm256_set1_epi8(51)); 165 __m256i r1 = _mm256_or_si256(r0, s1); 166 __m256i r2 = _mm256_shuffle_epi8(s2, r1); 167 __m256i r3 = _mm256_add_epi8(vi, r2); 168 return r3; 169 } 170 171 /** Function Implementations **/ 172 173 void b64encode(GoSlice *out, const GoSlice *src, int mode) 174 { 175 char *ob = out->buf + out->len; 176 char *op = out->buf + out->len; 177 const char *ip = src->buf; 178 const char *ie = src->buf + src->len; 179 const char *st = TabEncodeCharsetStd; 180 const uint8_t *vt = VecEncodeCharsetStd; 181 182 /* check for empty string */ 183 if (src->len == 0) 184 { 185 return; 186 } 187 188 /* check for URL encoding */ 189 if (mode & MODE_URL) 190 { 191 st = TabEncodeCharsetURL; 192 vt = VecEncodeCharsetURL; 193 } 194 195 #if USE_AVX2 196 /* SIMD 24 bytes loop, but the SIMD instruction will load 4 bytes 197 * past the end, so it's safe only if there are 28 bytes or more left */ 198 while ((ip <= ie - 28) && (mode & MODE_AVX2) != 0) 199 { 200 __m128i v0 = _mm_loadu_si128(as_m128c(ip)); 201 __m128i v1 = _mm_loadu_si128(as_m128c(ip + 12)); 202 __m256i vv = encode_avx2(v0, v1, vt); 203 204 /* store the result, and advance buffer pointers */ 205 _mm256_storeu_si256(as_m256p(op), vv); 206 op += 32; 207 ip += 24; 208 } 209 210 /* can do one more 24 bytes round, but needs special handling */ 211 if ((ip <= ie - 24) && (mode & MODE_AVX2) != 0) 212 { 213 __m128i v0 = _mm_loadu_si128(as_m128c(ip)); 214 __m128i v1 = _mm_loadu_si128(as_m128c(ip + 8)); 215 __m128i v2 = _mm_srli_si128(v1, 4); 216 __m256i vv = encode_avx2(v0, v2, vt); 217 218 /* store the result, and advance buffer pointers */ 219 _mm256_storeu_si256(as_m256p(op), vv); 220 op += 32; 221 ip += 24; 222 } 223 #endif 224 225 /* no more bytes */ 226 if (ip == ie) 227 { 228 out->len += op - ob; 229 return; 230 } 231 232 /* handle the remaining bytes with scalar code (with 4 bytes load) */ 233 while (ip <= ie - 4) 234 { 235 uint32_t v0 = __builtin_bswap32(*(const uint32_t *)ip); 236 uint8_t v1 = (v0 >> 26) & 0x3f; 237 uint8_t v2 = (v0 >> 20) & 0x3f; 238 uint8_t v3 = (v0 >> 14) & 0x3f; 239 uint8_t v4 = (v0 >> 8) & 0x3f; 240 241 /* encode the characters, and move to next block */ 242 ip += 3; 243 *op++ = st[v1]; 244 *op++ = st[v2]; 245 *op++ = st[v3]; 246 *op++ = st[v4]; 247 } 248 249 /* load the last bytes */ 250 size_t dp = ie - ip; 251 uint32_t v0 = (uint32_t)(uint8_t)ip[0] << 16; 252 253 #define B2 v0 |= (uint32_t)(uint8_t)ip[2] 254 #define B1 v0 |= (uint32_t)(uint8_t)ip[1] << 8 255 256 #define R4 *op++ = st[(v0 >> 0) & 0x3f] 257 #define R3 *op++ = st[(v0 >> 6) & 0x3f] 258 #define R2 *op++ = st[(v0 >> 12) & 0x3f] 259 #define R1 *op++ = st[(v0 >> 18) & 0x3f] 260 261 #define NB \ 262 { \ 263 out->len += op - ob; \ 264 } 265 #define PD \ 266 { \ 267 if ((mode & MODE_RAW) == 0) \ 268 { \ 269 *op++ = '='; \ 270 } \ 271 } 272 273 /* encode the last few bytes */ 274 switch (dp) 275 { 276 case 3: 277 B2; 278 B1; 279 R1; 280 R2; 281 R3; 282 R4; 283 NB; 284 break; 285 case 2: 286 B1; 287 R1; 288 R2; 289 R3; 290 PD; 291 NB; 292 break; 293 case 1: 294 R1; 295 R2; 296 PD; 297 PD; 298 NB; 299 break; 300 default: 301 NB; 302 break; 303 } 304 305 #undef PD 306 #undef NB 307 #undef R1 308 #undef R2 309 #undef R3 310 #undef R4 311 #undef B1 312 #undef B2 313 } 314 315 /** Decoder Helper Functions **/ 316 317 static const uint8_t VecPacking[32] = { 318 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 128, 128, 128, 128, 319 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 128, 128, 128, 128}; 320 321 static const uint8_t VecDecodeBits[32] = { 322 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 323 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; 324 325 static const uint8_t VecDecodeTableStd[128] = { 326 0x00, 0x00, 0x13, 0x04, 0xbf, 0xbf, 0xb9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 327 0x00, 0x00, 0x13, 0x04, 0xbf, 0xbf, 0xb9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 328 0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0, 0x54, 0x50, 0x50, 0x50, 0x54, 329 0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0, 0x54, 0x50, 0x50, 0x50, 0x54, 330 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 331 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 332 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 333 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}; 334 335 static const uint8_t VecDecodeTableURL[128] = { 336 0x00, 337 0x00, 338 0x11, 339 0x04, 340 0xbf, 341 0xbf, 342 0xb9, 343 0xb9, 344 0x00, 345 0x00, 346 0x00, 347 0x00, 348 0x00, 349 0x00, 350 0x00, 351 0x00, 352 0x00, 353 0x00, 354 0x11, 355 0x04, 356 0xbf, 357 0xbf, 358 0xb9, 359 0xb9, 360 0x00, 361 0x00, 362 0x00, 363 0x00, 364 0x00, 365 0x00, 366 0x00, 367 0x00, 368 0xa8, 369 0xf8, 370 0xf8, 371 0xf8, 372 0xf8, 373 0xf8, 374 0xf8, 375 0xf8, 376 0xf8, 377 0xf8, 378 0xf0, 379 0x50, 380 0x50, 381 0x54, 382 0x50, 383 0x70, 384 0xa8, 385 0xf8, 386 0xf8, 387 0xf8, 388 0xf8, 389 0xf8, 390 0xf8, 391 0xf8, 392 0xf8, 393 0xf8, 394 0xf0, 395 0x50, 396 0x50, 397 0x54, 398 0x50, 399 0x70, 400 0x5f, 401 0x5f, 402 0x5f, 403 0x5f, 404 0x5f, 405 0x5f, 406 0x5f, 407 0x5f, 408 0x5f, 409 0x5f, 410 0x5f, 411 0x5f, 412 0x5f, 413 0x5f, 414 0x5f, 415 0x5f, 416 0x5f, 417 0x5f, 418 0x5f, 419 0x5f, 420 0x5f, 421 0x5f, 422 0x5f, 423 0x5f, 424 0x5f, 425 0x5f, 426 0x5f, 427 0x5f, 428 0x5f, 429 0x5f, 430 0x5f, 431 0x5f, 432 0xe0, 433 0xe0, 434 0xe0, 435 0xe0, 436 0xe0, 437 0xe0, 438 0xe0, 439 0xe0, 440 0xe0, 441 0xe0, 442 0xe0, 443 0xe0, 444 0xe0, 445 0xe0, 446 0xe0, 447 0xe0, 448 0xe0, 449 0xe0, 450 0xe0, 451 0xe0, 452 0xe0, 453 0xe0, 454 0xe0, 455 0xe0, 456 0xe0, 457 0xe0, 458 0xe0, 459 0xe0, 460 0xe0, 461 0xe0, 462 0xe0, 463 0xe0, 464 }; 465 466 static const uint8_t VecDecodeCharsetStd[256] = { 467 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 468 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 469 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 62, 0xff, 0xff, 0xff, 63, 470 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 471 0xff, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 472 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0xff, 0xff, 0xff, 0xff, 0xff, 473 0xff, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 474 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0xff, 0xff, 0xff, 0xff, 0xff, 475 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 476 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 477 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 478 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 479 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 480 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 481 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 482 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; 483 484 static const uint8_t VecDecodeCharsetURL[256] = { 485 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 486 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 487 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 62, 0xff, 0xff, 488 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 489 0xff, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 490 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0xff, 0xff, 0xff, 0xff, 63, 491 0xff, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 492 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0xff, 0xff, 0xff, 0xff, 0xff, 493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 498 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 499 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 500 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; 501 502 static inline void memcopy_24(char *dp, const uint8_t *sp) 503 { 504 *(uint64_t *)(dp + 0) = *(const uint64_t *)(sp + 0); 505 *(uint64_t *)(dp + 8) = *(const uint64_t *)(sp + 8); 506 *(uint64_t *)(dp + 16) = *(const uint64_t *)(sp + 16); 507 } 508 509 static inline __m256i decode_avx2(__m256i v0, int *pos, const uint8_t *tab) 510 { 511 __m256i v1 = _mm256_srli_epi32(v0, 4); 512 __m256i vl = _mm256_and_si256(v0, _mm256_set1_epi8(0x0f)); 513 __m256i vh = _mm256_and_si256(v1, _mm256_set1_epi8(0x0f)); 514 __m256i st = _mm256_loadu_si256(as_m256c(tab)); 515 __m256i mt = _mm256_loadu_si256(as_m256c(tab + 32)); 516 __m256i et = _mm256_loadu_si256(as_m256c(tab + 64)); 517 __m256i rt = _mm256_loadu_si256(as_m256c(tab + 96)); 518 __m256i pt = _mm256_loadu_si256(as_m256c(VecPacking)); 519 __m256i bt = _mm256_loadu_si256(as_m256c(VecDecodeBits)); 520 __m256i sh = _mm256_shuffle_epi8(st, vh); 521 __m256i eq = _mm256_cmpeq_epi8(v0, et); 522 __m256i sv = _mm256_blendv_epi8(sh, rt, eq); 523 __m256i bm = _mm256_shuffle_epi8(mt, vl); 524 __m256i bv = _mm256_shuffle_epi8(bt, vh); 525 __m256i mr = _mm256_and_si256(bm, bv); 526 __m256i nm = _mm256_cmpeq_epi8(mr, _mm256_setzero_si256()); 527 __m256i sr = _mm256_add_epi8(v0, sv); 528 __m256i r0 = _mm256_and_si256(sr, _mm256_set1_epi8(0x3f)); 529 __m256i r1 = _mm256_maddubs_epi16(r0, _mm256_set1_epi32(0x01400140)); 530 __m256i r2 = _mm256_madd_epi16(r1, _mm256_set1_epi32(0x00011000)); 531 __m256i r3 = _mm256_shuffle_epi8(r2, pt); 532 __m256i r4 = _mm256_permutevar8x32_epi32(r3, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, 3, 7)); 533 int64_t mp = _mm256_movemask_epi8(nm); 534 int32_t np = __builtin_ctzll(mp | 0xffffffff00000000); 535 return (*pos = np), r4; 536 } 537 538 /* Return 0 if success, otherwise return the error position + 1 */ 539 static inline int64_t decode_block( 540 const uint8_t *ie, 541 const uint8_t **ipp, 542 char **opp, 543 const uint8_t *tab, 544 int mode) 545 { 546 int nb = 0; 547 uint32_t v0 = 0; 548 549 /* buffer pointers */ 550 char *op = *opp; 551 const uint8_t *ip = *ipp; 552 553 /* load up to 4 characters */ 554 while (nb < 4 && ip < ie) 555 { 556 uint8_t id; 557 uint8_t ch = *ip; 558 559 /* skip new lines */ 560 if (ch == '\r' || ch == '\n') 561 { 562 ip++; 563 continue; 564 } 565 566 /* lookup the index, and check for invalid characters */ 567 if ((id = tab[ch]) == 0xff) 568 { 569 break; 570 } 571 572 /* move to next character */ 573 ip++; 574 nb++; 575 v0 = (v0 << 6) | id; 576 } 577 578 /* never ends with 1 characer */ 579 if (nb == 1) 580 { 581 return ip - *ipp + 1; 582 } 583 584 #define P2() \ 585 { \ 586 E2() \ 587 P1() \ 588 P1() \ 589 } 590 #define P1() \ 591 { \ 592 if (*ip++ != '=') \ 593 return ip - *ipp; \ 594 } // ip has been added 1 595 #define E2() \ 596 { \ 597 if (ip >= ie - 1) \ 598 return ip - *ipp + 1; \ 599 } 600 #define R1() \ 601 { \ 602 if ((mode & MODE_RAW) == 0) \ 603 return ip - *ipp + 1; \ 604 } 605 606 #define align_val() \ 607 { \ 608 v0 <<= 6 * (4 - nb); \ 609 } 610 #define parse_eof() \ 611 { \ 612 if (ip < ie) \ 613 return ip - *ipp + 1; \ 614 } 615 #define check_pad() \ 616 { \ 617 if (ip == ie) \ 618 R1() \ 619 else if (nb == 3) \ 620 P1() \ 621 else \ 622 P2() \ 623 } 624 625 /* not enough characters, can either be EOF or paddings or illegal characters */ 626 if (nb < 4) 627 { 628 check_pad() 629 parse_eof() 630 align_val() 631 } 632 633 #undef check_pad 634 #undef parse_eof 635 #undef align_val 636 637 #undef R1 638 #undef E2 639 #undef P1 640 #undef P2 641 642 /* decode into output */ 643 switch (nb) 644 { 645 case 4: 646 op[2] = (v0 >> 0) & 0xff; 647 case 3: 648 op[1] = (v0 >> 8) & 0xff; 649 case 2: 650 op[0] = (v0 >> 16) & 0xff; 651 } 652 653 /* update the pointers */ 654 *ipp = ip; 655 *opp = op + nb - 1; 656 return 0; 657 } 658 659 ssize_t b64decode(GoSlice *out, const char *src, size_t nb, int mode) 660 { 661 int ep; 662 __m256i vv; 663 int64_t dv; 664 uint8_t buf[32] = {0}; 665 666 /* check for empty input */ 667 if (nb == 0) 668 { 669 return 0; 670 } 671 672 /* output buffer */ 673 char *ob = out->buf + out->len; 674 char *op = out->buf + out->len; 675 char *oe = out->buf + out->cap; 676 677 /* input buffer */ 678 const uint8_t *dt = VecDecodeTableStd; 679 const uint8_t *st = VecDecodeCharsetStd; 680 const uint8_t *ib = (const uint8_t *)src; 681 const uint8_t *ip = (const uint8_t *)src; 682 const uint8_t *ie = (const uint8_t *)src + nb; 683 684 /* check for URL encoding */ 685 if (mode & MODE_URL) 686 { 687 dt = VecDecodeTableURL; 688 st = VecDecodeCharsetURL; 689 } 690 691 #if USE_AVX2 692 /* decode every 32 bytes, the final round should be handled separately, because the 693 * SIMD instruction performs 32-byte store, and it might store past the end of the 694 * output buffer */ 695 while ((ip <= ie - 32) && (mode & MODE_AVX2) != 0) 696 { 697 vv = _mm256_loadu_si256(as_m256c(ip)); 698 vv = decode_avx2(vv, &ep, dt); 699 700 /* check for invalid characters (or '=' paddings) */ 701 if (ep < 32) 702 { 703 if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0) 704 { 705 return ib - ip - dv; 706 } 707 else 708 { 709 continue; 710 } 711 } 712 713 /* check for store boundary, perform the last 24-byte store if needed */ 714 if (op <= oe - 32) 715 { 716 _mm256_storeu_si256(as_m256p(op), vv); 717 } 718 else 719 { 720 _mm256_storeu_si256(as_m256p(buf), vv); 721 memcopy_24(op, buf); 722 } 723 724 /* move to next block */ 725 ip += 32; 726 op += 24; 727 } 728 #endif 729 /* handle the remaining bytes with scalar code (8 byte loop) */ 730 while (ip <= ie - 8 && op <= oe - 8) 731 { 732 uint8_t v0 = st[ip[0]]; 733 uint8_t v1 = st[ip[1]]; 734 uint8_t v2 = st[ip[2]]; 735 uint8_t v3 = st[ip[3]]; 736 uint8_t v4 = st[ip[4]]; 737 uint8_t v5 = st[ip[5]]; 738 uint8_t v6 = st[ip[6]]; 739 uint8_t v7 = st[ip[7]]; 740 741 /* check for invalid bytes */ 742 if ((v0 | v1 | v2 | v3 | v4 | v5 | v6 | v7) == 0xff) 743 { 744 if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0) 745 { 746 return ib - ip - dv; 747 } 748 else 749 { 750 continue; 751 } 752 } 753 754 /* construct the characters */ 755 uint64_t vv = __builtin_bswap64( 756 ((uint64_t)v0 << 58) | 757 ((uint64_t)v1 << 52) | 758 ((uint64_t)v2 << 46) | 759 ((uint64_t)v3 << 40) | 760 ((uint64_t)v4 << 34) | 761 ((uint64_t)v5 << 28) | 762 ((uint64_t)v6 << 22) | 763 ((uint64_t)v7 << 16)); 764 765 /* store the result, and move to next block */ 766 as_m64v(op) = vv; 767 ip += 8; 768 op += 6; 769 } 770 771 /* handle the remaining bytes with scalar code (4 byte loop) */ 772 while (ip <= ie - 4 && op <= oe - 4) 773 { 774 uint8_t v0 = st[ip[0]]; 775 uint8_t v1 = st[ip[1]]; 776 uint8_t v2 = st[ip[2]]; 777 uint8_t v3 = st[ip[3]]; 778 779 /* check for invalid bytes */ 780 if ((v0 | v1 | v2 | v3) == 0xff) 781 { 782 if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0) 783 { 784 return ib - ip - dv; 785 } 786 else 787 { 788 continue; 789 } 790 } 791 792 /* construct the characters */ 793 uint32_t vv = __builtin_bswap32( 794 ((uint32_t)v0 << 26) | 795 ((uint32_t)v1 << 20) | 796 ((uint32_t)v2 << 14) | 797 ((uint32_t)v3 << 8)); 798 799 /* store the result, and move to next block */ 800 as_m32v(op) = vv; 801 ip += 4; 802 op += 3; 803 } 804 805 /* decode the last few bytes */ 806 while (ip < ie) 807 { 808 if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0) 809 { 810 return ib - ip - dv; 811 } 812 } 813 814 /* update the result length */ 815 out->len += op - ob; 816 return op - ob; 817 }