github.com/cloudwego/dynamicgo@v0.2.6-0.20240519101509-707f41b6b834/native/base64.c

github.com/cloudwego/dynamicgo@v0.2.6-0.20240519101509-707f41b6b834/native/base64.c (about)

     1  /*
     2   * Copyright 2023 CloudWeGo Authors.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  #include <stdint.h>
    18  #include <immintrin.h>
    19  #include <sys/types.h>
    20  #include "native.h"
    21  
    /* Bit flags accepted in the `mode` argument of b64encode() / b64decode(). */
    #define MODE_URL 1  /* select the "-_" alphabet instead of "+/" */
    #define MODE_RAW 2  /* encoder: emit no '=' padding; decoder: padding not required */
    #define MODE_AVX2 4 /* allow the AVX2 code paths (only compiled when USE_AVX2 is set) */

    /* View the memory at `p` as a 32-bit / 64-bit unsigned integer lvalue. */
    #define as_m32v(p) (*(uint32_t *)(p))
    #define as_m64v(p) (*(uint64_t *)(p))

    /* Cast `p` to a writable 128-bit / 256-bit vector pointer. */
    #define as_m128p(p) ((__m128i *)(p))
    #define as_m256p(p) ((__m256i *)(p))

    /* Cast `p` to a read-only byte / 128-bit vector / 256-bit vector pointer. */
    #define as_m8c(p) ((const uint8_t *)(p))
    #define as_m128c(p) ((const __m128i *)(p))
    #define as_m256c(p) ((const __m256i *)(p))
    35  
    36  /** Exported Functions **/
    37  
    /*
     * Append the base64 encoding of `src` to `out`, starting at out->buf + out->len,
     * and add the number of bytes written to out->len. `mode` is a bitwise OR of
     * MODE_URL, MODE_RAW and MODE_AVX2. The implementation in this file never reads
     * out->cap, so the caller has to provide enough room.
     */
    38  void b64encode(GoSlice *out, const GoSlice *src, int mode);
    /*
     * Decode `nb` bytes of base64 text at `src`, appending the result to `out`.
     * Returns the number of bytes appended (out->len grows by the same amount),
     * or a negative value -(p + 1) where p is the offset in `src` of the byte that
     * was rejected. `mode` is a bitwise OR of MODE_URL, MODE_RAW and MODE_AVX2.
     */
    39  ssize_t b64decode(GoSlice *out, const char *src, size_t nb, int mode);
    40  
    41  /** Encoder Helper Functions **/
    42  
    /*
     * Scalar encoder alphabets: the index is a 6-bit value, the element is the
     * output character. Each array holds exactly 64 characters, so there is no
     * NUL terminator.
     */
    static const char TabEncodeCharsetStd[64] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789"
        "+/";
    static const char TabEncodeCharsetURL[64] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789"
        "-_";
    45  
    /*
     * Byte-shuffle control loaded by encode_avx2(). Every group of four indices
     * (b+1, b+0, b+2, b+1) spreads one 3-byte input group over a 32-bit word;
     * only source bytes 0..11 of each 128-bit lane are referenced. The same 16
     * indices are used for both lanes.
     */
    static const uint8_t VecEncodeShuffles[32] = {
        /* low lane */
        1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10,
        /* high lane */
        1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10,
    };
    80  
    /*
     * Offset table for the AVX2 encoder, standard alphabet. b64encode() hands it
     * to encode_avx2(), which uses it as a byte-shuffle lookup and adds the
     * selected entry (modulo 256) to the 6-bit value to obtain the output
     * character. The 16 entries are repeated for both 128-bit lanes.
     */
    static const uint8_t VecEncodeCharsetStd[32] = {
        /* low lane */
        'a' - 26,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '+' - 62, '/' - 63, 'A', 0, 0,
        /* high lane */
        'a' - 26,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '+' - 62, '/' - 63, 'A', 0, 0,
    };
   115  
   /*
    * Offset table for the AVX2 encoder, URL-safe alphabet. Same layout as the
    * standard table except that values 62 and 63 map to '-' and '_'. The 16
    * entries are repeated for both 128-bit lanes.
    */
   static const uint8_t VecEncodeCharsetURL[32] = {
       /* low lane */
       'a' - 26,
       '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
       '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
       '-' - 62, '_' - 63, 'A', 0, 0,
       /* high lane */
       'a' - 26,
       '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
       '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
       '-' - 62, '_' - 63, 'A', 0, 0,
   };
   150  
   151  static inline __m256i encode_avx2(__m128i v0, __m128i v1, const uint8_t *tab)
   152  {
   153      __m256i vv = _mm256_set_m128i(v1, v0);
   154      __m256i sh = _mm256_loadu_si256(as_m256c(VecEncodeShuffles));
   155      __m256i in = _mm256_shuffle_epi8(vv, sh);
   156      __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
   157      __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
   158      __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
   159      __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
   160      __m256i vi = _mm256_or_si256(t1, t3);
   161      __m256i s0 = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), vi);
   162      __m256i s1 = _mm256_and_si256(_mm256_set1_epi8(13), s0);
   163      __m256i s2 = _mm256_loadu_si256(as_m256c(tab));
   164      __m256i r0 = _mm256_subs_epu8(vi, _mm256_set1_epi8(51));
   165      __m256i r1 = _mm256_or_si256(r0, s1);
   166      __m256i r2 = _mm256_shuffle_epi8(s2, r1);
   167      __m256i r3 = _mm256_add_epi8(vi, r2);
   168      return r3;
   169  }
   170  
   171  /** Function Implementations **/
   172  
   173  void b64encode(GoSlice *out, const GoSlice *src, int mode)
   174  {
   175      char *ob = out->buf + out->len;
   176      char *op = out->buf + out->len;
   177      const char *ip = src->buf;
   178      const char *ie = src->buf + src->len;
   179      const char *st = TabEncodeCharsetStd;
   180      const uint8_t *vt = VecEncodeCharsetStd;
   181  
   182      /* check for empty string */
   183      if (src->len == 0)
   184      {
   185          return;
   186      }
   187  
   188      /* check for URL encoding */
   189      if (mode & MODE_URL)
   190      {
   191          st = TabEncodeCharsetURL;
   192          vt = VecEncodeCharsetURL;
   193      }
   194  
   195  #if USE_AVX2
   196      /* SIMD 24 bytes loop, but the SIMD instruction will load 4 bytes
   197       * past the end, so it's safe only if there are 28 bytes or more left */
   198      while ((ip <= ie - 28) && (mode & MODE_AVX2) != 0)
   199      {
   200          __m128i v0 = _mm_loadu_si128(as_m128c(ip));
   201          __m128i v1 = _mm_loadu_si128(as_m128c(ip + 12));
   202          __m256i vv = encode_avx2(v0, v1, vt);
   203  
   204          /* store the result, and advance buffer pointers */
   205          _mm256_storeu_si256(as_m256p(op), vv);
   206          op += 32;
   207          ip += 24;
   208      }
   209  
   210      /* can do one more 24 bytes round, but needs special handling */
   211      if ((ip <= ie - 24) && (mode & MODE_AVX2) != 0)
   212      {
   213          __m128i v0 = _mm_loadu_si128(as_m128c(ip));
   214          __m128i v1 = _mm_loadu_si128(as_m128c(ip + 8));
   215          __m128i v2 = _mm_srli_si128(v1, 4);
   216          __m256i vv = encode_avx2(v0, v2, vt);
   217  
   218          /* store the result, and advance buffer pointers */
   219          _mm256_storeu_si256(as_m256p(op), vv);
   220          op += 32;
   221          ip += 24;
   222      }
   223  #endif
   224  
   225      /* no more bytes */
   226      if (ip == ie)
   227      {
   228          out->len += op - ob;
   229          return;
   230      }
   231  
   232      /* handle the remaining bytes with scalar code (with 4 bytes load) */
   233      while (ip <= ie - 4)
   234      {
   235          uint32_t v0 = __builtin_bswap32(*(const uint32_t *)ip);
   236          uint8_t v1 = (v0 >> 26) & 0x3f;
   237          uint8_t v2 = (v0 >> 20) & 0x3f;
   238          uint8_t v3 = (v0 >> 14) & 0x3f;
   239          uint8_t v4 = (v0 >> 8) & 0x3f;
   240  
   241          /* encode the characters, and move to next block */
   242          ip += 3;
   243          *op++ = st[v1];
   244          *op++ = st[v2];
   245          *op++ = st[v3];
   246          *op++ = st[v4];
   247      }
   248  
   249      /* load the last bytes */
   250      size_t dp = ie - ip;
   251      uint32_t v0 = (uint32_t)(uint8_t)ip[0] << 16;
   252  
   253  #define B2 v0 |= (uint32_t)(uint8_t)ip[2]
   254  #define B1 v0 |= (uint32_t)(uint8_t)ip[1] << 8
   255  
   256  #define R4 *op++ = st[(v0 >> 0) & 0x3f]
   257  #define R3 *op++ = st[(v0 >> 6) & 0x3f]
   258  #define R2 *op++ = st[(v0 >> 12) & 0x3f]
   259  #define R1 *op++ = st[(v0 >> 18) & 0x3f]
   260  
   261  #define NB                   \
   262      {                        \
   263          out->len += op - ob; \
   264      }
   265  #define PD                          \
   266      {                               \
   267          if ((mode & MODE_RAW) == 0) \
   268          {                           \
   269              *op++ = '=';            \
   270          }                           \
   271      }
   272  
   273      /* encode the last few bytes */
   274      switch (dp)
   275      {
   276      case 3:
   277          B2;
   278          B1;
   279          R1;
   280          R2;
   281          R3;
   282          R4;
   283          NB;
   284          break;
   285      case 2:
   286          B1;
   287          R1;
   288          R2;
   289          R3;
   290          PD;
   291          NB;
   292          break;
   293      case 1:
   294          R1;
   295          R2;
   296          PD;
   297          PD;
   298          NB;
   299          break;
   300      default:
   301          NB;
   302          break;
   303      }
   304  
   305  #undef PD
   306  #undef NB
   307  #undef R1
   308  #undef R2
   309  #undef R3
   310  #undef R4
   311  #undef B1
   312  #undef B2
   313  }
   314  
   315  /** Decoder Helper Functions **/
   316  
   /*
    * Byte-shuffle control loaded by decode_avx2(): from every 32-bit word it
    * keeps bytes 2, 1, 0 (in that order) and drops byte 3, packing 12 bytes at
    * the bottom of each 128-bit lane. Index 0x80 makes the shuffle write zero.
    */
   static const uint8_t VecPacking[32] = {
       /* low lane */
       2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0x80, 0x80, 0x80, 0x80,
       /* high lane */
       2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0x80, 0x80, 0x80, 0x80,
   };
   320  
   /*
    * Lookup loaded by decode_avx2() and indexed by the high nibble of an input
    * byte: nibble n in 0..7 yields the single bit (1 << n), nibbles 8..15 yield
    * zero. Repeated for both 128-bit lanes.
    */
   static const uint8_t VecDecodeBits[32] = {
       /* low lane */
       1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7,
       0, 0, 0, 0, 0, 0, 0, 0,
       /* high lane */
       1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7,
       0, 0, 0, 0, 0, 0, 0, 0,
   };
   324  
   /*
    * AVX2 decoder tables for the standard alphabet: four 32-byte rows that
    * decode_avx2() loads from offsets 0, 32, 64 and 96. Each row covers both
    * 128-bit lanes.
    */
   static const uint8_t VecDecodeTableStd[128] = {
       /* +0: offset added to the character, indexed by its high nibble */
       0x00, 0x00, 0x13, 0x04, 0xbf, 0xbf, 0xb9, 0xb9,
       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
       0x00, 0x00, 0x13, 0x04, 0xbf, 0xbf, 0xb9, 0xb9,
       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
       /* +32: bit mask indexed by the low nibble, ANDed with VecDecodeBits */
       0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
       0xf8, 0xf8, 0xf0, 0x54, 0x50, 0x50, 0x50, 0x54,
       0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
       0xf8, 0xf8, 0xf0, 0x54, 0x50, 0x50, 0x50, 0x54,
       /* +64: the character compared for equality with the input ('/') */
       0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
       0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
       0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
       0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
       /* +96: offset used instead of row +0 for bytes equal to row +64 */
       0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
       0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
       0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
       0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
   };
   334  
   /*
    * AVX2 decoder tables for the URL-safe alphabet: four 32-byte rows that
    * decode_avx2() loads from offsets 0, 32, 64 and 96. Each row covers both
    * 128-bit lanes.
    */
   static const uint8_t VecDecodeTableURL[128] = {
       /* +0: offset added to the character, indexed by its high nibble */
       0x00, 0x00, 0x11, 0x04, 0xbf, 0xbf, 0xb9, 0xb9,
       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
       0x00, 0x00, 0x11, 0x04, 0xbf, 0xbf, 0xb9, 0xb9,
       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
       /* +32: bit mask indexed by the low nibble, ANDed with VecDecodeBits */
       0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
       0xf8, 0xf8, 0xf0, 0x50, 0x50, 0x54, 0x50, 0x70,
       0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
       0xf8, 0xf8, 0xf0, 0x50, 0x50, 0x54, 0x50, 0x70,
       /* +64: the character compared for equality with the input ('_') */
       0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f,
       0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f,
       0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f,
       0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f,
       /* +96: offset used instead of row +0 for bytes equal to row +64 */
       0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
       0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
       0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
       0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
   };
   465  
   /*
    * Scalar decode table for the standard alphabet: maps an input byte to its
    * 6-bit value, or to 0xff (written XX below) when the byte is not part of
    * the alphabet.
    */
   #define XX 0xff
   static const uint8_t VecDecodeCharsetStd[256] = {
       /* 0x00 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0x10 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0x20 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, 62, XX, XX, XX, 63,
       /* 0x30 */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, XX, XX, XX, XX, XX, XX,
       /* 0x40 */ XX, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       /* 0x50 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, XX, XX, XX, XX, XX,
       /* 0x60 */ XX, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       /* 0x70 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, XX, XX, XX, XX, XX,
       /* 0x80 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0x90 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0xa0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0xb0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0xc0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0xd0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0xe0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0xf0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
   };
   #undef XX
   483  
   /*
    * Scalar decode table for the URL-safe alphabet: maps an input byte to its
    * 6-bit value, or to 0xff (written XX below) when the byte is not part of
    * the alphabet.
    */
   #define XX 0xff
   static const uint8_t VecDecodeCharsetURL[256] = {
       /* 0x00 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0x10 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0x20 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, 62, XX, XX,
       /* 0x30 */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, XX, XX, XX, XX, XX, XX,
       /* 0x40 */ XX, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       /* 0x50 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, XX, XX, XX, XX, 63,
       /* 0x60 */ XX, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       /* 0x70 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, XX, XX, XX, XX, XX,
       /* 0x80 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0x90 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0xa0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0xb0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0xc0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0xd0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0xe0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
       /* 0xf0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
   };
   #undef XX
   501  
   /*
    * Copy exactly 24 bytes from `sp` to `dp` as three 8-byte moves.
    *
    * The copy goes through fixed-size __builtin_memcpy calls instead of
    * dereferencing casted uint64_t pointers, so it makes no alignment or
    * aliasing assumptions about either buffer. An optimizing compiler turns
    * each constant 8-byte copy into a single load or store.
    */
   static inline void memcopy_24(char *dp, const uint8_t *sp)
   {
       uint64_t word;

       __builtin_memcpy(&word, sp + 0, sizeof(word));
       __builtin_memcpy(dp + 0, &word, sizeof(word));

       __builtin_memcpy(&word, sp + 8, sizeof(word));
       __builtin_memcpy(dp + 8, &word, sizeof(word));

       __builtin_memcpy(&word, sp + 16, sizeof(word));
       __builtin_memcpy(dp + 16, &word, sizeof(word));
   }
   508  
   509  static inline __m256i decode_avx2(__m256i v0, int *pos, const uint8_t *tab)
   510  {
   511      __m256i v1 = _mm256_srli_epi32(v0, 4);
   512      __m256i vl = _mm256_and_si256(v0, _mm256_set1_epi8(0x0f));
   513      __m256i vh = _mm256_and_si256(v1, _mm256_set1_epi8(0x0f));
   514      __m256i st = _mm256_loadu_si256(as_m256c(tab));
   515      __m256i mt = _mm256_loadu_si256(as_m256c(tab + 32));
   516      __m256i et = _mm256_loadu_si256(as_m256c(tab + 64));
   517      __m256i rt = _mm256_loadu_si256(as_m256c(tab + 96));
   518      __m256i pt = _mm256_loadu_si256(as_m256c(VecPacking));
   519      __m256i bt = _mm256_loadu_si256(as_m256c(VecDecodeBits));
   520      __m256i sh = _mm256_shuffle_epi8(st, vh);
   521      __m256i eq = _mm256_cmpeq_epi8(v0, et);
   522      __m256i sv = _mm256_blendv_epi8(sh, rt, eq);
   523      __m256i bm = _mm256_shuffle_epi8(mt, vl);
   524      __m256i bv = _mm256_shuffle_epi8(bt, vh);
   525      __m256i mr = _mm256_and_si256(bm, bv);
   526      __m256i nm = _mm256_cmpeq_epi8(mr, _mm256_setzero_si256());
   527      __m256i sr = _mm256_add_epi8(v0, sv);
   528      __m256i r0 = _mm256_and_si256(sr, _mm256_set1_epi8(0x3f));
   529      __m256i r1 = _mm256_maddubs_epi16(r0, _mm256_set1_epi32(0x01400140));
   530      __m256i r2 = _mm256_madd_epi16(r1, _mm256_set1_epi32(0x00011000));
   531      __m256i r3 = _mm256_shuffle_epi8(r2, pt);
   532      __m256i r4 = _mm256_permutevar8x32_epi32(r3, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, 3, 7));
   533      int64_t mp = _mm256_movemask_epi8(nm);
   534      int32_t np = __builtin_ctzll(mp | 0xffffffff00000000);
   535      return (*pos = np), r4;
   536  }
   537  
   538  /* Return 0 if success, otherwise return the error position + 1 */
   539  static inline int64_t decode_block(
   540      const uint8_t *ie,
   541      const uint8_t **ipp,
   542      char **opp,
   543      const uint8_t *tab,
   544      int mode)
   545  {
   546      int nb = 0;
   547      uint32_t v0 = 0;
   548  
   549      /* buffer pointers */
   550      char *op = *opp;
   551      const uint8_t *ip = *ipp;
   552  
   553      /* load up to 4 characters */
   554      while (nb < 4 && ip < ie)
   555      {
   556          uint8_t id;
   557          uint8_t ch = *ip;
   558  
   559          /* skip new lines */
   560          if (ch == '\r' || ch == '\n')
   561          {
   562              ip++;
   563              continue;
   564          }
   565  
   566          /* lookup the index, and check for invalid characters */
   567          if ((id = tab[ch]) == 0xff)
   568          {
   569              break;
   570          }
   571  
   572          /* move to next character */
   573          ip++;
   574          nb++;
   575          v0 = (v0 << 6) | id;
   576      }
   577  
   578      /* never ends with 1 characer */
   579      if (nb == 1)
   580      {
   581          return ip - *ipp + 1;
   582      }
   583  
   584  #define P2() \
   585      {        \
   586          E2() \
   587          P1() \
   588          P1() \
   589      }
   590  #define P1()                  \
   591      {                         \
   592          if (*ip++ != '=')     \
   593              return ip - *ipp; \
   594      } // ip has been added 1
   595  #define E2()                      \
   596      {                             \
   597          if (ip >= ie - 1)         \
   598              return ip - *ipp + 1; \
   599      }
   600  #define R1()                        \
   601      {                               \
   602          if ((mode & MODE_RAW) == 0) \
   603              return ip - *ipp + 1;   \
   604      }
   605  
   606  #define align_val()          \
   607      {                        \
   608          v0 <<= 6 * (4 - nb); \
   609      }
   610  #define parse_eof()               \
   611      {                             \
   612          if (ip < ie)              \
   613              return ip - *ipp + 1; \
   614      }
   615  #define check_pad()       \
   616      {                     \
   617          if (ip == ie)     \
   618              R1()          \
   619          else if (nb == 3) \
   620              P1()          \
   621          else              \
   622              P2()          \
   623      }
   624  
   625      /* not enough characters, can either be EOF or paddings or illegal characters */
   626      if (nb < 4)
   627      {
   628          check_pad()
   629              parse_eof()
   630                  align_val()
   631      }
   632  
   633  #undef check_pad
   634  #undef parse_eof
   635  #undef align_val
   636  
   637  #undef R1
   638  #undef E2
   639  #undef P1
   640  #undef P2
   641  
   642      /* decode into output */
   643      switch (nb)
   644      {
   645      case 4:
   646          op[2] = (v0 >> 0) & 0xff;
   647      case 3:
   648          op[1] = (v0 >> 8) & 0xff;
   649      case 2:
   650          op[0] = (v0 >> 16) & 0xff;
   651      }
   652  
   653      /* update the pointers */
   654      *ipp = ip;
   655      *opp = op + nb - 1;
   656      return 0;
   657  }
   658  
   659  ssize_t b64decode(GoSlice *out, const char *src, size_t nb, int mode)
   660  {
   661      int ep;
   662      __m256i vv;
   663      int64_t dv;
   664      uint8_t buf[32] = {0};
   665  
   666      /* check for empty input */
   667      if (nb == 0)
   668      {
   669          return 0;
   670      }
   671  
   672      /* output buffer */
   673      char *ob = out->buf + out->len;
   674      char *op = out->buf + out->len;
   675      char *oe = out->buf + out->cap;
   676  
   677      /* input buffer */
   678      const uint8_t *dt = VecDecodeTableStd;
   679      const uint8_t *st = VecDecodeCharsetStd;
   680      const uint8_t *ib = (const uint8_t *)src;
   681      const uint8_t *ip = (const uint8_t *)src;
   682      const uint8_t *ie = (const uint8_t *)src + nb;
   683  
   684      /* check for URL encoding */
   685      if (mode & MODE_URL)
   686      {
   687          dt = VecDecodeTableURL;
   688          st = VecDecodeCharsetURL;
   689      }
   690  
   691  #if USE_AVX2
   692      /* decode every 32 bytes, the final round should be handled separately, because the
   693       * SIMD instruction performs 32-byte store, and it might store past the end of the
   694       * output buffer */
   695      while ((ip <= ie - 32) && (mode & MODE_AVX2) != 0)
   696      {
   697          vv = _mm256_loadu_si256(as_m256c(ip));
   698          vv = decode_avx2(vv, &ep, dt);
   699  
   700          /* check for invalid characters (or '=' paddings) */
   701          if (ep < 32)
   702          {
   703              if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0)
   704              {
   705                  return ib - ip - dv;
   706              }
   707              else
   708              {
   709                  continue;
   710              }
   711          }
   712  
   713          /* check for store boundary, perform the last 24-byte store if needed */
   714          if (op <= oe - 32)
   715          {
   716              _mm256_storeu_si256(as_m256p(op), vv);
   717          }
   718          else
   719          {
   720              _mm256_storeu_si256(as_m256p(buf), vv);
   721              memcopy_24(op, buf);
   722          }
   723  
   724          /* move to next block */
   725          ip += 32;
   726          op += 24;
   727      }
   728  #endif
   729      /* handle the remaining bytes with scalar code (8 byte loop) */
   730      while (ip <= ie - 8 && op <= oe - 8)
   731      {
   732          uint8_t v0 = st[ip[0]];
   733          uint8_t v1 = st[ip[1]];
   734          uint8_t v2 = st[ip[2]];
   735          uint8_t v3 = st[ip[3]];
   736          uint8_t v4 = st[ip[4]];
   737          uint8_t v5 = st[ip[5]];
   738          uint8_t v6 = st[ip[6]];
   739          uint8_t v7 = st[ip[7]];
   740  
   741          /* check for invalid bytes */
   742          if ((v0 | v1 | v2 | v3 | v4 | v5 | v6 | v7) == 0xff)
   743          {
   744              if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0)
   745              {
   746                  return ib - ip - dv;
   747              }
   748              else
   749              {
   750                  continue;
   751              }
   752          }
   753  
   754          /* construct the characters */
   755          uint64_t vv = __builtin_bswap64(
   756              ((uint64_t)v0 << 58) |
   757              ((uint64_t)v1 << 52) |
   758              ((uint64_t)v2 << 46) |
   759              ((uint64_t)v3 << 40) |
   760              ((uint64_t)v4 << 34) |
   761              ((uint64_t)v5 << 28) |
   762              ((uint64_t)v6 << 22) |
   763              ((uint64_t)v7 << 16));
   764  
   765          /* store the result, and move to next block */
   766          as_m64v(op) = vv;
   767          ip += 8;
   768          op += 6;
   769      }
   770  
   771      /* handle the remaining bytes with scalar code (4 byte loop) */
   772      while (ip <= ie - 4 && op <= oe - 4)
   773      {
   774          uint8_t v0 = st[ip[0]];
   775          uint8_t v1 = st[ip[1]];
   776          uint8_t v2 = st[ip[2]];
   777          uint8_t v3 = st[ip[3]];
   778  
   779          /* check for invalid bytes */
   780          if ((v0 | v1 | v2 | v3) == 0xff)
   781          {
   782              if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0)
   783              {
   784                  return ib - ip - dv;
   785              }
   786              else
   787              {
   788                  continue;
   789              }
   790          }
   791  
   792          /* construct the characters */
   793          uint32_t vv = __builtin_bswap32(
   794              ((uint32_t)v0 << 26) |
   795              ((uint32_t)v1 << 20) |
   796              ((uint32_t)v2 << 14) |
   797              ((uint32_t)v3 << 8));
   798  
   799          /* store the result, and move to next block */
   800          as_m32v(op) = vv;
   801          ip += 4;
   802          op += 3;
   803      }
   804  
   805      /* decode the last few bytes */
   806      while (ip < ie)
   807      {
   808          if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0)
   809          {
   810              return ib - ip - dv;
   811          }
   812      }
   813  
   814      /* update the result length */
   815      out->len += op - ob;
   816      return op - ob;
   817  }