github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/utils/_lib/bit_packing_avx2.c

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdint.h>
#include <immintrin.h>
#include <string.h>

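// AVX2 kernels for unpacking bit-packed unsigned integers. Each
// unpack<W>_32_avx2 function decodes 32 values of W bits each from `in`
// (W 32-bit words in total), writes them zero-extended into out[0..31],
// and returns `in` advanced past the consumed words; the caller advances
// `out` by 32 between calls. Every kernel uses the same recipe: build a
// 256-bit lane vector with _mm256_set_epi32, give each lane its own
// right shift with _mm256_srlv_epi32, and mask the result down to W bits.
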
// width 0: the 32 outputs are all zero and no input is consumed
inline static const uint32_t* unpack0_32_avx2(const uint32_t* in, uint32_t* out) {
  memset(out, 0x0, 32 * sizeof(*out));
  out += 32;

  return in;
}

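// unpack1_32_avx2 below is the template for every kernel in this file:
// the 32 outputs are produced in four groups of eight. reg_shifts holds
// each lane's bit offset within its source word (_mm256_set_epi32 lists
// lanes from highest to lowest, so the last argument belongs to out[0]),
// and the mask keeps the low `width` bits. For a value that does not
// straddle a word boundary this computes, eight lanes at a time,
//
//   out[i] = (in[(i * width) / 32] >> ((i * width) % 32)) & mask;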
inline static const uint32_t* unpack1_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(7, 6, 5, 4,
                                3, 2, 1, 0);
  reg_inls = _mm256_set_epi32(in[0], in[0],
                              in[0], in[0],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(15, 14, 13, 12,
                                11, 10, 9, 8);
  reg_inls = _mm256_set_epi32(in[0], in[0],
                              in[0], in[0],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(23, 22, 21, 20,
                                19, 18, 17, 16);
  reg_inls = _mm256_set_epi32(in[0], in[0],
                              in[0], in[0],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(31, 30, 29, 28,
                                27, 26, 25, 24);
  reg_inls = _mm256_set_epi32(in[0], in[0],
                              in[0], in[0],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 1;

  return in;
}

inline static const uint32_t* unpack2_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(14, 12, 10, 8,
                                6, 4, 2, 0);
  reg_inls = _mm256_set_epi32(in[0], in[0],
                              in[0], in[0],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(30, 28, 26, 24,
                                22, 20, 18, 16);
  reg_inls = _mm256_set_epi32(in[0], in[0],
                              in[0], in[0],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(14, 12, 10, 8,
                                6, 4, 2, 0);
  reg_inls = _mm256_set_epi32(in[1], in[1],
                              in[1], in[1],
                              in[1], in[1],
                              in[1], in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(30, 28, 26, 24,
                                22, 20, 18, 16);
  reg_inls = _mm256_set_epi32(in[1], in[1],
                              in[1], in[1],
                              in[1], in[1],
                              in[1], in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 2;

  return in;
}

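// From width 3 onward some values straddle a 32-bit word boundary. Those
// lanes are stitched together in scalar code before the vector is built:
// the straddling value is loaded as in[k] >> r | in[k + 1] << (32 - r)
// and given a vector shift of 0. In unpack3_32_avx2 below, out[10]
// occupies stream bits 30..32, so its lane is in[0] >> 30 | in[1] << 2,
// masked to three bits.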
inline static const uint32_t* unpack3_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(21, 18, 15, 12,
                                9, 6, 3, 0);
  reg_inls = _mm256_set_epi32(in[0], in[0],
                              in[0], in[0],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(13, 10, 7, 4,
                                1, 0, 27, 24);
  reg_inls = _mm256_set_epi32(in[1], in[1],
                              in[1], in[1],
                              in[1], in[0] >> 30 | in[1] << 2,
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(5, 2, 0, 28,
                                25, 22, 19, 16);
  reg_inls = _mm256_set_epi32(in[2], in[2],
                              in[1] >> 31 | in[2] << 1, in[1],
                              in[1], in[1],
                              in[1], in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(29, 26, 23, 20,
                                17, 14, 11, 8);
  reg_inls = _mm256_set_epi32(in[2], in[2],
                              in[2], in[2],
                              in[2], in[2],
                              in[2], in[2]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 3;

  return in;
}

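// Widths that divide 32 (1, 2, 4, 8, 16) never straddle a word boundary,
// so none of their lanes need stitching; from width 4 up the same shift
// pattern repeats in every group while the source words advance.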
inline static const uint32_t* unpack4_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xf;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(28, 24, 20, 16,
                                12, 8, 4, 0);
  reg_inls = _mm256_set_epi32(in[0], in[0],
                              in[0], in[0],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(28, 24, 20, 16,
                                12, 8, 4, 0);
  reg_inls = _mm256_set_epi32(in[1], in[1],
                              in[1], in[1],
                              in[1], in[1],
                              in[1], in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(28, 24, 20, 16,
                                12, 8, 4, 0);
  reg_inls = _mm256_set_epi32(in[2], in[2],
                              in[2], in[2],
                              in[2], in[2],
                              in[2], in[2]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(28, 24, 20, 16,
                                12, 8, 4, 0);
  reg_inls = _mm256_set_epi32(in[3], in[3],
                              in[3], in[3],
                              in[3], in[3],
                              in[3], in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 4;

  return in;
}

inline static const uint32_t* unpack5_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1f;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(3, 0, 25, 20,
                                15, 10, 5, 0);
  reg_inls = _mm256_set_epi32(in[1], in[0] >> 30 | in[1] << 2,
                              in[0], in[0],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(11, 6, 1, 0,
                                23, 18, 13, 8);
  reg_inls = _mm256_set_epi32(in[2], in[2],
                              in[2], in[1] >> 28 | in[2] << 4,
                              in[1], in[1],
                              in[1], in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(19, 14, 9, 4,
                                0, 26, 21, 16);
  reg_inls = _mm256_set_epi32(in[3], in[3],
                              in[3], in[3],
                              in[2] >> 31 | in[3] << 1, in[2],
                              in[2], in[2]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(27, 22, 17, 12,
                                7, 2, 0, 24);
  reg_inls = _mm256_set_epi32(in[4], in[4],
                              in[4], in[4],
                              in[4], in[4],
                              in[3] >> 29 | in[4] << 3, in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 5;

  return in;
}

inline static const uint32_t* unpack6_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3f;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(10, 4, 0, 24,
                                18, 12, 6, 0);
  reg_inls = _mm256_set_epi32(in[1], in[1],
                              in[0] >> 30 | in[1] << 2, in[0],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(26, 20, 14, 8,
                                2, 0, 22, 16);
  reg_inls = _mm256_set_epi32(in[2], in[2],
                              in[2], in[2],
                              in[2], in[1] >> 28 | in[2] << 4,
                              in[1], in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(10, 4, 0, 24,
                                18, 12, 6, 0);
  reg_inls = _mm256_set_epi32(in[4], in[4],
                              in[3] >> 30 | in[4] << 2, in[3],
                              in[3], in[3],
                              in[3], in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(26, 20, 14, 8,
                                2, 0, 22, 16);
  reg_inls = _mm256_set_epi32(in[5], in[5],
                              in[5], in[5],
                              in[5], in[4] >> 28 | in[5] << 4,
                              in[4], in[4]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 6;

  return in;
}

inline static const uint32_t* unpack7_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7f;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(17, 10, 3, 0,
                                21, 14, 7, 0);
  reg_inls = _mm256_set_epi32(in[1], in[1],
                              in[1], in[0] >> 28 | in[1] << 4,
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(9, 2, 0, 20,
                                13, 6, 0, 24);
  reg_inls = _mm256_set_epi32(in[3], in[3],
                              in[2] >> 27 | in[3] << 5, in[2],
                              in[2], in[2],
                              in[1] >> 31 | in[2] << 1, in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(1, 0, 19, 12,
                                5, 0, 23, 16);
  reg_inls = _mm256_set_epi32(in[5], in[4] >> 26 | in[5] << 6,
                              in[4], in[4],
                              in[4], in[3] >> 30 | in[4] << 2,
                              in[3], in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(25, 18, 11, 4,
                                0, 22, 15, 8);
  reg_inls = _mm256_set_epi32(in[6], in[6],
                              in[6], in[6],
                              in[5] >> 29 | in[6] << 3, in[5],
                              in[5], in[5]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 7;

  return in;
}

inline static const uint32_t* unpack8_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(24, 16, 8, 0,
                                24, 16, 8, 0);
  reg_inls = _mm256_set_epi32(in[1], in[1],
                              in[1], in[1],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(24, 16, 8, 0,
                                24, 16, 8, 0);
  reg_inls = _mm256_set_epi32(in[3], in[3],
                              in[3], in[3],
                              in[2], in[2],
                              in[2], in[2]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(24, 16, 8, 0,
                                24, 16, 8, 0);
  reg_inls = _mm256_set_epi32(in[5], in[5],
                              in[5], in[5],
                              in[4], in[4],
                              in[4], in[4]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(24, 16, 8, 0,
                                24, 16, 8, 0);
  reg_inls = _mm256_set_epi32(in[7], in[7],
                              in[7], in[7],
                              in[6], in[6],
                              in[6], in[6]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 8;

  return in;
}

inline static const uint32_t* unpack9_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1ff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 22, 13, 4,
                                0, 18, 9, 0);
  reg_inls = _mm256_set_epi32(in[1] >> 31 | in[2] << 1, in[1],
                              in[1], in[1],
                              in[0] >> 27 | in[1] << 5, in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(7, 0, 21, 12,
                                3, 0, 17, 8);
  reg_inls = _mm256_set_epi32(in[4], in[3] >> 30 | in[4] << 2,
                              in[3], in[3],
                              in[3], in[2] >> 26 | in[3] << 6,
                              in[2], in[2]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(15, 6, 0, 20,
                                11, 2, 0, 16);
  reg_inls = _mm256_set_epi32(in[6], in[6],
                              in[5] >> 29 | in[6] << 3, in[5],
                              in[5], in[5],
                              in[4] >> 25 | in[5] << 7, in[4]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(23, 14, 5, 0,
                                19, 10, 1, 0);
  reg_inls = _mm256_set_epi32(in[8], in[8],
                              in[8], in[7] >> 28 | in[8] << 4,
                              in[7], in[7],
                              in[7], in[6] >> 24 | in[7] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 9;

  return in;
}

inline static const uint32_t* unpack10_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3ff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(6, 0, 18, 8,
                                0, 20, 10, 0);
  reg_inls = _mm256_set_epi32(in[2], in[1] >> 28 | in[2] << 4,
                              in[1], in[1],
                              in[0] >> 30 | in[1] << 2, in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(22, 12, 2, 0,
                                14, 4, 0, 16);
  reg_inls = _mm256_set_epi32(in[4], in[4],
                              in[4], in[3] >> 24 | in[4] << 8,
                              in[3], in[3],
                              in[2] >> 26 | in[3] << 6, in[2]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(6, 0, 18, 8,
                                0, 20, 10, 0);
  reg_inls = _mm256_set_epi32(in[7], in[6] >> 28 | in[7] << 4,
                              in[6], in[6],
                              in[5] >> 30 | in[6] << 2, in[5],
                              in[5], in[5]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(22, 12, 2, 0,
                                14, 4, 0, 16);
  reg_inls = _mm256_set_epi32(in[9], in[9],
                              in[9], in[8] >> 24 | in[9] << 8,
                              in[8], in[8],
                              in[7] >> 26 | in[8] << 6, in[7]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 10;

  return in;
}

inline static const uint32_t* unpack11_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7ff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(13, 2, 0, 12,
                                1, 0, 11, 0);
  reg_inls = _mm256_set_epi32(in[2], in[2],
                              in[1] >> 23 | in[2] << 9, in[1],
                              in[1], in[0] >> 22 | in[1] << 10,
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(5, 0, 15, 4,
                                0, 14, 3, 0);
  reg_inls = _mm256_set_epi32(in[5], in[4] >> 26 | in[5] << 6,
                              in[4], in[4],
                              in[3] >> 25 | in[4] << 7, in[3],
                              in[3], in[2] >> 24 | in[3] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 18, 7, 0,
                                17, 6, 0, 16);
  reg_inls = _mm256_set_epi32(in[7] >> 29 | in[8] << 3, in[7],
                              in[7], in[6] >> 28 | in[7] << 4,
                              in[6], in[6],
                              in[5] >> 27 | in[6] << 5, in[5]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(21, 10, 0, 20,
                                9, 0, 19, 8);
  reg_inls = _mm256_set_epi32(in[10], in[10],
                              in[9] >> 31 | in[10] << 1, in[9],
                              in[9], in[8] >> 30 | in[9] << 2,
                              in[8], in[8]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 11;

  return in;
}

inline static const uint32_t* unpack12_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xfff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(20, 8, 0, 16,
                                4, 0, 12, 0);
  reg_inls = _mm256_set_epi32(in[2], in[2],
                              in[1] >> 28 | in[2] << 4, in[1],
                              in[1], in[0] >> 24 | in[1] << 8,
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(20, 8, 0, 16,
                                4, 0, 12, 0);
  reg_inls = _mm256_set_epi32(in[5], in[5],
                              in[4] >> 28 | in[5] << 4, in[4],
                              in[4], in[3] >> 24 | in[4] << 8,
                              in[3], in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(20, 8, 0, 16,
                                4, 0, 12, 0);
  reg_inls = _mm256_set_epi32(in[8], in[8],
                              in[7] >> 28 | in[8] << 4, in[7],
                              in[7], in[6] >> 24 | in[7] << 8,
                              in[6], in[6]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(20, 8, 0, 16,
                                4, 0, 12, 0);
  reg_inls = _mm256_set_epi32(in[11], in[11],
                              in[10] >> 28 | in[11] << 4, in[10],
                              in[10], in[9] >> 24 | in[10] << 8,
                              in[9], in[9]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 12;

  return in;
}

inline static const uint32_t* unpack13_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1fff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 14, 1, 0,
                                7, 0, 13, 0);
  reg_inls = _mm256_set_epi32(in[2] >> 27 | in[3] << 5, in[2],
                              in[2], in[1] >> 20 | in[2] << 12,
                              in[1], in[0] >> 26 | in[1] << 6,
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(3, 0, 9, 0,
                                15, 2, 0, 8);
  reg_inls = _mm256_set_epi32(in[6], in[5] >> 22 | in[6] << 10,
                              in[5], in[4] >> 28 | in[5] << 4,
                              in[4], in[4],
                              in[3] >> 21 | in[4] << 11, in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(11, 0, 17, 4,
                                0, 10, 0, 16);
  reg_inls = _mm256_set_epi32(in[9], in[8] >> 30 | in[9] << 2,
                              in[8], in[8],
                              in[7] >> 23 | in[8] << 9, in[7],
                              in[6] >> 29 | in[7] << 3, in[6]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(19, 6, 0, 12,
                                0, 18, 5, 0);
  reg_inls = _mm256_set_epi32(in[12], in[12],
                              in[11] >> 25 | in[12] << 7, in[11],
                              in[10] >> 31 | in[11] << 1, in[10],
                              in[10], in[9] >> 24 | in[10] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 13;

  return in;
}

inline static const uint32_t* unpack14_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3fff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(2, 0, 6, 0,
                                10, 0, 14, 0);
  reg_inls = _mm256_set_epi32(in[3], in[2] >> 20 | in[3] << 12,
                              in[2], in[1] >> 24 | in[2] << 8,
                              in[1], in[0] >> 28 | in[1] << 4,
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(18, 4, 0, 8,
                                0, 12, 0, 16);
  reg_inls = _mm256_set_epi32(in[6], in[6],
                              in[5] >> 22 | in[6] << 10, in[5],
                              in[4] >> 26 | in[5] << 6, in[4],
                              in[3] >> 30 | in[4] << 2, in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(2, 0, 6, 0,
                                10, 0, 14, 0);
  reg_inls = _mm256_set_epi32(in[10], in[9] >> 20 | in[10] << 12,
                              in[9], in[8] >> 24 | in[9] << 8,
                              in[8], in[7] >> 28 | in[8] << 4,
                              in[7], in[7]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(18, 4, 0, 8,
                                0, 12, 0, 16);
  reg_inls = _mm256_set_epi32(in[13], in[13],
                              in[12] >> 22 | in[13] << 10, in[12],
                              in[11] >> 26 | in[12] << 6, in[11],
                              in[10] >> 30 | in[11] << 2, in[10]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 14;

  return in;
}

inline static const uint32_t* unpack15_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7fff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(9, 0, 11, 0,
                                13, 0, 15, 0);
  reg_inls = _mm256_set_epi32(in[3], in[2] >> 26 | in[3] << 6,
                              in[2], in[1] >> 28 | in[2] << 4,
                              in[1], in[0] >> 30 | in[1] << 2,
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(1, 0, 3, 0,
                                5, 0, 7, 0);
  reg_inls = _mm256_set_epi32(in[7], in[6] >> 18 | in[7] << 14,
                              in[6], in[5] >> 20 | in[6] << 12,
                              in[5], in[4] >> 22 | in[5] << 10,
                              in[4], in[3] >> 24 | in[4] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 10, 0, 12,
                                0, 14, 0, 16);
  reg_inls = _mm256_set_epi32(in[10] >> 25 | in[11] << 7, in[10],
                              in[9] >> 27 | in[10] << 5, in[9],
                              in[8] >> 29 | in[9] << 3, in[8],
                              in[7] >> 31 | in[8] << 1, in[7]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(17, 2, 0, 4,
                                0, 6, 0, 8);
  reg_inls = _mm256_set_epi32(in[14], in[14],
                              in[13] >> 19 | in[14] << 13, in[13],
                              in[12] >> 21 | in[13] << 11, in[12],
                              in[11] >> 23 | in[12] << 9, in[11]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 15;

  return in;
}

inline static const uint32_t* unpack16_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(16, 0, 16, 0,
                                16, 0, 16, 0);
  reg_inls = _mm256_set_epi32(in[3], in[3],
                              in[2], in[2],
                              in[1], in[1],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(16, 0, 16, 0,
                                16, 0, 16, 0);
  reg_inls = _mm256_set_epi32(in[7], in[7],
                              in[6], in[6],
                              in[5], in[5],
                              in[4], in[4]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(16, 0, 16, 0,
                                16, 0, 16, 0);
  reg_inls = _mm256_set_epi32(in[11], in[11],
                              in[10], in[10],
                              in[9], in[9],
                              in[8], in[8]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(16, 0, 16, 0,
                                16, 0, 16, 0);
  reg_inls = _mm256_set_epi32(in[15], in[15],
                              in[14], in[14],
                              in[13], in[13],
                              in[12], in[12]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 16;

  return in;
}

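// Above width 16 a value is wider than half a word, so straddles become
// the common case. At width 17 exactly every second output straddles,
// which is why each group below stitches four of its eight lanes.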
inline static const uint32_t* unpack17_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1ffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 6, 0, 4,
                                0, 2, 0, 0);
  reg_inls = _mm256_set_epi32(in[3] >> 23 | in[4] << 9, in[3],
                              in[2] >> 21 | in[3] << 11, in[2],
                              in[1] >> 19 | in[2] << 13, in[1],
                              in[0] >> 17 | in[1] << 15, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(0, 14, 0, 12,
                                0, 10, 0, 8);
  reg_inls = _mm256_set_epi32(in[7] >> 31 | in[8] << 1, in[7],
                              in[6] >> 29 | in[7] << 3, in[6],
                              in[5] >> 27 | in[6] << 5, in[5],
                              in[4] >> 25 | in[5] << 7, in[4]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(7, 0, 5, 0,
                                3, 0, 1, 0);
  reg_inls = _mm256_set_epi32(in[12], in[11] >> 22 | in[12] << 10,
                              in[11], in[10] >> 20 | in[11] << 12,
                              in[10], in[9] >> 18 | in[10] << 14,
                              in[9], in[8] >> 16 | in[9] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(15, 0, 13, 0,
                                11, 0, 9, 0);
  reg_inls = _mm256_set_epi32(in[16], in[15] >> 30 | in[16] << 2,
                              in[15], in[14] >> 28 | in[15] << 4,
                              in[14], in[13] >> 26 | in[14] << 6,
                              in[13], in[12] >> 24 | in[13] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 17;

  return in;
}

inline static const uint32_t* unpack18_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3ffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 12, 0, 8,
                                0, 4, 0, 0);
  reg_inls = _mm256_set_epi32(in[3] >> 30 | in[4] << 2, in[3],
                              in[2] >> 26 | in[3] << 6, in[2],
                              in[1] >> 22 | in[2] << 10, in[1],
                              in[0] >> 18 | in[1] << 14, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(14, 0, 10, 0,
                                6, 0, 2, 0);
  reg_inls = _mm256_set_epi32(in[8], in[7] >> 28 | in[8] << 4,
                              in[7], in[6] >> 24 | in[7] << 8,
                              in[6], in[5] >> 20 | in[6] << 12,
                              in[5], in[4] >> 16 | in[5] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 12, 0, 8,
                                0, 4, 0, 0);
  reg_inls = _mm256_set_epi32(in[12] >> 30 | in[13] << 2, in[12],
                              in[11] >> 26 | in[12] << 6, in[11],
                              in[10] >> 22 | in[11] << 10, in[10],
                              in[9] >> 18 | in[10] << 14, in[9]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(14, 0, 10, 0,
                                6, 0, 2, 0);
  reg_inls = _mm256_set_epi32(in[17], in[16] >> 28 | in[17] << 4,
                              in[16], in[15] >> 24 | in[16] << 8,
                              in[15], in[14] >> 20 | in[15] << 12,
                              in[14], in[13] >> 16 | in[14] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 18;

  return in;
}

inline static const uint32_t* unpack19_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7ffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(5, 0, 0, 12,
                                0, 6, 0, 0);
  reg_inls = _mm256_set_epi32(in[4], in[3] >> 18 | in[4] << 14,
                              in[2] >> 31 | in[3] << 1, in[2],
                              in[1] >> 25 | in[2] << 7, in[1],
                              in[0] >> 19 | in[1] << 13, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(0, 10, 0, 4,
                                0, 0, 11, 0);
  reg_inls = _mm256_set_epi32(in[8] >> 29 | in[9] << 3, in[8],
                              in[7] >> 23 | in[8] << 9, in[7],
                              in[6] >> 17 | in[7] << 15, in[5] >> 30 | in[6] << 2,
                              in[5], in[4] >> 24 | in[5] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 2, 0, 0,
                                9, 0, 3, 0);
  reg_inls = _mm256_set_epi32(in[13] >> 21 | in[14] << 11, in[13],
                              in[12] >> 15 | in[13] << 17, in[11] >> 28 | in[12] << 4,
                              in[11], in[10] >> 22 | in[11] << 10,
                              in[10], in[9] >> 16 | in[10] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(13, 0, 7, 0,
                                1, 0, 0, 8);
  reg_inls = _mm256_set_epi32(in[18], in[17] >> 26 | in[18] << 6,
                              in[17], in[16] >> 20 | in[17] << 12,
                              in[16], in[15] >> 14 | in[16] << 18,
                              in[14] >> 27 | in[15] << 5, in[14]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 19;

  return in;
}

inline static const uint32_t* unpack20_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xfffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(12, 0, 4, 0,
                                0, 8, 0, 0);
  reg_inls = _mm256_set_epi32(in[4], in[3] >> 24 | in[4] << 8,
                              in[3], in[2] >> 16 | in[3] << 16,
                              in[1] >> 28 | in[2] << 4, in[1],
                              in[0] >> 20 | in[1] << 12, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(12, 0, 4, 0,
                                0, 8, 0, 0);
  reg_inls = _mm256_set_epi32(in[9], in[8] >> 24 | in[9] << 8,
                              in[8], in[7] >> 16 | in[8] << 16,
                              in[6] >> 28 | in[7] << 4, in[6],
                              in[5] >> 20 | in[6] << 12, in[5]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(12, 0, 4, 0,
                                0, 8, 0, 0);
  reg_inls = _mm256_set_epi32(in[14], in[13] >> 24 | in[14] << 8,
                              in[13], in[12] >> 16 | in[13] << 16,
                              in[11] >> 28 | in[12] << 4, in[11],
                              in[10] >> 20 | in[11] << 12, in[10]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(12, 0, 4, 0,
                                0, 8, 0, 0);
  reg_inls = _mm256_set_epi32(in[19], in[18] >> 24 | in[19] << 8,
                              in[18], in[17] >> 16 | in[18] << 16,
                              in[16] >> 28 | in[17] << 4, in[16],
                              in[15] >> 20 | in[16] << 12, in[15]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 20;

  return in;
}

inline static const uint32_t* unpack21_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1fffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 9, 0,
                                0, 10, 0, 0);
  reg_inls = _mm256_set_epi32(in[4] >> 19 | in[5] << 13, in[3] >> 30 | in[4] << 2,
                              in[3], in[2] >> 20 | in[3] << 12,
                              in[1] >> 31 | in[2] << 1, in[1],
                              in[0] >> 21 | in[1] << 11, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(0, 6, 0, 0,
                                7, 0, 0, 8);
  reg_inls = _mm256_set_epi32(in[9] >> 27 | in[10] << 5, in[9],
                              in[8] >> 17 | in[9] << 15, in[7] >> 28 | in[8] << 4,
                              in[7], in[6] >> 18 | in[7] << 14,
                              in[5] >> 29 | in[6] << 3, in[5]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(3, 0, 0, 4,
                                0, 0, 5, 0);
  reg_inls = _mm256_set_epi32(in[15], in[14] >> 14 | in[15] << 18,
                              in[13] >> 25 | in[14] << 7, in[13],
                              in[12] >> 15 | in[13] << 17, in[11] >> 26 | in[12] << 6,
                              in[11], in[10] >> 16 | in[11] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(11, 0, 1, 0,
                                0, 2, 0, 0);
  reg_inls = _mm256_set_epi32(in[20], in[19] >> 22 | in[20] << 10,
                              in[19], in[18] >> 12 | in[19] << 20,
                              in[17] >> 23 | in[18] << 9, in[17],
                              in[16] >> 13 | in[17] << 19, in[15] >> 24 | in[16] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 21;

  return in;
}

  1204  inline static const uint32_t* unpack22_32_avx2(const uint32_t* in, uint32_t* out) {
  1205    uint32_t mask = 0x3fffff;
  1206    __m256i reg_shifts, reg_inls, reg_masks;
  1207    __m256i results;
  1208  
  1209    reg_masks = _mm256_set1_epi32(mask);
  1210  
  1211    // shift the first 8 outs
  1212    reg_shifts = _mm256_set_epi32(0, 4, 0, 0,
  1213                                 2, 0, 0, 0);
  1214    reg_inls = _mm256_set_epi32(in[4] >> 26 | in[5] << 6, in[4],
  1215                               in[3] >> 14 | in[4] << 18, in[2] >> 24 | in[3] << 8,
  1216                               in[2], in[1] >> 12 | in[2] << 20,
  1217                               in[0] >> 22 | in[1] << 10, in[0]);
  1218    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1219    _mm256_storeu_si256((__m256i*)(out), results);
  1220    out += 8;
  1221  
  1222    // shift the second 8 outs
  1223    reg_shifts = _mm256_set_epi32(10, 0, 0, 8,
  1224                                  0, 0, 6, 0);
  1225    reg_inls = _mm256_set_epi32(in[10], in[9] >> 20 | in[10] << 12,
  1226                                in[8] >> 30 | in[9] << 2, in[8],
  1227                                in[7] >> 18 | in[8] << 14, in[6] >> 28 | in[7] << 4,
  1228                                in[6], in[5] >> 16 | in[6] << 16);
  1229    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1230    _mm256_storeu_si256((__m256i*)(out), results);
  1231    out += 8;
  1232  
  1233    // shift the third 8 outs
  1234    reg_shifts = _mm256_set_epi32(0, 4, 0, 0,
  1235                                  2, 0, 0, 0);
  1236    reg_inls = _mm256_set_epi32(in[15] >> 26 | in[16] << 6, in[15],
  1237                                in[14] >> 14 | in[15] << 18, in[13] >> 24 | in[14] << 8,
  1238                                in[13], in[12] >> 12 | in[13] << 20,
  1239                                in[11] >> 22 | in[12] << 10, in[11]);
  1240    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1241    _mm256_storeu_si256((__m256i*)(out), results);
  1242    out += 8;
  1243  
  1244    // shift the last 8 outs
  1245    reg_shifts = _mm256_set_epi32(10, 0, 0, 8,
  1246                                  0, 0, 6, 0);
  1247    reg_inls = _mm256_set_epi32(in[21], in[20] >> 20 | in[21] << 12,
  1248                                in[19] >> 30 | in[20] << 2, in[19],
  1249                                in[18] >> 18 | in[19] << 14, in[17] >> 28 | in[18] << 4,
  1250                                in[17], in[16] >> 16 | in[17] << 16);
  1251    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1252    _mm256_storeu_si256((__m256i*)(out), results);
  1253    out += 8;
  1254  
  1255    in += 22;
  1256  
  1257    return in;
  1258  }
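
        // For even widths the layout is periodic: gcd(22, 32) = 2, so 16
        // outputs consume exactly 11 words, and the third and fourth blocks
        // above repeat the shift patterns of the first and second with every
        // word index advanced by 11.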
  1259  
  1260  inline static const uint32_t* unpack23_32_avx2(const uint32_t* in, uint32_t* out) {
  1261    uint32_t mask = 0x7fffff;
  1262    __m256i reg_shifts, reg_inls, reg_masks;
  1263    __m256i results;
  1264  
  1265    reg_masks = _mm256_set1_epi32(mask);
  1266  
  1267    // shift the first 8 outs
  1268    reg_shifts = _mm256_set_epi32(1, 0, 0, 0,
  1269                                 5, 0, 0, 0);
  1270    reg_inls = _mm256_set_epi32(in[5], in[4] >> 10 | in[5] << 22,
  1271                               in[3] >> 19 | in[4] << 13, in[2] >> 28 | in[3] << 4,
  1272                               in[2], in[1] >> 14 | in[2] << 18,
  1273                               in[0] >> 23 | in[1] << 9, in[0]);
  1274    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1275    _mm256_storeu_si256((__m256i*)(out), results);
  1276    out += 8;
  1277  
  1278    // shift the second 8 outs
  1279    reg_shifts = _mm256_set_epi32(0, 2, 0, 0,
  1280                                  0, 6, 0, 0);
  1281    reg_inls = _mm256_set_epi32(in[10] >> 25 | in[11] << 7, in[10],
  1282                                in[9] >> 11 | in[10] << 21, in[8] >> 20 | in[9] << 12,
  1283                                in[7] >> 29 | in[8] << 3, in[7],
  1284                                in[6] >> 15 | in[7] << 17, in[5] >> 24 | in[6] << 8);
  1285    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1286    _mm256_storeu_si256((__m256i*)(out), results);
  1287    out += 8;
  1288  
  1289    // shift the third 8 outs
  1290    reg_shifts = _mm256_set_epi32(0, 0, 3, 0,
  1291                                  0, 0, 7, 0);
  1292    reg_inls = _mm256_set_epi32(in[16] >> 17 | in[17] << 15, in[15] >> 26 | in[16] << 6,
  1293                                in[15], in[14] >> 12 | in[15] << 20,
  1294                                in[13] >> 21 | in[14] << 11, in[12] >> 30 | in[13] << 2,
  1295                                in[12], in[11] >> 16 | in[12] << 16);
  1296    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1297    _mm256_storeu_si256((__m256i*)(out), results);
  1298    out += 8;
  1299  
  1300    // shift the last 8 outs
  1301    reg_shifts = _mm256_set_epi32(9, 0, 0, 4,
  1302                                  0, 0, 0, 8);
  1303    reg_inls = _mm256_set_epi32(in[22], in[21] >> 18 | in[22] << 14,
  1304                                in[20] >> 27 | in[21] << 5, in[20],
  1305                                in[19] >> 13 | in[20] << 19, in[18] >> 22 | in[19] << 10,
  1306                                in[17] >> 31 | in[18] << 1, in[17]);
  1307    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1308    _mm256_storeu_si256((__m256i*)(out), results);
  1309    out += 8;
  1310  
  1311    in += 23;
  1312  
  1313    return in;
  1314  }
  1315  
  1316  inline static const uint32_t* unpack24_32_avx2(const uint32_t* in, uint32_t* out) {
  1317    uint32_t mask = 0xffffff;
  1318    __m256i reg_shifts, reg_inls, reg_masks;
  1319    __m256i results;
  1320  
  1321    reg_masks = _mm256_set1_epi32(mask);
  1322  
  1323    // shift the first 8 outs
  1324    reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
  1325                                 8, 0, 0, 0);
  1326    reg_inls = _mm256_set_epi32(in[5], in[4] >> 16 | in[5] << 16,
  1327                               in[3] >> 24 | in[4] << 8, in[3],
  1328                               in[2], in[1] >> 16 | in[2] << 16,
  1329                               in[0] >> 24 | in[1] << 8, in[0]);
  1330    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1331    _mm256_storeu_si256((__m256i*)(out), results);
  1332    out += 8;
  1333  
  1334    // shift the second 8 outs
  1335    reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
  1336                                  8, 0, 0, 0);
  1337    reg_inls = _mm256_set_epi32(in[11], in[10] >> 16 | in[11] << 16,
  1338                                in[9] >> 24 | in[10] << 8, in[9],
  1339                                in[8], in[7] >> 16 | in[8] << 16,
  1340                                in[6] >> 24 | in[7] << 8, in[6]);
  1341    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1342    _mm256_storeu_si256((__m256i*)(out), results);
  1343    out += 8;
  1344  
  1345    // shift the third 8 outs
  1346    reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
  1347                                  8, 0, 0, 0);
  1348    reg_inls = _mm256_set_epi32(in[17], in[16] >> 16 | in[17] << 16,
  1349                                in[15] >> 24 | in[16] << 8, in[15],
  1350                                in[14], in[13] >> 16 | in[14] << 16,
  1351                                in[12] >> 24 | in[13] << 8, in[12]);
  1352    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1353    _mm256_storeu_si256((__m256i*)(out), results);
  1354    out += 8;
  1355  
  1356    // shift the last 8 outs
  1357    reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
  1358                                  8, 0, 0, 0);
  1359    reg_inls = _mm256_set_epi32(in[23], in[22] >> 16 | in[23] << 16,
  1360                                in[21] >> 24 | in[22] << 8, in[21],
  1361                                in[20], in[19] >> 16 | in[20] << 16,
  1362                                in[18] >> 24 | in[19] << 8, in[18]);
  1363    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1364    _mm256_storeu_si256((__m256i*)(out), results);
  1365    out += 8;
  1366  
  1367    in += 24;
  1368  
  1369    return in;
  1370  }
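
        // 24-bit values pack 4 to every 3 words, so all four blocks above
        // share one shift pattern (8, 0, 0, 0 twice) and the word indices
        // simply advance by 6 from block to block.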
  1371  
  1372  inline static const uint32_t* unpack25_32_avx2(const uint32_t* in, uint32_t* out) {
  1373    uint32_t mask = 0x1ffffff;
  1374    __m256i reg_shifts, reg_inls, reg_masks;
  1375    __m256i results;
  1376  
  1377    reg_masks = _mm256_set1_epi32(mask);
  1378  
  1379    // shift the first 8 outs
  1380    reg_shifts = _mm256_set_epi32(0, 0, 0, 4,
  1381                                 0, 0, 0, 0);
  1382    reg_inls = _mm256_set_epi32(in[5] >> 15 | in[6] << 17, in[4] >> 22 | in[5] << 10,
  1383                               in[3] >> 29 | in[4] << 3, in[3],
  1384                               in[2] >> 11 | in[3] << 21, in[1] >> 18 | in[2] << 14,
  1385                               in[0] >> 25 | in[1] << 7, in[0]);
  1386    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1387    _mm256_storeu_si256((__m256i*)(out), results);
  1388    out += 8;
  1389  
  1390    // shift the second 8 outs
  1391    reg_shifts = _mm256_set_epi32(0, 0, 5, 0,
  1392                                  0, 0, 1, 0);
  1393    reg_inls = _mm256_set_epi32(in[11] >> 23 | in[12] << 9, in[10] >> 30 | in[11] << 2,
  1394                                in[10], in[9] >> 12 | in[10] << 20,
  1395                                in[8] >> 19 | in[9] << 13, in[7] >> 26 | in[8] << 6,
  1396                                in[7], in[6] >> 8 | in[7] << 24);
  1397    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1398    _mm256_storeu_si256((__m256i*)(out), results);
  1399    out += 8;
  1400  
  1401    // shift the third 8 outs
  1402    reg_shifts = _mm256_set_epi32(0, 6, 0, 0,
  1403                                  0, 2, 0, 0);
  1404    reg_inls = _mm256_set_epi32(in[17] >> 31 | in[18] << 1, in[17],
  1405                                in[16] >> 13 | in[17] << 19, in[15] >> 20 | in[16] << 12,
  1406                                in[14] >> 27 | in[15] << 5, in[14],
  1407                                in[13] >> 9 | in[14] << 23, in[12] >> 16 | in[13] << 16);
  1408    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1409    _mm256_storeu_si256((__m256i*)(out), results);
  1410    out += 8;
  1411  
  1412    // shift the last 8 outs
  1413    reg_shifts = _mm256_set_epi32(7, 0, 0, 0,
  1414                                  3, 0, 0, 0);
  1415    reg_inls = _mm256_set_epi32(in[24], in[23] >> 14 | in[24] << 18,
  1416                                in[22] >> 21 | in[23] << 11, in[21] >> 28 | in[22] << 4,
  1417                                in[21], in[20] >> 10 | in[21] << 22,
  1418                                in[19] >> 17 | in[20] << 15, in[18] >> 24 | in[19] << 8);
  1419    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1420    _mm256_storeu_si256((__m256i*)(out), results);
  1421    out += 8;
  1422  
  1423    in += 25;
  1424  
  1425    return in;
  1426  }
  1427  
  1428  inline static const uint32_t* unpack26_32_avx2(const uint32_t* in, uint32_t* out) {
  1429    uint32_t mask = 0x3ffffff;
  1430    __m256i reg_shifts, reg_inls, reg_masks;
  1431    __m256i results;
  1432  
  1433    reg_masks = _mm256_set1_epi32(mask);
  1434  
  1435    // shift the first 8 outs
  1436    reg_shifts = _mm256_set_epi32(0, 0, 2, 0,
  1437                                 0, 0, 0, 0);
  1438    reg_inls = _mm256_set_epi32(in[5] >> 22 | in[6] << 10, in[4] >> 28 | in[5] << 4,
  1439                               in[4], in[3] >> 8 | in[4] << 24,
  1440                               in[2] >> 14 | in[3] << 18, in[1] >> 20 | in[2] << 12,
  1441                               in[0] >> 26 | in[1] << 6, in[0]);
  1442    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1443    _mm256_storeu_si256((__m256i*)(out), results);
  1444    out += 8;
  1445  
  1446    // shift the second 8 outs
  1447    reg_shifts = _mm256_set_epi32(6, 0, 0, 0,
  1448                                  0, 4, 0, 0);
  1449    reg_inls = _mm256_set_epi32(in[12], in[11] >> 12 | in[12] << 20,
  1450                                in[10] >> 18 | in[11] << 14, in[9] >> 24 | in[10] << 8,
  1451                                in[8] >> 30 | in[9] << 2, in[8],
  1452                                in[7] >> 10 | in[8] << 22, in[6] >> 16 | in[7] << 16);
  1453    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1454    _mm256_storeu_si256((__m256i*)(out), results);
  1455    out += 8;
  1456  
  1457    // shift the third 8 outs
  1458    reg_shifts = _mm256_set_epi32(0, 0, 2, 0,
  1459                                  0, 0, 0, 0);
  1460    reg_inls = _mm256_set_epi32(in[18] >> 22 | in[19] << 10, in[17] >> 28 | in[18] << 4,
  1461                                in[17], in[16] >> 8 | in[17] << 24,
  1462                                in[15] >> 14 | in[16] << 18, in[14] >> 20 | in[15] << 12,
  1463                                in[13] >> 26 | in[14] << 6, in[13]);
  1464    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1465    _mm256_storeu_si256((__m256i*)(out), results);
  1466    out += 8;
  1467  
  1468    // shift the last 8 outs
  1469    reg_shifts = _mm256_set_epi32(6, 0, 0, 0,
  1470                                  0, 4, 0, 0);
  1471    reg_inls = _mm256_set_epi32(in[25], in[24] >> 12 | in[25] << 20,
  1472                                in[23] >> 18 | in[24] << 14, in[22] >> 24 | in[23] << 8,
  1473                                in[21] >> 30 | in[22] << 2, in[21],
  1474                                in[20] >> 10 | in[21] << 22, in[19] >> 16 | in[20] << 16);
  1475    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1476    _mm256_storeu_si256((__m256i*)(out), results);
  1477    out += 8;
  1478  
  1479    in += 26;
  1480  
  1481    return in;
  1482  }
  1483  
  1484  inline static const uint32_t* unpack27_32_avx2(const uint32_t* in, uint32_t* out) {
  1485    uint32_t mask = 0x7ffffff;
  1486    __m256i reg_shifts, reg_inls, reg_masks;
  1487    __m256i results;
  1488  
  1489    reg_masks = _mm256_set1_epi32(mask);
  1490  
  1491    // shift the first 8 outs
  1492    reg_shifts = _mm256_set_epi32(0, 2, 0, 0,
  1493                                 0, 0, 0, 0);
  1494    reg_inls = _mm256_set_epi32(in[5] >> 29 | in[6] << 3, in[5],
  1495                               in[4] >> 7 | in[5] << 25, in[3] >> 12 | in[4] << 20,
  1496                               in[2] >> 17 | in[3] << 15, in[1] >> 22 | in[2] << 10,
  1497                               in[0] >> 27 | in[1] << 5, in[0]);
  1498    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1499    _mm256_storeu_si256((__m256i*)(out), results);
  1500    out += 8;
  1501  
  1502    // shift the second 8 outs
  1503    reg_shifts = _mm256_set_epi32(0, 0, 0, 4,
  1504                                  0, 0, 0, 0);
  1505    reg_inls = _mm256_set_epi32(in[12] >> 21 | in[13] << 11, in[11] >> 26 | in[12] << 6,
  1506                                in[10] >> 31 | in[11] << 1, in[10],
  1507                                in[9] >> 9 | in[10] << 23, in[8] >> 14 | in[9] << 18,
  1508                                in[7] >> 19 | in[8] << 13, in[6] >> 24 | in[7] << 8);
  1509    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1510    _mm256_storeu_si256((__m256i*)(out), results);
  1511    out += 8;
  1512  
  1513    // shift the third 8 outs
  1514    reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
  1515                                  1, 0, 0, 0);
  1516    reg_inls = _mm256_set_epi32(in[19] >> 13 | in[20] << 19, in[18] >> 18 | in[19] << 14,
  1517                                in[17] >> 23 | in[18] << 9, in[16] >> 28 | in[17] << 4,
  1518                                in[16], in[15] >> 6 | in[16] << 26,
  1519                                in[14] >> 11 | in[15] << 21, in[13] >> 16 | in[14] << 16);
  1520    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1521    _mm256_storeu_si256((__m256i*)(out), results);
  1522    out += 8;
  1523  
  1524    // shift the last 8 outs
  1525    reg_shifts = _mm256_set_epi32(5, 0, 0, 0,
  1526                                  0, 0, 3, 0);
  1527    reg_inls = _mm256_set_epi32(in[26], in[25] >> 10 | in[26] << 22,
  1528                                in[24] >> 15 | in[25] << 17, in[23] >> 20 | in[24] << 12,
  1529                                in[22] >> 25 | in[23] << 7, in[21] >> 30 | in[22] << 2,
  1530                                in[21], in[20] >> 8 | in[21] << 24);
  1531    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1532    _mm256_storeu_si256((__m256i*)(out), results);
  1533    out += 8;
  1534  
  1535    in += 27;
  1536  
  1537    return in;
  1538  }
  1539  
  1540  inline static const uint32_t* unpack28_32_avx2(const uint32_t* in, uint32_t* out) {
  1541    uint32_t mask = 0xfffffff;
  1542    __m256i reg_shifts, reg_inls, reg_masks;
  1543    __m256i results;
  1544  
  1545    reg_masks = _mm256_set1_epi32(mask);
  1546  
  1547    // shift the first 8 outs
  1548    reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
  1549                                 0, 0, 0, 0);
  1550    reg_inls = _mm256_set_epi32(in[6], in[5] >> 8 | in[6] << 24,
  1551                               in[4] >> 12 | in[5] << 20, in[3] >> 16 | in[4] << 16,
  1552                               in[2] >> 20 | in[3] << 12, in[1] >> 24 | in[2] << 8,
  1553                               in[0] >> 28 | in[1] << 4, in[0]);
  1554    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1555    _mm256_storeu_si256((__m256i*)(out), results);
  1556    out += 8;
  1557  
  1558    // shift the second 8 outs
  1559    reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
  1560                                  0, 0, 0, 0);
  1561    reg_inls = _mm256_set_epi32(in[13], in[12] >> 8 | in[13] << 24,
  1562                                in[11] >> 12 | in[12] << 20, in[10] >> 16 | in[11] << 16,
  1563                                in[9] >> 20 | in[10] << 12, in[8] >> 24 | in[9] << 8,
  1564                                in[7] >> 28 | in[8] << 4, in[7]);
  1565    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1566    _mm256_storeu_si256((__m256i*)(out), results);
  1567    out += 8;
  1568  
  1569    // shift the third 8 outs
  1570    reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
  1571                                  0, 0, 0, 0);
  1572    reg_inls = _mm256_set_epi32(in[20], in[19] >> 8 | in[20] << 24,
  1573                                in[18] >> 12 | in[19] << 20, in[17] >> 16 | in[18] << 16,
  1574                                in[16] >> 20 | in[17] << 12, in[15] >> 24 | in[16] << 8,
  1575                                in[14] >> 28 | in[15] << 4, in[14]);
  1576    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1577    _mm256_storeu_si256((__m256i*)(out), results);
  1578    out += 8;
  1579  
  1580    // shift the last 8 outs
  1581    reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
  1582                                  0, 0, 0, 0);
  1583    reg_inls = _mm256_set_epi32(in[27], in[26] >> 8 | in[27] << 24,
  1584                                in[25] >> 12 | in[26] << 20, in[24] >> 16 | in[25] << 16,
  1585                                in[23] >> 20 | in[24] << 12, in[22] >> 24 | in[23] << 8,
  1586                                in[21] >> 28 | in[22] << 4, in[21]);
  1587    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1588    _mm256_storeu_si256((__m256i*)(out), results);
  1589    out += 8;
  1590  
  1591    in += 28;
  1592  
  1593    return in;
  1594  }
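
        // 28 = 4 * 7, so 8 outputs consume exactly 7 words: a single shift
        // pattern serves all four blocks, with word indices advancing by 7.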
  1595  
  1596  inline static const uint32_t* unpack29_32_avx2(const uint32_t* in, uint32_t* out) {
  1597    uint32_t mask = 0x1fffffff;
  1598    __m256i reg_shifts, reg_inls, reg_masks;
  1599    __m256i results;
  1600  
  1601    reg_masks = _mm256_set1_epi32(mask);
  1602  
  1603    // shift the first 8 outs
  1604    reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
  1605                                 0, 0, 0, 0);
  1606    reg_inls = _mm256_set_epi32(in[6] >> 11 | in[7] << 21, in[5] >> 14 | in[6] << 18,
  1607                               in[4] >> 17 | in[5] << 15, in[3] >> 20 | in[4] << 12,
  1608                               in[2] >> 23 | in[3] << 9, in[1] >> 26 | in[2] << 6,
  1609                               in[0] >> 29 | in[1] << 3, in[0]);
  1610    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1611    _mm256_storeu_si256((__m256i*)(out), results);
  1612    out += 8;
  1613  
  1614    // shift the second 8 outs
  1615    reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
  1616                                  0, 2, 0, 0);
  1617    reg_inls = _mm256_set_epi32(in[13] >> 19 | in[14] << 13, in[12] >> 22 | in[13] << 10,
  1618                                in[11] >> 25 | in[12] << 7, in[10] >> 28 | in[11] << 4,
  1619                                in[9] >> 31 | in[10] << 1, in[9],
  1620                                in[8] >> 5 | in[9] << 27, in[7] >> 8 | in[8] << 24);
  1621    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1622    _mm256_storeu_si256((__m256i*)(out), results);
  1623    out += 8;
  1624  
  1625    // shift the third 8 outs
  1626    reg_shifts = _mm256_set_epi32(0, 0, 1, 0,
  1627                                  0, 0, 0, 0);
  1628    reg_inls = _mm256_set_epi32(in[20] >> 27 | in[21] << 5, in[19] >> 30 | in[20] << 2,
  1629                                in[19], in[18] >> 4 | in[19] << 28,
  1630                                in[17] >> 7 | in[18] << 25, in[16] >> 10 | in[17] << 22,
  1631                                in[15] >> 13 | in[16] << 19, in[14] >> 16 | in[15] << 16);
  1632    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1633    _mm256_storeu_si256((__m256i*)(out), results);
  1634    out += 8;
  1635  
  1636    // shift the last 8 outs
  1637    reg_shifts = _mm256_set_epi32(3, 0, 0, 0,
  1638                                  0, 0, 0, 0);
  1639    reg_inls = _mm256_set_epi32(in[28], in[27] >> 6 | in[28] << 26,
  1640                                in[26] >> 9 | in[27] << 23, in[25] >> 12 | in[26] << 20,
  1641                                in[24] >> 15 | in[25] << 17, in[23] >> 18 | in[24] << 14,
  1642                                in[22] >> 21 | in[23] << 11, in[21] >> 24 | in[22] << 8);
  1643    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1644    _mm256_storeu_si256((__m256i*)(out), results);
  1645    out += 8;
  1646  
  1647    in += 29;
  1648  
  1649    return in;
  1650  }
  1651  
  1652  inline static const uint32_t* unpack30_32_avx2(const uint32_t* in, uint32_t* out) {
  1653    uint32_t mask = 0x3fffffff;
  1654    __m256i reg_shifts, reg_inls, reg_masks;
  1655    __m256i results;
  1656  
  1657    reg_masks = _mm256_set1_epi32(mask);
  1658  
  1659    // shift the first 8 outs
  1660    reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
  1661                                 0, 0, 0, 0);
  1662    reg_inls = _mm256_set_epi32(in[6] >> 18 | in[7] << 14, in[5] >> 20 | in[6] << 12,
  1663                               in[4] >> 22 | in[5] << 10, in[3] >> 24 | in[4] << 8,
  1664                               in[2] >> 26 | in[3] << 6, in[1] >> 28 | in[2] << 4,
  1665                               in[0] >> 30 | in[1] << 2, in[0]);
  1666    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1667    _mm256_storeu_si256((__m256i*)(out), results);
  1668    out += 8;
  1669  
  1670    // shift the second 8 outs
  1671    reg_shifts = _mm256_set_epi32(2, 0, 0, 0,
  1672                                  0, 0, 0, 0);
  1673    reg_inls = _mm256_set_epi32(in[14], in[13] >> 4 | in[14] << 28,
  1674                                in[12] >> 6 | in[13] << 26, in[11] >> 8 | in[12] << 24,
  1675                                in[10] >> 10 | in[11] << 22, in[9] >> 12 | in[10] << 20,
  1676                                in[8] >> 14 | in[9] << 18, in[7] >> 16 | in[8] << 16);
  1677    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1678    _mm256_storeu_si256((__m256i*)(out), results);
  1679    out += 8;
  1680  
  1681    // shift the third 8 outs
  1682    reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
  1683                                  0, 0, 0, 0);
  1684    reg_inls = _mm256_set_epi32(in[21] >> 18 | in[22] << 14, in[20] >> 20 | in[21] << 12,
  1685                                in[19] >> 22 | in[20] << 10, in[18] >> 24 | in[19] << 8,
  1686                                in[17] >> 26 | in[18] << 6, in[16] >> 28 | in[17] << 4,
  1687                                in[15] >> 30 | in[16] << 2, in[15]);
  1688    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1689    _mm256_storeu_si256((__m256i*)(out), results);
  1690    out += 8;
  1691  
  1692    // shift the last 8 outs
  1693    reg_shifts = _mm256_set_epi32(2, 0, 0, 0,
  1694                                  0, 0, 0, 0);
  1695    reg_inls = _mm256_set_epi32(in[29], in[28] >> 4 | in[29] << 28,
  1696                                in[27] >> 6 | in[28] << 26, in[26] >> 8 | in[27] << 24,
  1697                                in[25] >> 10 | in[26] << 22, in[24] >> 12 | in[25] << 20,
  1698                                in[23] >> 14 | in[24] << 18, in[22] >> 16 | in[23] << 16);
  1699    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1700    _mm256_storeu_si256((__m256i*)(out), results);
  1701    out += 8;
  1702  
  1703    in += 30;
  1704  
  1705    return in;
  1706  }
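
        // With 30-bit values, 16 outputs consume exactly 15 words, so the
        // third and fourth blocks mirror the first and second with word
        // indices advanced by 15.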
  1707  
  1708  inline static const uint32_t* unpack31_32_avx2(const uint32_t* in, uint32_t* out) {
  1709    uint32_t mask = 0x7fffffff;
  1710    __m256i reg_shifts, reg_inls, reg_masks;
  1711    __m256i results;
  1712  
  1713    reg_masks = _mm256_set1_epi32(mask);
  1714  
  1715    // shift the first 8 outs
  1716    reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
  1717                                 0, 0, 0, 0);
  1718    reg_inls = _mm256_set_epi32(in[6] >> 25 | in[7] << 7, in[5] >> 26 | in[6] << 6,
  1719                               in[4] >> 27 | in[5] << 5, in[3] >> 28 | in[4] << 4,
  1720                               in[2] >> 29 | in[3] << 3, in[1] >> 30 | in[2] << 2,
  1721                               in[0] >> 31 | in[1] << 1, in[0]);
  1722    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1723    _mm256_storeu_si256((__m256i*)(out), results);
  1724    out += 8;
  1725  
  1726    // shift the second 8 outs
  1727    reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
  1728                                  0, 0, 0, 0);
  1729    reg_inls = _mm256_set_epi32(in[14] >> 17 | in[15] << 15, in[13] >> 18 | in[14] << 14,
  1730                                in[12] >> 19 | in[13] << 13, in[11] >> 20 | in[12] << 12,
  1731                                in[10] >> 21 | in[11] << 11, in[9] >> 22 | in[10] << 10,
  1732                                in[8] >> 23 | in[9] << 9, in[7] >> 24 | in[8] << 8);
  1733    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1734    _mm256_storeu_si256((__m256i*)(out), results);
  1735    out += 8;
  1736  
  1737    // shift the third 8 outs
  1738    reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
  1739                                  0, 0, 0, 0);
  1740    reg_inls = _mm256_set_epi32(in[22] >> 9 | in[23] << 23, in[21] >> 10 | in[22] << 22,
  1741                                in[20] >> 11 | in[21] << 21, in[19] >> 12 | in[20] << 20,
  1742                                in[18] >> 13 | in[19] << 19, in[17] >> 14 | in[18] << 18,
  1743                                in[16] >> 15 | in[17] << 17, in[15] >> 16 | in[16] << 16);
  1744    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1745    _mm256_storeu_si256((__m256i*)(out), results);
  1746    out += 8;
  1747  
  1748    // shift the last 8 outs
  1749    reg_shifts = _mm256_set_epi32(1, 0, 0, 0,
  1750                                  0, 0, 0, 0);
  1751    reg_inls = _mm256_set_epi32(in[30], in[29] >> 2 | in[30] << 30,
  1752                                in[28] >> 3 | in[29] << 29, in[27] >> 4 | in[28] << 28,
  1753                                in[26] >> 5 | in[27] << 27, in[25] >> 6 | in[26] << 26,
  1754                                in[24] >> 7 | in[25] << 25, in[23] >> 8 | in[24] << 24);
  1755    results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  1756    _mm256_storeu_si256((__m256i*)(out), results);
  1757    out += 8;
  1758  
  1759    in += 31;
  1760  
  1761    return in;
  1762  }
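
        // At 31 bits only out[0] of each group of 32 is word-aligned; every
        // other lane is stitched from two adjacent words, which is why almost
        // all shifts are zero -- the ">> k | << (32 - k)" stitching already
        // right-aligns each value.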
  1763  
  1764  inline static const uint32_t* unpack32_32_avx2(const uint32_t* in, uint32_t* out) {
  1765    memcpy(out, in, 32 * sizeof(*out));
  1766    in += 32;
  1767    out += 32;
  1768  
  1769    return in;
  1770  }
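
        // The 32-bit case is an identity copy. The trailing "out += 32" has no
        // effect outside the function (out is passed by value); it is kept,
        // presumably, for symmetry with the generated siblings.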
  1771  
  1772  int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
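          // Round batch_size down to a whole number of 32-value blocks; the
          // rounded count is returned, so any remainder is left for the caller
          // to decode by other means.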
  1773    batch_size = batch_size / 32 * 32;
  1774    int num_loops = batch_size / 32;
  1775  
  1776    switch (num_bits) {
  1777      case 0:
  1778        for (int i = 0; i < num_loops; ++i) in = unpack0_32_avx2(in, out + i * 32);
  1779        break;
  1780      case 1:
  1781        for (int i = 0; i < num_loops; ++i) in = unpack1_32_avx2(in, out + i * 32);
  1782        break;
  1783      case 2:
  1784        for (int i = 0; i < num_loops; ++i) in = unpack2_32_avx2(in, out + i * 32);
  1785        break;
  1786      case 3:
  1787        for (int i = 0; i < num_loops; ++i) in = unpack3_32_avx2(in, out + i * 32);
  1788        break;
  1789      case 4:
  1790        for (int i = 0; i < num_loops; ++i) in = unpack4_32_avx2(in, out + i * 32);
  1791        break;
  1792      case 5:
  1793        for (int i = 0; i < num_loops; ++i) in = unpack5_32_avx2(in, out + i * 32);
  1794        break;
  1795      case 6:
  1796        for (int i = 0; i < num_loops; ++i) in = unpack6_32_avx2(in, out + i * 32);
  1797        break;
  1798      case 7:
  1799        for (int i = 0; i < num_loops; ++i) in = unpack7_32_avx2(in, out + i * 32);
  1800        break;
  1801      case 8:
  1802        for (int i = 0; i < num_loops; ++i) in = unpack8_32_avx2(in, out + i * 32);
  1803        break;
  1804      case 9:
  1805        for (int i = 0; i < num_loops; ++i) in = unpack9_32_avx2(in, out + i * 32);
  1806        break;
  1807      case 10:
  1808        for (int i = 0; i < num_loops; ++i) in = unpack10_32_avx2(in, out + i * 32);
  1809        break;
  1810      case 11:
  1811        for (int i = 0; i < num_loops; ++i) in = unpack11_32_avx2(in, out + i * 32);
  1812        break;
  1813      case 12:
  1814        for (int i = 0; i < num_loops; ++i) in = unpack12_32_avx2(in, out + i * 32);
  1815        break;
  1816      case 13:
  1817        for (int i = 0; i < num_loops; ++i) in = unpack13_32_avx2(in, out + i * 32);
  1818        break;
  1819      case 14:
  1820        for (int i = 0; i < num_loops; ++i) in = unpack14_32_avx2(in, out + i * 32);
  1821        break;
  1822      case 15:
  1823        for (int i = 0; i < num_loops; ++i) in = unpack15_32_avx2(in, out + i * 32);
  1824        break;
  1825      case 16:
  1826        for (int i = 0; i < num_loops; ++i) in = unpack16_32_avx2(in, out + i * 32);
  1827        break;
  1828      case 17:
  1829        for (int i = 0; i < num_loops; ++i) in = unpack17_32_avx2(in, out + i * 32);
  1830        break;
  1831      case 18:
  1832        for (int i = 0; i < num_loops; ++i) in = unpack18_32_avx2(in, out + i * 32);
  1833        break;
  1834      case 19:
  1835        for (int i = 0; i < num_loops; ++i) in = unpack19_32_avx2(in, out + i * 32);
  1836        break;
  1837      case 20:
  1838        for (int i = 0; i < num_loops; ++i) in = unpack20_32_avx2(in, out + i * 32);
  1839        break;
  1840      case 21:
  1841        for (int i = 0; i < num_loops; ++i) in = unpack21_32_avx2(in, out + i * 32);
  1842        break;
  1843      case 22:
  1844        for (int i = 0; i < num_loops; ++i) in = unpack22_32_avx2(in, out + i * 32);
  1845        break;
  1846      case 23:
  1847        for (int i = 0; i < num_loops; ++i) in = unpack23_32_avx2(in, out + i * 32);
  1848        break;
  1849      case 24:
  1850        for (int i = 0; i < num_loops; ++i) in = unpack24_32_avx2(in, out + i * 32);
  1851        break;
  1852      case 25:
  1853        for (int i = 0; i < num_loops; ++i) in = unpack25_32_avx2(in, out + i * 32);
  1854        break;
  1855      case 26:
  1856        for (int i = 0; i < num_loops; ++i) in = unpack26_32_avx2(in, out + i * 32);
  1857        break;
  1858      case 27:
  1859        for (int i = 0; i < num_loops; ++i) in = unpack27_32_avx2(in, out + i * 32);
  1860        break;
  1861      case 28:
  1862        for (int i = 0; i < num_loops; ++i) in = unpack28_32_avx2(in, out + i * 32);
  1863        break;
  1864      case 29:
  1865        for (int i = 0; i < num_loops; ++i) in = unpack29_32_avx2(in, out + i * 32);
  1866        break;
  1867      case 30:
  1868        for (int i = 0; i < num_loops; ++i) in = unpack30_32_avx2(in, out + i * 32);
  1869        break;
  1870      case 31:
  1871        for (int i = 0; i < num_loops; ++i) in = unpack31_32_avx2(in, out + i * 32);
  1872        break;
  1873      case 32:
  1874        for (int i = 0; i < num_loops; ++i) in = unpack32_32_avx2(in, out + i * 32);
  1875        break;
  1876    }
  1877  
  1878    return batch_size;
  1879  }
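
        // Hypothetical usage sketch (buffer names invented): decode 64 values
        // that were packed at 21 bits each. The input must hold at least
        // (batch_size / 32) * num_bits words and the output batch_size values.
        //
        //   uint32_t packed[2 * 21];   // filled elsewhere: 64 values, 21 bits
        //   uint32_t values[64];
        //   int decoded = unpack32_avx2(packed, values, 64, 21);
        //   // decoded == 64; values[i] is the i-th unpacked 21-bit integer.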