github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/utils/_lib/bit_packing_avx2.c (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 #include <stdint.h> 18 #include <immintrin.h> 19 #include <string.h> 20 21 inline const uint32_t* unpack0_32_avx2(const uint32_t* in, uint32_t* out) { 22 memset(out, 0x0, 32 * sizeof(*out)); 23 out += 32; 24 25 return in; 26 } 27 28 inline static const uint32_t* unpack1_32_avx2(const uint32_t* in, uint32_t* out) { 29 uint32_t mask = 0x1; 30 __m256i reg_shifts, reg_inls, reg_masks; 31 __m256i results; 32 33 reg_masks = _mm256_set1_epi32(mask); 34 35 // shift the first 8 outs 36 reg_shifts = _mm256_set_epi32(7, 6, 5, 4, 37 3, 2, 1, 0); 38 reg_inls = _mm256_set_epi32(in[0], in[0], 39 in[0], in[0], 40 in[0], in[0], 41 in[0], in[0]); 42 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 43 _mm256_storeu_si256((__m256i*)(out), results); 44 out += 8; 45 46 // shift the second 8 outs 47 reg_shifts = _mm256_set_epi32(15, 14, 13, 12, 48 11, 10, 9, 8); 49 reg_inls = _mm256_set_epi32(in[0], in[0], 50 in[0], in[0], 51 in[0], in[0], 52 in[0], in[0]); 53 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 54 _mm256_storeu_si256((__m256i*)(out), results); 55 out += 8; 56 57 // shift the third 8 outs 58 reg_shifts = _mm256_set_epi32(23, 22, 21, 20, 59 19, 18, 17, 16); 60 reg_inls = _mm256_set_epi32(in[0], in[0], 61 in[0], in[0], 62 in[0], in[0], 63 in[0], in[0]); 64 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 65 _mm256_storeu_si256((__m256i*)(out), results); 66 out += 8; 67 68 // shift the last 8 outs 69 reg_shifts = _mm256_set_epi32(31, 30, 29, 28, 70 27, 26, 25, 24); 71 reg_inls = _mm256_set_epi32(in[0], in[0], 72 in[0], in[0], 73 in[0], in[0], 74 in[0], in[0]); 75 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 76 _mm256_storeu_si256((__m256i*)(out), results); 77 out += 8; 78 79 in += 1; 80 81 return in; 82 } 83 84 inline static const uint32_t* unpack2_32_avx2(const uint32_t* in, uint32_t* out) { 85 uint32_t mask = 0x3; 86 __m256i reg_shifts, reg_inls, reg_masks; 87 __m256i results; 88 89 reg_masks = _mm256_set1_epi32(mask); 90 91 // shift the first 8 outs 92 reg_shifts = _mm256_set_epi32(14, 12, 10, 8, 93 6, 4, 2, 0); 94 reg_inls = _mm256_set_epi32(in[0], in[0], 95 in[0], in[0], 96 in[0], in[0], 97 in[0], in[0]); 98 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 99 _mm256_storeu_si256((__m256i*)(out), results); 100 out += 8; 101 102 // shift the second 8 outs 103 reg_shifts = _mm256_set_epi32(30, 28, 26, 24, 104 22, 20, 18, 16); 105 reg_inls = _mm256_set_epi32(in[0], in[0], 106 in[0], in[0], 107 in[0], in[0], 108 in[0], in[0]); 109 
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 110 _mm256_storeu_si256((__m256i*)(out), results); 111 out += 8; 112 113 // shift the third 8 outs 114 reg_shifts = _mm256_set_epi32(14, 12, 10, 8, 115 6, 4, 2, 0); 116 reg_inls = _mm256_set_epi32(in[1], in[1], 117 in[1], in[1], 118 in[1], in[1], 119 in[1], in[1]); 120 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 121 _mm256_storeu_si256((__m256i*)(out), results); 122 out += 8; 123 124 // shift the last 8 outs 125 reg_shifts = _mm256_set_epi32(30, 28, 26, 24, 126 22, 20, 18, 16); 127 reg_inls = _mm256_set_epi32(in[1], in[1], 128 in[1], in[1], 129 in[1], in[1], 130 in[1], in[1]); 131 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 132 _mm256_storeu_si256((__m256i*)(out), results); 133 out += 8; 134 135 in += 2; 136 137 return in; 138 } 139 140 inline static const uint32_t* unpack3_32_avx2(const uint32_t* in, uint32_t* out) { 141 uint32_t mask = 0x7; 142 __m256i reg_shifts, reg_inls, reg_masks; 143 __m256i results; 144 145 reg_masks = _mm256_set1_epi32(mask); 146 147 // shift the first 8 outs 148 reg_shifts = _mm256_set_epi32(21, 18, 15, 12, 149 9, 6, 3, 0); 150 reg_inls = _mm256_set_epi32(in[0], in[0], 151 in[0], in[0], 152 in[0], in[0], 153 in[0], in[0]); 154 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 155 _mm256_storeu_si256((__m256i*)(out), results); 156 out += 8; 157 158 // shift the second 8 outs 159 reg_shifts = _mm256_set_epi32(13, 10, 7, 4, 160 1, 0, 27, 24); 161 reg_inls = _mm256_set_epi32(in[1], in[1], 162 in[1], in[1], 163 in[1], in[0] >> 30 | in[1] << 2, 164 in[0], in[0]); 165 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 166 _mm256_storeu_si256((__m256i*)(out), results); 167 out += 8; 168 169 // shift the third 8 outs 170 reg_shifts = _mm256_set_epi32(5, 2, 0, 28, 171 25, 22, 19, 16); 172 reg_inls = _mm256_set_epi32(in[2], in[2], 173 in[1] >> 31 | in[2] << 1, in[1], 174 in[1], in[1], 175 in[1], in[1]); 176 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 177 _mm256_storeu_si256((__m256i*)(out), results); 178 out += 8; 179 180 // shift the last 8 outs 181 reg_shifts = _mm256_set_epi32(29, 26, 23, 20, 182 17, 14, 11, 8); 183 reg_inls = _mm256_set_epi32(in[2], in[2], 184 in[2], in[2], 185 in[2], in[2], 186 in[2], in[2]); 187 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 188 _mm256_storeu_si256((__m256i*)(out), results); 189 out += 8; 190 191 in += 3; 192 193 return in; 194 } 195 196 inline static const uint32_t* unpack4_32_avx2(const uint32_t* in, uint32_t* out) { 197 uint32_t mask = 0xf; 198 __m256i reg_shifts, reg_inls, reg_masks; 199 __m256i results; 200 201 reg_masks = _mm256_set1_epi32(mask); 202 203 // shift the first 8 outs 204 reg_shifts = _mm256_set_epi32(28, 24, 20, 16, 205 12, 8, 4, 0); 206 reg_inls = _mm256_set_epi32(in[0], in[0], 207 in[0], in[0], 208 in[0], in[0], 209 in[0], in[0]); 210 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 211 _mm256_storeu_si256((__m256i*)(out), results); 212 out += 8; 213 214 // shift the second 8 outs 215 reg_shifts = _mm256_set_epi32(28, 24, 20, 16, 216 12, 8, 4, 0); 217 reg_inls = _mm256_set_epi32(in[1], in[1], 218 in[1], in[1], 219 in[1], in[1], 220 in[1], in[1]); 221 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 222 _mm256_storeu_si256((__m256i*)(out), results); 223 out += 8; 224 225 // shift 
the third 8 outs 226 reg_shifts = _mm256_set_epi32(28, 24, 20, 16, 227 12, 8, 4, 0); 228 reg_inls = _mm256_set_epi32(in[2], in[2], 229 in[2], in[2], 230 in[2], in[2], 231 in[2], in[2]); 232 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 233 _mm256_storeu_si256((__m256i*)(out), results); 234 out += 8; 235 236 // shift the last 8 outs 237 reg_shifts = _mm256_set_epi32(28, 24, 20, 16, 238 12, 8, 4, 0); 239 reg_inls = _mm256_set_epi32(in[3], in[3], 240 in[3], in[3], 241 in[3], in[3], 242 in[3], in[3]); 243 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 244 _mm256_storeu_si256((__m256i*)(out), results); 245 out += 8; 246 247 in += 4; 248 249 return in; 250 } 251 252 inline static const uint32_t* unpack5_32_avx2(const uint32_t* in, uint32_t* out) { 253 uint32_t mask = 0x1f; 254 __m256i reg_shifts, reg_inls, reg_masks; 255 __m256i results; 256 257 reg_masks = _mm256_set1_epi32(mask); 258 259 // shift the first 8 outs 260 reg_shifts = _mm256_set_epi32(3, 0, 25, 20, 261 15, 10, 5, 0); 262 reg_inls = _mm256_set_epi32(in[1], in[0] >> 30 | in[1] << 2, 263 in[0], in[0], 264 in[0], in[0], 265 in[0], in[0]); 266 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 267 _mm256_storeu_si256((__m256i*)(out), results); 268 out += 8; 269 270 // shift the second 8 outs 271 reg_shifts = _mm256_set_epi32(11, 6, 1, 0, 272 23, 18, 13, 8); 273 reg_inls = _mm256_set_epi32(in[2], in[2], 274 in[2], in[1] >> 28 | in[2] << 4, 275 in[1], in[1], 276 in[1], in[1]); 277 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 278 _mm256_storeu_si256((__m256i*)(out), results); 279 out += 8; 280 281 // shift the third 8 outs 282 reg_shifts = _mm256_set_epi32(19, 14, 9, 4, 283 0, 26, 21, 16); 284 reg_inls = _mm256_set_epi32(in[3], in[3], 285 in[3], in[3], 286 in[2] >> 31 | in[3] << 1, in[2], 287 in[2], in[2]); 288 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 289 _mm256_storeu_si256((__m256i*)(out), results); 290 out += 8; 291 292 // shift the last 8 outs 293 reg_shifts = _mm256_set_epi32(27, 22, 17, 12, 294 7, 2, 0, 24); 295 reg_inls = _mm256_set_epi32(in[4], in[4], 296 in[4], in[4], 297 in[4], in[4], 298 in[3] >> 29 | in[4] << 3, in[3]); 299 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 300 _mm256_storeu_si256((__m256i*)(out), results); 301 out += 8; 302 303 in += 5; 304 305 return in; 306 } 307 308 inline static const uint32_t* unpack6_32_avx2(const uint32_t* in, uint32_t* out) { 309 uint32_t mask = 0x3f; 310 __m256i reg_shifts, reg_inls, reg_masks; 311 __m256i results; 312 313 reg_masks = _mm256_set1_epi32(mask); 314 315 // shift the first 8 outs 316 reg_shifts = _mm256_set_epi32(10, 4, 0, 24, 317 18, 12, 6, 0); 318 reg_inls = _mm256_set_epi32(in[1], in[1], 319 in[0] >> 30 | in[1] << 2, in[0], 320 in[0], in[0], 321 in[0], in[0]); 322 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 323 _mm256_storeu_si256((__m256i*)(out), results); 324 out += 8; 325 326 // shift the second 8 outs 327 reg_shifts = _mm256_set_epi32(26, 20, 14, 8, 328 2, 0, 22, 16); 329 reg_inls = _mm256_set_epi32(in[2], in[2], 330 in[2], in[2], 331 in[2], in[1] >> 28 | in[2] << 4, 332 in[1], in[1]); 333 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 334 _mm256_storeu_si256((__m256i*)(out), results); 335 out += 8; 336 337 // shift the third 8 outs 338 reg_shifts = _mm256_set_epi32(10, 4, 0, 24, 339 18, 12, 6, 0); 340 
reg_inls = _mm256_set_epi32(in[4], in[4], 341 in[3] >> 30 | in[4] << 2, in[3], 342 in[3], in[3], 343 in[3], in[3]); 344 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 345 _mm256_storeu_si256((__m256i*)(out), results); 346 out += 8; 347 348 // shift the last 8 outs 349 reg_shifts = _mm256_set_epi32(26, 20, 14, 8, 350 2, 0, 22, 16); 351 reg_inls = _mm256_set_epi32(in[5], in[5], 352 in[5], in[5], 353 in[5], in[4] >> 28 | in[5] << 4, 354 in[4], in[4]); 355 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 356 _mm256_storeu_si256((__m256i*)(out), results); 357 out += 8; 358 359 in += 6; 360 361 return in; 362 } 363 364 inline static const uint32_t* unpack7_32_avx2(const uint32_t* in, uint32_t* out) { 365 uint32_t mask = 0x7f; 366 __m256i reg_shifts, reg_inls, reg_masks; 367 __m256i results; 368 369 reg_masks = _mm256_set1_epi32(mask); 370 371 // shift the first 8 outs 372 reg_shifts = _mm256_set_epi32(17, 10, 3, 0, 373 21, 14, 7, 0); 374 reg_inls = _mm256_set_epi32(in[1], in[1], 375 in[1], in[0] >> 28 | in[1] << 4, 376 in[0], in[0], 377 in[0], in[0]); 378 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 379 _mm256_storeu_si256((__m256i*)(out), results); 380 out += 8; 381 382 // shift the second 8 outs 383 reg_shifts = _mm256_set_epi32(9, 2, 0, 20, 384 13, 6, 0, 24); 385 reg_inls = _mm256_set_epi32(in[3], in[3], 386 in[2] >> 27 | in[3] << 5, in[2], 387 in[2], in[2], 388 in[1] >> 31 | in[2] << 1, in[1]); 389 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 390 _mm256_storeu_si256((__m256i*)(out), results); 391 out += 8; 392 393 // shift the third 8 outs 394 reg_shifts = _mm256_set_epi32(1, 0, 19, 12, 395 5, 0, 23, 16); 396 reg_inls = _mm256_set_epi32(in[5], in[4] >> 26 | in[5] << 6, 397 in[4], in[4], 398 in[4], in[3] >> 30 | in[4] << 2, 399 in[3], in[3]); 400 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 401 _mm256_storeu_si256((__m256i*)(out), results); 402 out += 8; 403 404 // shift the last 8 outs 405 reg_shifts = _mm256_set_epi32(25, 18, 11, 4, 406 0, 22, 15, 8); 407 reg_inls = _mm256_set_epi32(in[6], in[6], 408 in[6], in[6], 409 in[5] >> 29 | in[6] << 3, in[5], 410 in[5], in[5]); 411 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 412 _mm256_storeu_si256((__m256i*)(out), results); 413 out += 8; 414 415 in += 7; 416 417 return in; 418 } 419 420 inline static const uint32_t* unpack8_32_avx2(const uint32_t* in, uint32_t* out) { 421 uint32_t mask = 0xff; 422 __m256i reg_shifts, reg_inls, reg_masks; 423 __m256i results; 424 425 reg_masks = _mm256_set1_epi32(mask); 426 427 // shift the first 8 outs 428 reg_shifts = _mm256_set_epi32(24, 16, 8, 0, 429 24, 16, 8, 0); 430 reg_inls = _mm256_set_epi32(in[1], in[1], 431 in[1], in[1], 432 in[0], in[0], 433 in[0], in[0]); 434 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 435 _mm256_storeu_si256((__m256i*)(out), results); 436 out += 8; 437 438 // shift the second 8 outs 439 reg_shifts = _mm256_set_epi32(24, 16, 8, 0, 440 24, 16, 8, 0); 441 reg_inls = _mm256_set_epi32(in[3], in[3], 442 in[3], in[3], 443 in[2], in[2], 444 in[2], in[2]); 445 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 446 _mm256_storeu_si256((__m256i*)(out), results); 447 out += 8; 448 449 // shift the third 8 outs 450 reg_shifts = _mm256_set_epi32(24, 16, 8, 0, 451 24, 16, 8, 0); 452 reg_inls = _mm256_set_epi32(in[5], in[5], 453 in[5], 
in[5], 454 in[4], in[4], 455 in[4], in[4]); 456 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 457 _mm256_storeu_si256((__m256i*)(out), results); 458 out += 8; 459 460 // shift the last 8 outs 461 reg_shifts = _mm256_set_epi32(24, 16, 8, 0, 462 24, 16, 8, 0); 463 reg_inls = _mm256_set_epi32(in[7], in[7], 464 in[7], in[7], 465 in[6], in[6], 466 in[6], in[6]); 467 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 468 _mm256_storeu_si256((__m256i*)(out), results); 469 out += 8; 470 471 in += 8; 472 473 return in; 474 } 475 476 inline static const uint32_t* unpack9_32_avx2(const uint32_t* in, uint32_t* out) { 477 uint32_t mask = 0x1ff; 478 __m256i reg_shifts, reg_inls, reg_masks; 479 __m256i results; 480 481 reg_masks = _mm256_set1_epi32(mask); 482 483 // shift the first 8 outs 484 reg_shifts = _mm256_set_epi32(0, 22, 13, 4, 485 0, 18, 9, 0); 486 reg_inls = _mm256_set_epi32(in[1] >> 31 | in[2] << 1, in[1], 487 in[1], in[1], 488 in[0] >> 27 | in[1] << 5, in[0], 489 in[0], in[0]); 490 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 491 _mm256_storeu_si256((__m256i*)(out), results); 492 out += 8; 493 494 // shift the second 8 outs 495 reg_shifts = _mm256_set_epi32(7, 0, 21, 12, 496 3, 0, 17, 8); 497 reg_inls = _mm256_set_epi32(in[4], in[3] >> 30 | in[4] << 2, 498 in[3], in[3], 499 in[3], in[2] >> 26 | in[3] << 6, 500 in[2], in[2]); 501 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 502 _mm256_storeu_si256((__m256i*)(out), results); 503 out += 8; 504 505 // shift the third 8 outs 506 reg_shifts = _mm256_set_epi32(15, 6, 0, 20, 507 11, 2, 0, 16); 508 reg_inls = _mm256_set_epi32(in[6], in[6], 509 in[5] >> 29 | in[6] << 3, in[5], 510 in[5], in[5], 511 in[4] >> 25 | in[5] << 7, in[4]); 512 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 513 _mm256_storeu_si256((__m256i*)(out), results); 514 out += 8; 515 516 // shift the last 8 outs 517 reg_shifts = _mm256_set_epi32(23, 14, 5, 0, 518 19, 10, 1, 0); 519 reg_inls = _mm256_set_epi32(in[8], in[8], 520 in[8], in[7] >> 28 | in[8] << 4, 521 in[7], in[7], 522 in[7], in[6] >> 24 | in[7] << 8); 523 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 524 _mm256_storeu_si256((__m256i*)(out), results); 525 out += 8; 526 527 in += 9; 528 529 return in; 530 } 531 532 inline static const uint32_t* unpack10_32_avx2(const uint32_t* in, uint32_t* out) { 533 uint32_t mask = 0x3ff; 534 __m256i reg_shifts, reg_inls, reg_masks; 535 __m256i results; 536 537 reg_masks = _mm256_set1_epi32(mask); 538 539 // shift the first 8 outs 540 reg_shifts = _mm256_set_epi32(6, 0, 18, 8, 541 0, 20, 10, 0); 542 reg_inls = _mm256_set_epi32(in[2], in[1] >> 28 | in[2] << 4, 543 in[1], in[1], 544 in[0] >> 30 | in[1] << 2, in[0], 545 in[0], in[0]); 546 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 547 _mm256_storeu_si256((__m256i*)(out), results); 548 out += 8; 549 550 // shift the second 8 outs 551 reg_shifts = _mm256_set_epi32(22, 12, 2, 0, 552 14, 4, 0, 16); 553 reg_inls = _mm256_set_epi32(in[4], in[4], 554 in[4], in[3] >> 24 | in[4] << 8, 555 in[3], in[3], 556 in[2] >> 26 | in[3] << 6, in[2]); 557 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 558 _mm256_storeu_si256((__m256i*)(out), results); 559 out += 8; 560 561 // shift the third 8 outs 562 reg_shifts = _mm256_set_epi32(6, 0, 18, 8, 563 0, 20, 10, 0); 564 reg_inls = 
_mm256_set_epi32(in[7], in[6] >> 28 | in[7] << 4, 565 in[6], in[6], 566 in[5] >> 30 | in[6] << 2, in[5], 567 in[5], in[5]); 568 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 569 _mm256_storeu_si256((__m256i*)(out), results); 570 out += 8; 571 572 // shift the last 8 outs 573 reg_shifts = _mm256_set_epi32(22, 12, 2, 0, 574 14, 4, 0, 16); 575 reg_inls = _mm256_set_epi32(in[9], in[9], 576 in[9], in[8] >> 24 | in[9] << 8, 577 in[8], in[8], 578 in[7] >> 26 | in[8] << 6, in[7]); 579 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 580 _mm256_storeu_si256((__m256i*)(out), results); 581 out += 8; 582 583 in += 10; 584 585 return in; 586 } 587 588 inline static const uint32_t* unpack11_32_avx2(const uint32_t* in, uint32_t* out) { 589 uint32_t mask = 0x7ff; 590 __m256i reg_shifts, reg_inls, reg_masks; 591 __m256i results; 592 593 reg_masks = _mm256_set1_epi32(mask); 594 595 // shift the first 8 outs 596 reg_shifts = _mm256_set_epi32(13, 2, 0, 12, 597 1, 0, 11, 0); 598 reg_inls = _mm256_set_epi32(in[2], in[2], 599 in[1] >> 23 | in[2] << 9, in[1], 600 in[1], in[0] >> 22 | in[1] << 10, 601 in[0], in[0]); 602 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 603 _mm256_storeu_si256((__m256i*)(out), results); 604 out += 8; 605 606 // shift the second 8 outs 607 reg_shifts = _mm256_set_epi32(5, 0, 15, 4, 608 0, 14, 3, 0); 609 reg_inls = _mm256_set_epi32(in[5], in[4] >> 26 | in[5] << 6, 610 in[4], in[4], 611 in[3] >> 25 | in[4] << 7, in[3], 612 in[3], in[2] >> 24 | in[3] << 8); 613 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 614 _mm256_storeu_si256((__m256i*)(out), results); 615 out += 8; 616 617 // shift the third 8 outs 618 reg_shifts = _mm256_set_epi32(0, 18, 7, 0, 619 17, 6, 0, 16); 620 reg_inls = _mm256_set_epi32(in[7] >> 29 | in[8] << 3, in[7], 621 in[7], in[6] >> 28 | in[7] << 4, 622 in[6], in[6], 623 in[5] >> 27 | in[6] << 5, in[5]); 624 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 625 _mm256_storeu_si256((__m256i*)(out), results); 626 out += 8; 627 628 // shift the last 8 outs 629 reg_shifts = _mm256_set_epi32(21, 10, 0, 20, 630 9, 0, 19, 8); 631 reg_inls = _mm256_set_epi32(in[10], in[10], 632 in[9] >> 31 | in[10] << 1, in[9], 633 in[9], in[8] >> 30 | in[9] << 2, 634 in[8], in[8]); 635 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 636 _mm256_storeu_si256((__m256i*)(out), results); 637 out += 8; 638 639 in += 11; 640 641 return in; 642 } 643 644 inline static const uint32_t* unpack12_32_avx2(const uint32_t* in, uint32_t* out) { 645 uint32_t mask = 0xfff; 646 __m256i reg_shifts, reg_inls, reg_masks; 647 __m256i results; 648 649 reg_masks = _mm256_set1_epi32(mask); 650 651 // shift the first 8 outs 652 reg_shifts = _mm256_set_epi32(20, 8, 0, 16, 653 4, 0, 12, 0); 654 reg_inls = _mm256_set_epi32(in[2], in[2], 655 in[1] >> 28 | in[2] << 4, in[1], 656 in[1], in[0] >> 24 | in[1] << 8, 657 in[0], in[0]); 658 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 659 _mm256_storeu_si256((__m256i*)(out), results); 660 out += 8; 661 662 // shift the second 8 outs 663 reg_shifts = _mm256_set_epi32(20, 8, 0, 16, 664 4, 0, 12, 0); 665 reg_inls = _mm256_set_epi32(in[5], in[5], 666 in[4] >> 28 | in[5] << 4, in[4], 667 in[4], in[3] >> 24 | in[4] << 8, 668 in[3], in[3]); 669 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 670 _mm256_storeu_si256((__m256i*)(out), 
results); 671 out += 8; 672 673 // shift the third 8 outs 674 reg_shifts = _mm256_set_epi32(20, 8, 0, 16, 675 4, 0, 12, 0); 676 reg_inls = _mm256_set_epi32(in[8], in[8], 677 in[7] >> 28 | in[8] << 4, in[7], 678 in[7], in[6] >> 24 | in[7] << 8, 679 in[6], in[6]); 680 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 681 _mm256_storeu_si256((__m256i*)(out), results); 682 out += 8; 683 684 // shift the last 8 outs 685 reg_shifts = _mm256_set_epi32(20, 8, 0, 16, 686 4, 0, 12, 0); 687 reg_inls = _mm256_set_epi32(in[11], in[11], 688 in[10] >> 28 | in[11] << 4, in[10], 689 in[10], in[9] >> 24 | in[10] << 8, 690 in[9], in[9]); 691 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 692 _mm256_storeu_si256((__m256i*)(out), results); 693 out += 8; 694 695 in += 12; 696 697 return in; 698 } 699 700 inline static const uint32_t* unpack13_32_avx2(const uint32_t* in, uint32_t* out) { 701 uint32_t mask = 0x1fff; 702 __m256i reg_shifts, reg_inls, reg_masks; 703 __m256i results; 704 705 reg_masks = _mm256_set1_epi32(mask); 706 707 // shift the first 8 outs 708 reg_shifts = _mm256_set_epi32(0, 14, 1, 0, 709 7, 0, 13, 0); 710 reg_inls = _mm256_set_epi32(in[2] >> 27 | in[3] << 5, in[2], 711 in[2], in[1] >> 20 | in[2] << 12, 712 in[1], in[0] >> 26 | in[1] << 6, 713 in[0], in[0]); 714 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 715 _mm256_storeu_si256((__m256i*)(out), results); 716 out += 8; 717 718 // shift the second 8 outs 719 reg_shifts = _mm256_set_epi32(3, 0, 9, 0, 720 15, 2, 0, 8); 721 reg_inls = _mm256_set_epi32(in[6], in[5] >> 22 | in[6] << 10, 722 in[5], in[4] >> 28 | in[5] << 4, 723 in[4], in[4], 724 in[3] >> 21 | in[4] << 11, in[3]); 725 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 726 _mm256_storeu_si256((__m256i*)(out), results); 727 out += 8; 728 729 // shift the third 8 outs 730 reg_shifts = _mm256_set_epi32(11, 0, 17, 4, 731 0, 10, 0, 16); 732 reg_inls = _mm256_set_epi32(in[9], in[8] >> 30 | in[9] << 2, 733 in[8], in[8], 734 in[7] >> 23 | in[8] << 9, in[7], 735 in[6] >> 29 | in[7] << 3, in[6]); 736 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 737 _mm256_storeu_si256((__m256i*)(out), results); 738 out += 8; 739 740 // shift the last 8 outs 741 reg_shifts = _mm256_set_epi32(19, 6, 0, 12, 742 0, 18, 5, 0); 743 reg_inls = _mm256_set_epi32(in[12], in[12], 744 in[11] >> 25 | in[12] << 7, in[11], 745 in[10] >> 31 | in[11] << 1, in[10], 746 in[10], in[9] >> 24 | in[10] << 8); 747 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 748 _mm256_storeu_si256((__m256i*)(out), results); 749 out += 8; 750 751 in += 13; 752 753 return in; 754 } 755 756 inline static const uint32_t* unpack14_32_avx2(const uint32_t* in, uint32_t* out) { 757 uint32_t mask = 0x3fff; 758 __m256i reg_shifts, reg_inls, reg_masks; 759 __m256i results; 760 761 reg_masks = _mm256_set1_epi32(mask); 762 763 // shift the first 8 outs 764 reg_shifts = _mm256_set_epi32(2, 0, 6, 0, 765 10, 0, 14, 0); 766 reg_inls = _mm256_set_epi32(in[3], in[2] >> 20 | in[3] << 12, 767 in[2], in[1] >> 24 | in[2] << 8, 768 in[1], in[0] >> 28 | in[1] << 4, 769 in[0], in[0]); 770 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 771 _mm256_storeu_si256((__m256i*)(out), results); 772 out += 8; 773 774 // shift the second 8 outs 775 reg_shifts = _mm256_set_epi32(18, 4, 0, 8, 776 0, 12, 0, 16); 777 reg_inls = _mm256_set_epi32(in[6], in[6], 778 
in[5] >> 22 | in[6] << 10, in[5], 779 in[4] >> 26 | in[5] << 6, in[4], 780 in[3] >> 30 | in[4] << 2, in[3]); 781 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 782 _mm256_storeu_si256((__m256i*)(out), results); 783 out += 8; 784 785 // shift the third 8 outs 786 reg_shifts = _mm256_set_epi32(2, 0, 6, 0, 787 10, 0, 14, 0); 788 reg_inls = _mm256_set_epi32(in[10], in[9] >> 20 | in[10] << 12, 789 in[9], in[8] >> 24 | in[9] << 8, 790 in[8], in[7] >> 28 | in[8] << 4, 791 in[7], in[7]); 792 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 793 _mm256_storeu_si256((__m256i*)(out), results); 794 out += 8; 795 796 // shift the last 8 outs 797 reg_shifts = _mm256_set_epi32(18, 4, 0, 8, 798 0, 12, 0, 16); 799 reg_inls = _mm256_set_epi32(in[13], in[13], 800 in[12] >> 22 | in[13] << 10, in[12], 801 in[11] >> 26 | in[12] << 6, in[11], 802 in[10] >> 30 | in[11] << 2, in[10]); 803 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 804 _mm256_storeu_si256((__m256i*)(out), results); 805 out += 8; 806 807 in += 14; 808 809 return in; 810 } 811 812 inline static const uint32_t* unpack15_32_avx2(const uint32_t* in, uint32_t* out) { 813 uint32_t mask = 0x7fff; 814 __m256i reg_shifts, reg_inls, reg_masks; 815 __m256i results; 816 817 reg_masks = _mm256_set1_epi32(mask); 818 819 // shift the first 8 outs 820 reg_shifts = _mm256_set_epi32(9, 0, 11, 0, 821 13, 0, 15, 0); 822 reg_inls = _mm256_set_epi32(in[3], in[2] >> 26 | in[3] << 6, 823 in[2], in[1] >> 28 | in[2] << 4, 824 in[1], in[0] >> 30 | in[1] << 2, 825 in[0], in[0]); 826 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 827 _mm256_storeu_si256((__m256i*)(out), results); 828 out += 8; 829 830 // shift the second 8 outs 831 reg_shifts = _mm256_set_epi32(1, 0, 3, 0, 832 5, 0, 7, 0); 833 reg_inls = _mm256_set_epi32(in[7], in[6] >> 18 | in[7] << 14, 834 in[6], in[5] >> 20 | in[6] << 12, 835 in[5], in[4] >> 22 | in[5] << 10, 836 in[4], in[3] >> 24 | in[4] << 8); 837 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 838 _mm256_storeu_si256((__m256i*)(out), results); 839 out += 8; 840 841 // shift the third 8 outs 842 reg_shifts = _mm256_set_epi32(0, 10, 0, 12, 843 0, 14, 0, 16); 844 reg_inls = _mm256_set_epi32(in[10] >> 25 | in[11] << 7, in[10], 845 in[9] >> 27 | in[10] << 5, in[9], 846 in[8] >> 29 | in[9] << 3, in[8], 847 in[7] >> 31 | in[8] << 1, in[7]); 848 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 849 _mm256_storeu_si256((__m256i*)(out), results); 850 out += 8; 851 852 // shift the last 8 outs 853 reg_shifts = _mm256_set_epi32(17, 2, 0, 4, 854 0, 6, 0, 8); 855 reg_inls = _mm256_set_epi32(in[14], in[14], 856 in[13] >> 19 | in[14] << 13, in[13], 857 in[12] >> 21 | in[13] << 11, in[12], 858 in[11] >> 23 | in[12] << 9, in[11]); 859 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 860 _mm256_storeu_si256((__m256i*)(out), results); 861 out += 8; 862 863 in += 15; 864 865 return in; 866 } 867 868 inline static const uint32_t* unpack16_32_avx2(const uint32_t* in, uint32_t* out) { 869 uint32_t mask = 0xffff; 870 __m256i reg_shifts, reg_inls, reg_masks; 871 __m256i results; 872 873 reg_masks = _mm256_set1_epi32(mask); 874 875 // shift the first 8 outs 876 reg_shifts = _mm256_set_epi32(16, 0, 16, 0, 877 16, 0, 16, 0); 878 reg_inls = _mm256_set_epi32(in[3], in[3], 879 in[2], in[2], 880 in[1], in[1], 881 in[0], in[0]); 882 results = 
_mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 883 _mm256_storeu_si256((__m256i*)(out), results); 884 out += 8; 885 886 // shift the second 8 outs 887 reg_shifts = _mm256_set_epi32(16, 0, 16, 0, 888 16, 0, 16, 0); 889 reg_inls = _mm256_set_epi32(in[7], in[7], 890 in[6], in[6], 891 in[5], in[5], 892 in[4], in[4]); 893 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 894 _mm256_storeu_si256((__m256i*)(out), results); 895 out += 8; 896 897 // shift the third 8 outs 898 reg_shifts = _mm256_set_epi32(16, 0, 16, 0, 899 16, 0, 16, 0); 900 reg_inls = _mm256_set_epi32(in[11], in[11], 901 in[10], in[10], 902 in[9], in[9], 903 in[8], in[8]); 904 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 905 _mm256_storeu_si256((__m256i*)(out), results); 906 out += 8; 907 908 // shift the last 8 outs 909 reg_shifts = _mm256_set_epi32(16, 0, 16, 0, 910 16, 0, 16, 0); 911 reg_inls = _mm256_set_epi32(in[15], in[15], 912 in[14], in[14], 913 in[13], in[13], 914 in[12], in[12]); 915 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 916 _mm256_storeu_si256((__m256i*)(out), results); 917 out += 8; 918 919 in += 16; 920 921 return in; 922 } 923 924 inline static const uint32_t* unpack17_32_avx2(const uint32_t* in, uint32_t* out) { 925 uint32_t mask = 0x1ffff; 926 __m256i reg_shifts, reg_inls, reg_masks; 927 __m256i results; 928 929 reg_masks = _mm256_set1_epi32(mask); 930 931 // shift the first 8 outs 932 reg_shifts = _mm256_set_epi32(0, 6, 0, 4, 933 0, 2, 0, 0); 934 reg_inls = _mm256_set_epi32(in[3] >> 23 | in[4] << 9, in[3], 935 in[2] >> 21 | in[3] << 11, in[2], 936 in[1] >> 19 | in[2] << 13, in[1], 937 in[0] >> 17 | in[1] << 15, in[0]); 938 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 939 _mm256_storeu_si256((__m256i*)(out), results); 940 out += 8; 941 942 // shift the second 8 outs 943 reg_shifts = _mm256_set_epi32(0, 14, 0, 12, 944 0, 10, 0, 8); 945 reg_inls = _mm256_set_epi32(in[7] >> 31 | in[8] << 1, in[7], 946 in[6] >> 29 | in[7] << 3, in[6], 947 in[5] >> 27 | in[6] << 5, in[5], 948 in[4] >> 25 | in[5] << 7, in[4]); 949 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 950 _mm256_storeu_si256((__m256i*)(out), results); 951 out += 8; 952 953 // shift the third 8 outs 954 reg_shifts = _mm256_set_epi32(7, 0, 5, 0, 955 3, 0, 1, 0); 956 reg_inls = _mm256_set_epi32(in[12], in[11] >> 22 | in[12] << 10, 957 in[11], in[10] >> 20 | in[11] << 12, 958 in[10], in[9] >> 18 | in[10] << 14, 959 in[9], in[8] >> 16 | in[9] << 16); 960 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 961 _mm256_storeu_si256((__m256i*)(out), results); 962 out += 8; 963 964 // shift the last 8 outs 965 reg_shifts = _mm256_set_epi32(15, 0, 13, 0, 966 11, 0, 9, 0); 967 reg_inls = _mm256_set_epi32(in[16], in[15] >> 30 | in[16] << 2, 968 in[15], in[14] >> 28 | in[15] << 4, 969 in[14], in[13] >> 26 | in[14] << 6, 970 in[13], in[12] >> 24 | in[13] << 8); 971 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 972 _mm256_storeu_si256((__m256i*)(out), results); 973 out += 8; 974 975 in += 17; 976 977 return in; 978 } 979 980 inline static const uint32_t* unpack18_32_avx2(const uint32_t* in, uint32_t* out) { 981 uint32_t mask = 0x3ffff; 982 __m256i reg_shifts, reg_inls, reg_masks; 983 __m256i results; 984 985 reg_masks = _mm256_set1_epi32(mask); 986 987 // shift the first 8 outs 988 reg_shifts = _mm256_set_epi32(0, 12, 0, 
8, 989 0, 4, 0, 0); 990 reg_inls = _mm256_set_epi32(in[3] >> 30 | in[4] << 2, in[3], 991 in[2] >> 26 | in[3] << 6, in[2], 992 in[1] >> 22 | in[2] << 10, in[1], 993 in[0] >> 18 | in[1] << 14, in[0]); 994 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 995 _mm256_storeu_si256((__m256i*)(out), results); 996 out += 8; 997 998 // shift the second 8 outs 999 reg_shifts = _mm256_set_epi32(14, 0, 10, 0, 1000 6, 0, 2, 0); 1001 reg_inls = _mm256_set_epi32(in[8], in[7] >> 28 | in[8] << 4, 1002 in[7], in[6] >> 24 | in[7] << 8, 1003 in[6], in[5] >> 20 | in[6] << 12, 1004 in[5], in[4] >> 16 | in[5] << 16); 1005 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1006 _mm256_storeu_si256((__m256i*)(out), results); 1007 out += 8; 1008 1009 // shift the third 8 outs 1010 reg_shifts = _mm256_set_epi32(0, 12, 0, 8, 1011 0, 4, 0, 0); 1012 reg_inls = _mm256_set_epi32(in[12] >> 30 | in[13] << 2, in[12], 1013 in[11] >> 26 | in[12] << 6, in[11], 1014 in[10] >> 22 | in[11] << 10, in[10], 1015 in[9] >> 18 | in[10] << 14, in[9]); 1016 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1017 _mm256_storeu_si256((__m256i*)(out), results); 1018 out += 8; 1019 1020 // shift the last 8 outs 1021 reg_shifts = _mm256_set_epi32(14, 0, 10, 0, 1022 6, 0, 2, 0); 1023 reg_inls = _mm256_set_epi32(in[17], in[16] >> 28 | in[17] << 4, 1024 in[16], in[15] >> 24 | in[16] << 8, 1025 in[15], in[14] >> 20 | in[15] << 12, 1026 in[14], in[13] >> 16 | in[14] << 16); 1027 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1028 _mm256_storeu_si256((__m256i*)(out), results); 1029 out += 8; 1030 1031 in += 18; 1032 1033 return in; 1034 } 1035 1036 inline static const uint32_t* unpack19_32_avx2(const uint32_t* in, uint32_t* out) { 1037 uint32_t mask = 0x7ffff; 1038 __m256i reg_shifts, reg_inls, reg_masks; 1039 __m256i results; 1040 1041 reg_masks = _mm256_set1_epi32(mask); 1042 1043 // shift the first 8 outs 1044 reg_shifts = _mm256_set_epi32(5, 0, 0, 12, 1045 0, 6, 0, 0); 1046 reg_inls = _mm256_set_epi32(in[4], in[3] >> 18 | in[4] << 14, 1047 in[2] >> 31 | in[3] << 1, in[2], 1048 in[1] >> 25 | in[2] << 7, in[1], 1049 in[0] >> 19 | in[1] << 13, in[0]); 1050 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1051 _mm256_storeu_si256((__m256i*)(out), results); 1052 out += 8; 1053 1054 // shift the second 8 outs 1055 reg_shifts = _mm256_set_epi32(0, 10, 0, 4, 1056 0, 0, 11, 0); 1057 reg_inls = _mm256_set_epi32(in[8] >> 29 | in[9] << 3, in[8], 1058 in[7] >> 23 | in[8] << 9, in[7], 1059 in[6] >> 17 | in[7] << 15, in[5] >> 30 | in[6] << 2, 1060 in[5], in[4] >> 24 | in[5] << 8); 1061 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1062 _mm256_storeu_si256((__m256i*)(out), results); 1063 out += 8; 1064 1065 // shift the third 8 outs 1066 reg_shifts = _mm256_set_epi32(0, 2, 0, 0, 1067 9, 0, 3, 0); 1068 reg_inls = _mm256_set_epi32(in[13] >> 21 | in[14] << 11, in[13], 1069 in[12] >> 15 | in[13] << 17, in[11] >> 28 | in[12] << 4, 1070 in[11], in[10] >> 22 | in[11] << 10, 1071 in[10], in[9] >> 16 | in[10] << 16); 1072 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1073 _mm256_storeu_si256((__m256i*)(out), results); 1074 out += 8; 1075 1076 // shift the last 8 outs 1077 reg_shifts = _mm256_set_epi32(13, 0, 7, 0, 1078 1, 0, 0, 8); 1079 reg_inls = _mm256_set_epi32(in[18], in[17] >> 26 | in[18] << 6, 1080 in[17], in[16] >> 20 | in[17] << 12, 1081 in[16], 
in[15] >> 14 | in[16] << 18, 1082 in[14] >> 27 | in[15] << 5, in[14]); 1083 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1084 _mm256_storeu_si256((__m256i*)(out), results); 1085 out += 8; 1086 1087 in += 19; 1088 1089 return in; 1090 } 1091 1092 inline static const uint32_t* unpack20_32_avx2(const uint32_t* in, uint32_t* out) { 1093 uint32_t mask = 0xfffff; 1094 __m256i reg_shifts, reg_inls, reg_masks; 1095 __m256i results; 1096 1097 reg_masks = _mm256_set1_epi32(mask); 1098 1099 // shift the first 8 outs 1100 reg_shifts = _mm256_set_epi32(12, 0, 4, 0, 1101 0, 8, 0, 0); 1102 reg_inls = _mm256_set_epi32(in[4], in[3] >> 24 | in[4] << 8, 1103 in[3], in[2] >> 16 | in[3] << 16, 1104 in[1] >> 28 | in[2] << 4, in[1], 1105 in[0] >> 20 | in[1] << 12, in[0]); 1106 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1107 _mm256_storeu_si256((__m256i*)(out), results); 1108 out += 8; 1109 1110 // shift the second 8 outs 1111 reg_shifts = _mm256_set_epi32(12, 0, 4, 0, 1112 0, 8, 0, 0); 1113 reg_inls = _mm256_set_epi32(in[9], in[8] >> 24 | in[9] << 8, 1114 in[8], in[7] >> 16 | in[8] << 16, 1115 in[6] >> 28 | in[7] << 4, in[6], 1116 in[5] >> 20 | in[6] << 12, in[5]); 1117 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1118 _mm256_storeu_si256((__m256i*)(out), results); 1119 out += 8; 1120 1121 // shift the third 8 outs 1122 reg_shifts = _mm256_set_epi32(12, 0, 4, 0, 1123 0, 8, 0, 0); 1124 reg_inls = _mm256_set_epi32(in[14], in[13] >> 24 | in[14] << 8, 1125 in[13], in[12] >> 16 | in[13] << 16, 1126 in[11] >> 28 | in[12] << 4, in[11], 1127 in[10] >> 20 | in[11] << 12, in[10]); 1128 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1129 _mm256_storeu_si256((__m256i*)(out), results); 1130 out += 8; 1131 1132 // shift the last 8 outs 1133 reg_shifts = _mm256_set_epi32(12, 0, 4, 0, 1134 0, 8, 0, 0); 1135 reg_inls = _mm256_set_epi32(in[19], in[18] >> 24 | in[19] << 8, 1136 in[18], in[17] >> 16 | in[18] << 16, 1137 in[16] >> 28 | in[17] << 4, in[16], 1138 in[15] >> 20 | in[16] << 12, in[15]); 1139 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1140 _mm256_storeu_si256((__m256i*)(out), results); 1141 out += 8; 1142 1143 in += 20; 1144 1145 return in; 1146 } 1147 1148 inline static const uint32_t* unpack21_32_avx2(const uint32_t* in, uint32_t* out) { 1149 uint32_t mask = 0x1fffff; 1150 __m256i reg_shifts, reg_inls, reg_masks; 1151 __m256i results; 1152 1153 reg_masks = _mm256_set1_epi32(mask); 1154 1155 // shift the first 8 outs 1156 reg_shifts = _mm256_set_epi32(0, 0, 9, 0, 1157 0, 10, 0, 0); 1158 reg_inls = _mm256_set_epi32(in[4] >> 19 | in[5] << 13, in[3] >> 30 | in[4] << 2, 1159 in[3], in[2] >> 20 | in[3] << 12, 1160 in[1] >> 31 | in[2] << 1, in[1], 1161 in[0] >> 21 | in[1] << 11, in[0]); 1162 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1163 _mm256_storeu_si256((__m256i*)(out), results); 1164 out += 8; 1165 1166 // shift the second 8 outs 1167 reg_shifts = _mm256_set_epi32(0, 6, 0, 0, 1168 7, 0, 0, 8); 1169 reg_inls = _mm256_set_epi32(in[9] >> 27 | in[10] << 5, in[9], 1170 in[8] >> 17 | in[9] << 15, in[7] >> 28 | in[8] << 4, 1171 in[7], in[6] >> 18 | in[7] << 14, 1172 in[5] >> 29 | in[6] << 3, in[5]); 1173 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1174 _mm256_storeu_si256((__m256i*)(out), results); 1175 out += 8; 1176 1177 // shift the third 8 outs 1178 reg_shifts = 
_mm256_set_epi32(3, 0, 0, 4, 1179 0, 0, 5, 0); 1180 reg_inls = _mm256_set_epi32(in[15], in[14] >> 14 | in[15] << 18, 1181 in[13] >> 25 | in[14] << 7, in[13], 1182 in[12] >> 15 | in[13] << 17, in[11] >> 26 | in[12] << 6, 1183 in[11], in[10] >> 16 | in[11] << 16); 1184 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1185 _mm256_storeu_si256((__m256i*)(out), results); 1186 out += 8; 1187 1188 // shift the last 8 outs 1189 reg_shifts = _mm256_set_epi32(11, 0, 1, 0, 1190 0, 2, 0, 0); 1191 reg_inls = _mm256_set_epi32(in[20], in[19] >> 22 | in[20] << 10, 1192 in[19], in[18] >> 12 | in[19] << 20, 1193 in[17] >> 23 | in[18] << 9, in[17], 1194 in[16] >> 13 | in[17] << 19, in[15] >> 24 | in[16] << 8); 1195 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1196 _mm256_storeu_si256((__m256i*)(out), results); 1197 out += 8; 1198 1199 in += 21; 1200 1201 return in; 1202 } 1203 1204 inline static const uint32_t* unpack22_32_avx2(const uint32_t* in, uint32_t* out) { 1205 uint32_t mask = 0x3fffff; 1206 __m256i reg_shifts, reg_inls, reg_masks; 1207 __m256i results; 1208 1209 reg_masks = _mm256_set1_epi32(mask); 1210 1211 // shift the first 8 outs 1212 reg_shifts = _mm256_set_epi32(0, 4, 0, 0, 1213 2, 0, 0, 0); 1214 reg_inls = _mm256_set_epi32(in[4] >> 26 | in[5] << 6, in[4], 1215 in[3] >> 14 | in[4] << 18, in[2] >> 24 | in[3] << 8, 1216 in[2], in[1] >> 12 | in[2] << 20, 1217 in[0] >> 22 | in[1] << 10, in[0]); 1218 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1219 _mm256_storeu_si256((__m256i*)(out), results); 1220 out += 8; 1221 1222 // shift the second 8 outs 1223 reg_shifts = _mm256_set_epi32(10, 0, 0, 8, 1224 0, 0, 6, 0); 1225 reg_inls = _mm256_set_epi32(in[10], in[9] >> 20 | in[10] << 12, 1226 in[8] >> 30 | in[9] << 2, in[8], 1227 in[7] >> 18 | in[8] << 14, in[6] >> 28 | in[7] << 4, 1228 in[6], in[5] >> 16 | in[6] << 16); 1229 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1230 _mm256_storeu_si256((__m256i*)(out), results); 1231 out += 8; 1232 1233 // shift the third 8 outs 1234 reg_shifts = _mm256_set_epi32(0, 4, 0, 0, 1235 2, 0, 0, 0); 1236 reg_inls = _mm256_set_epi32(in[15] >> 26 | in[16] << 6, in[15], 1237 in[14] >> 14 | in[15] << 18, in[13] >> 24 | in[14] << 8, 1238 in[13], in[12] >> 12 | in[13] << 20, 1239 in[11] >> 22 | in[12] << 10, in[11]); 1240 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1241 _mm256_storeu_si256((__m256i*)(out), results); 1242 out += 8; 1243 1244 // shift the last 8 outs 1245 reg_shifts = _mm256_set_epi32(10, 0, 0, 8, 1246 0, 0, 6, 0); 1247 reg_inls = _mm256_set_epi32(in[21], in[20] >> 20 | in[21] << 12, 1248 in[19] >> 30 | in[20] << 2, in[19], 1249 in[18] >> 18 | in[19] << 14, in[17] >> 28 | in[18] << 4, 1250 in[17], in[16] >> 16 | in[17] << 16); 1251 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1252 _mm256_storeu_si256((__m256i*)(out), results); 1253 out += 8; 1254 1255 in += 22; 1256 1257 return in; 1258 } 1259 1260 inline static const uint32_t* unpack23_32_avx2(const uint32_t* in, uint32_t* out) { 1261 uint32_t mask = 0x7fffff; 1262 __m256i reg_shifts, reg_inls, reg_masks; 1263 __m256i results; 1264 1265 reg_masks = _mm256_set1_epi32(mask); 1266 1267 // shift the first 8 outs 1268 reg_shifts = _mm256_set_epi32(1, 0, 0, 0, 1269 5, 0, 0, 0); 1270 reg_inls = _mm256_set_epi32(in[5], in[4] >> 10 | in[5] << 22, 1271 in[3] >> 19 | in[4] << 13, in[2] >> 28 | in[3] << 4, 1272 in[2], 
in[1] >> 14 | in[2] << 18, 1273 in[0] >> 23 | in[1] << 9, in[0]); 1274 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1275 _mm256_storeu_si256((__m256i*)(out), results); 1276 out += 8; 1277 1278 // shift the second 8 outs 1279 reg_shifts = _mm256_set_epi32(0, 2, 0, 0, 1280 0, 6, 0, 0); 1281 reg_inls = _mm256_set_epi32(in[10] >> 25 | in[11] << 7, in[10], 1282 in[9] >> 11 | in[10] << 21, in[8] >> 20 | in[9] << 12, 1283 in[7] >> 29 | in[8] << 3, in[7], 1284 in[6] >> 15 | in[7] << 17, in[5] >> 24 | in[6] << 8); 1285 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1286 _mm256_storeu_si256((__m256i*)(out), results); 1287 out += 8; 1288 1289 // shift the third 8 outs 1290 reg_shifts = _mm256_set_epi32(0, 0, 3, 0, 1291 0, 0, 7, 0); 1292 reg_inls = _mm256_set_epi32(in[16] >> 17 | in[17] << 15, in[15] >> 26 | in[16] << 6, 1293 in[15], in[14] >> 12 | in[15] << 20, 1294 in[13] >> 21 | in[14] << 11, in[12] >> 30 | in[13] << 2, 1295 in[12], in[11] >> 16 | in[12] << 16); 1296 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1297 _mm256_storeu_si256((__m256i*)(out), results); 1298 out += 8; 1299 1300 // shift the last 8 outs 1301 reg_shifts = _mm256_set_epi32(9, 0, 0, 4, 1302 0, 0, 0, 8); 1303 reg_inls = _mm256_set_epi32(in[22], in[21] >> 18 | in[22] << 14, 1304 in[20] >> 27 | in[21] << 5, in[20], 1305 in[19] >> 13 | in[20] << 19, in[18] >> 22 | in[19] << 10, 1306 in[17] >> 31 | in[18] << 1, in[17]); 1307 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1308 _mm256_storeu_si256((__m256i*)(out), results); 1309 out += 8; 1310 1311 in += 23; 1312 1313 return in; 1314 } 1315 1316 inline static const uint32_t* unpack24_32_avx2(const uint32_t* in, uint32_t* out) { 1317 uint32_t mask = 0xffffff; 1318 __m256i reg_shifts, reg_inls, reg_masks; 1319 __m256i results; 1320 1321 reg_masks = _mm256_set1_epi32(mask); 1322 1323 // shift the first 8 outs 1324 reg_shifts = _mm256_set_epi32(8, 0, 0, 0, 1325 8, 0, 0, 0); 1326 reg_inls = _mm256_set_epi32(in[5], in[4] >> 16 | in[5] << 16, 1327 in[3] >> 24 | in[4] << 8, in[3], 1328 in[2], in[1] >> 16 | in[2] << 16, 1329 in[0] >> 24 | in[1] << 8, in[0]); 1330 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1331 _mm256_storeu_si256((__m256i*)(out), results); 1332 out += 8; 1333 1334 // shift the second 8 outs 1335 reg_shifts = _mm256_set_epi32(8, 0, 0, 0, 1336 8, 0, 0, 0); 1337 reg_inls = _mm256_set_epi32(in[11], in[10] >> 16 | in[11] << 16, 1338 in[9] >> 24 | in[10] << 8, in[9], 1339 in[8], in[7] >> 16 | in[8] << 16, 1340 in[6] >> 24 | in[7] << 8, in[6]); 1341 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1342 _mm256_storeu_si256((__m256i*)(out), results); 1343 out += 8; 1344 1345 // shift the third 8 outs 1346 reg_shifts = _mm256_set_epi32(8, 0, 0, 0, 1347 8, 0, 0, 0); 1348 reg_inls = _mm256_set_epi32(in[17], in[16] >> 16 | in[17] << 16, 1349 in[15] >> 24 | in[16] << 8, in[15], 1350 in[14], in[13] >> 16 | in[14] << 16, 1351 in[12] >> 24 | in[13] << 8, in[12]); 1352 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1353 _mm256_storeu_si256((__m256i*)(out), results); 1354 out += 8; 1355 1356 // shift the last 8 outs 1357 reg_shifts = _mm256_set_epi32(8, 0, 0, 0, 1358 8, 0, 0, 0); 1359 reg_inls = _mm256_set_epi32(in[23], in[22] >> 16 | in[23] << 16, 1360 in[21] >> 24 | in[22] << 8, in[21], 1361 in[20], in[19] >> 16 | in[20] << 16, 1362 in[18] >> 24 | in[19] << 8, 
in[18]); 1363 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1364 _mm256_storeu_si256((__m256i*)(out), results); 1365 out += 8; 1366 1367 in += 24; 1368 1369 return in; 1370 } 1371 1372 inline static const uint32_t* unpack25_32_avx2(const uint32_t* in, uint32_t* out) { 1373 uint32_t mask = 0x1ffffff; 1374 __m256i reg_shifts, reg_inls, reg_masks; 1375 __m256i results; 1376 1377 reg_masks = _mm256_set1_epi32(mask); 1378 1379 // shift the first 8 outs 1380 reg_shifts = _mm256_set_epi32(0, 0, 0, 4, 1381 0, 0, 0, 0); 1382 reg_inls = _mm256_set_epi32(in[5] >> 15 | in[6] << 17, in[4] >> 22 | in[5] << 10, 1383 in[3] >> 29 | in[4] << 3, in[3], 1384 in[2] >> 11 | in[3] << 21, in[1] >> 18 | in[2] << 14, 1385 in[0] >> 25 | in[1] << 7, in[0]); 1386 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1387 _mm256_storeu_si256((__m256i*)(out), results); 1388 out += 8; 1389 1390 // shift the second 8 outs 1391 reg_shifts = _mm256_set_epi32(0, 0, 5, 0, 1392 0, 0, 1, 0); 1393 reg_inls = _mm256_set_epi32(in[11] >> 23 | in[12] << 9, in[10] >> 30 | in[11] << 2, 1394 in[10], in[9] >> 12 | in[10] << 20, 1395 in[8] >> 19 | in[9] << 13, in[7] >> 26 | in[8] << 6, 1396 in[7], in[6] >> 8 | in[7] << 24); 1397 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1398 _mm256_storeu_si256((__m256i*)(out), results); 1399 out += 8; 1400 1401 // shift the third 8 outs 1402 reg_shifts = _mm256_set_epi32(0, 6, 0, 0, 1403 0, 2, 0, 0); 1404 reg_inls = _mm256_set_epi32(in[17] >> 31 | in[18] << 1, in[17], 1405 in[16] >> 13 | in[17] << 19, in[15] >> 20 | in[16] << 12, 1406 in[14] >> 27 | in[15] << 5, in[14], 1407 in[13] >> 9 | in[14] << 23, in[12] >> 16 | in[13] << 16); 1408 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1409 _mm256_storeu_si256((__m256i*)(out), results); 1410 out += 8; 1411 1412 // shift the last 8 outs 1413 reg_shifts = _mm256_set_epi32(7, 0, 0, 0, 1414 3, 0, 0, 0); 1415 reg_inls = _mm256_set_epi32(in[24], in[23] >> 14 | in[24] << 18, 1416 in[22] >> 21 | in[23] << 11, in[21] >> 28 | in[22] << 4, 1417 in[21], in[20] >> 10 | in[21] << 22, 1418 in[19] >> 17 | in[20] << 15, in[18] >> 24 | in[19] << 8); 1419 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1420 _mm256_storeu_si256((__m256i*)(out), results); 1421 out += 8; 1422 1423 in += 25; 1424 1425 return in; 1426 } 1427 1428 inline static const uint32_t* unpack26_32_avx2(const uint32_t* in, uint32_t* out) { 1429 uint32_t mask = 0x3ffffff; 1430 __m256i reg_shifts, reg_inls, reg_masks; 1431 __m256i results; 1432 1433 reg_masks = _mm256_set1_epi32(mask); 1434 1435 // shift the first 8 outs 1436 reg_shifts = _mm256_set_epi32(0, 0, 2, 0, 1437 0, 0, 0, 0); 1438 reg_inls = _mm256_set_epi32(in[5] >> 22 | in[6] << 10, in[4] >> 28 | in[5] << 4, 1439 in[4], in[3] >> 8 | in[4] << 24, 1440 in[2] >> 14 | in[3] << 18, in[1] >> 20 | in[2] << 12, 1441 in[0] >> 26 | in[1] << 6, in[0]); 1442 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1443 _mm256_storeu_si256((__m256i*)(out), results); 1444 out += 8; 1445 1446 // shift the second 8 outs 1447 reg_shifts = _mm256_set_epi32(6, 0, 0, 0, 1448 0, 4, 0, 0); 1449 reg_inls = _mm256_set_epi32(in[12], in[11] >> 12 | in[12] << 20, 1450 in[10] >> 18 | in[11] << 14, in[9] >> 24 | in[10] << 8, 1451 in[8] >> 30 | in[9] << 2, in[8], 1452 in[7] >> 10 | in[8] << 22, in[6] >> 16 | in[7] << 16); 1453 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), 
reg_masks); 1454 _mm256_storeu_si256((__m256i*)(out), results); 1455 out += 8; 1456 1457 // shift the third 8 outs 1458 reg_shifts = _mm256_set_epi32(0, 0, 2, 0, 1459 0, 0, 0, 0); 1460 reg_inls = _mm256_set_epi32(in[18] >> 22 | in[19] << 10, in[17] >> 28 | in[18] << 4, 1461 in[17], in[16] >> 8 | in[17] << 24, 1462 in[15] >> 14 | in[16] << 18, in[14] >> 20 | in[15] << 12, 1463 in[13] >> 26 | in[14] << 6, in[13]); 1464 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1465 _mm256_storeu_si256((__m256i*)(out), results); 1466 out += 8; 1467 1468 // shift the last 8 outs 1469 reg_shifts = _mm256_set_epi32(6, 0, 0, 0, 1470 0, 4, 0, 0); 1471 reg_inls = _mm256_set_epi32(in[25], in[24] >> 12 | in[25] << 20, 1472 in[23] >> 18 | in[24] << 14, in[22] >> 24 | in[23] << 8, 1473 in[21] >> 30 | in[22] << 2, in[21], 1474 in[20] >> 10 | in[21] << 22, in[19] >> 16 | in[20] << 16); 1475 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1476 _mm256_storeu_si256((__m256i*)(out), results); 1477 out += 8; 1478 1479 in += 26; 1480 1481 return in; 1482 } 1483 1484 inline static const uint32_t* unpack27_32_avx2(const uint32_t* in, uint32_t* out) { 1485 uint32_t mask = 0x7ffffff; 1486 __m256i reg_shifts, reg_inls, reg_masks; 1487 __m256i results; 1488 1489 reg_masks = _mm256_set1_epi32(mask); 1490 1491 // shift the first 8 outs 1492 reg_shifts = _mm256_set_epi32(0, 2, 0, 0, 1493 0, 0, 0, 0); 1494 reg_inls = _mm256_set_epi32(in[5] >> 29 | in[6] << 3, in[5], 1495 in[4] >> 7 | in[5] << 25, in[3] >> 12 | in[4] << 20, 1496 in[2] >> 17 | in[3] << 15, in[1] >> 22 | in[2] << 10, 1497 in[0] >> 27 | in[1] << 5, in[0]); 1498 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1499 _mm256_storeu_si256((__m256i*)(out), results); 1500 out += 8; 1501 1502 // shift the second 8 outs 1503 reg_shifts = _mm256_set_epi32(0, 0, 0, 4, 1504 0, 0, 0, 0); 1505 reg_inls = _mm256_set_epi32(in[12] >> 21 | in[13] << 11, in[11] >> 26 | in[12] << 6, 1506 in[10] >> 31 | in[11] << 1, in[10], 1507 in[9] >> 9 | in[10] << 23, in[8] >> 14 | in[9] << 18, 1508 in[7] >> 19 | in[8] << 13, in[6] >> 24 | in[7] << 8); 1509 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1510 _mm256_storeu_si256((__m256i*)(out), results); 1511 out += 8; 1512 1513 // shift the third 8 outs 1514 reg_shifts = _mm256_set_epi32(0, 0, 0, 0, 1515 1, 0, 0, 0); 1516 reg_inls = _mm256_set_epi32(in[19] >> 13 | in[20] << 19, in[18] >> 18 | in[19] << 14, 1517 in[17] >> 23 | in[18] << 9, in[16] >> 28 | in[17] << 4, 1518 in[16], in[15] >> 6 | in[16] << 26, 1519 in[14] >> 11 | in[15] << 21, in[13] >> 16 | in[14] << 16); 1520 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1521 _mm256_storeu_si256((__m256i*)(out), results); 1522 out += 8; 1523 1524 // shift the last 8 outs 1525 reg_shifts = _mm256_set_epi32(5, 0, 0, 0, 1526 0, 0, 3, 0); 1527 reg_inls = _mm256_set_epi32(in[26], in[25] >> 10 | in[26] << 22, 1528 in[24] >> 15 | in[25] << 17, in[23] >> 20 | in[24] << 12, 1529 in[22] >> 25 | in[23] << 7, in[21] >> 30 | in[22] << 2, 1530 in[21], in[20] >> 8 | in[21] << 24); 1531 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1532 _mm256_storeu_si256((__m256i*)(out), results); 1533 out += 8; 1534 1535 in += 27; 1536 1537 return in; 1538 } 1539 1540 inline static const uint32_t* unpack28_32_avx2(const uint32_t* in, uint32_t* out) { 1541 uint32_t mask = 0xfffffff; 1542 __m256i reg_shifts, reg_inls, reg_masks; 1543 __m256i 
results; 1544 1545 reg_masks = _mm256_set1_epi32(mask); 1546 1547 // shift the first 8 outs 1548 reg_shifts = _mm256_set_epi32(4, 0, 0, 0, 1549 0, 0, 0, 0); 1550 reg_inls = _mm256_set_epi32(in[6], in[5] >> 8 | in[6] << 24, 1551 in[4] >> 12 | in[5] << 20, in[3] >> 16 | in[4] << 16, 1552 in[2] >> 20 | in[3] << 12, in[1] >> 24 | in[2] << 8, 1553 in[0] >> 28 | in[1] << 4, in[0]); 1554 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1555 _mm256_storeu_si256((__m256i*)(out), results); 1556 out += 8; 1557 1558 // shift the second 8 outs 1559 reg_shifts = _mm256_set_epi32(4, 0, 0, 0, 1560 0, 0, 0, 0); 1561 reg_inls = _mm256_set_epi32(in[13], in[12] >> 8 | in[13] << 24, 1562 in[11] >> 12 | in[12] << 20, in[10] >> 16 | in[11] << 16, 1563 in[9] >> 20 | in[10] << 12, in[8] >> 24 | in[9] << 8, 1564 in[7] >> 28 | in[8] << 4, in[7]); 1565 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1566 _mm256_storeu_si256((__m256i*)(out), results); 1567 out += 8; 1568 1569 // shift the third 8 outs 1570 reg_shifts = _mm256_set_epi32(4, 0, 0, 0, 1571 0, 0, 0, 0); 1572 reg_inls = _mm256_set_epi32(in[20], in[19] >> 8 | in[20] << 24, 1573 in[18] >> 12 | in[19] << 20, in[17] >> 16 | in[18] << 16, 1574 in[16] >> 20 | in[17] << 12, in[15] >> 24 | in[16] << 8, 1575 in[14] >> 28 | in[15] << 4, in[14]); 1576 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1577 _mm256_storeu_si256((__m256i*)(out), results); 1578 out += 8; 1579 1580 // shift the last 8 outs 1581 reg_shifts = _mm256_set_epi32(4, 0, 0, 0, 1582 0, 0, 0, 0); 1583 reg_inls = _mm256_set_epi32(in[27], in[26] >> 8 | in[27] << 24, 1584 in[25] >> 12 | in[26] << 20, in[24] >> 16 | in[25] << 16, 1585 in[23] >> 20 | in[24] << 12, in[22] >> 24 | in[23] << 8, 1586 in[21] >> 28 | in[22] << 4, in[21]); 1587 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1588 _mm256_storeu_si256((__m256i*)(out), results); 1589 out += 8; 1590 1591 in += 28; 1592 1593 return in; 1594 } 1595 1596 inline static const uint32_t* unpack29_32_avx2(const uint32_t* in, uint32_t* out) { 1597 uint32_t mask = 0x1fffffff; 1598 __m256i reg_shifts, reg_inls, reg_masks; 1599 __m256i results; 1600 1601 reg_masks = _mm256_set1_epi32(mask); 1602 1603 // shift the first 8 outs 1604 reg_shifts = _mm256_set_epi32(0, 0, 0, 0, 1605 0, 0, 0, 0); 1606 reg_inls = _mm256_set_epi32(in[6] >> 11 | in[7] << 21, in[5] >> 14 | in[6] << 18, 1607 in[4] >> 17 | in[5] << 15, in[3] >> 20 | in[4] << 12, 1608 in[2] >> 23 | in[3] << 9, in[1] >> 26 | in[2] << 6, 1609 in[0] >> 29 | in[1] << 3, in[0]); 1610 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1611 _mm256_storeu_si256((__m256i*)(out), results); 1612 out += 8; 1613 1614 // shift the second 8 outs 1615 reg_shifts = _mm256_set_epi32(0, 0, 0, 0, 1616 0, 2, 0, 0); 1617 reg_inls = _mm256_set_epi32(in[13] >> 19 | in[14] << 13, in[12] >> 22 | in[13] << 10, 1618 in[11] >> 25 | in[12] << 7, in[10] >> 28 | in[11] << 4, 1619 in[9] >> 31 | in[10] << 1, in[9], 1620 in[8] >> 5 | in[9] << 27, in[7] >> 8 | in[8] << 24); 1621 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1622 _mm256_storeu_si256((__m256i*)(out), results); 1623 out += 8; 1624 1625 // shift the third 8 outs 1626 reg_shifts = _mm256_set_epi32(0, 0, 1, 0, 1627 0, 0, 0, 0); 1628 reg_inls = _mm256_set_epi32(in[20] >> 27 | in[21] << 5, in[19] >> 30 | in[20] << 2, 1629 in[19], in[18] >> 4 | in[19] << 28, 1630 in[17] >> 7 | in[18] << 25, in[16] >> 
10 | in[17] << 22, 1631 in[15] >> 13 | in[16] << 19, in[14] >> 16 | in[15] << 16); 1632 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1633 _mm256_storeu_si256((__m256i*)(out), results); 1634 out += 8; 1635 1636 // shift the last 8 outs 1637 reg_shifts = _mm256_set_epi32(3, 0, 0, 0, 1638 0, 0, 0, 0); 1639 reg_inls = _mm256_set_epi32(in[28], in[27] >> 6 | in[28] << 26, 1640 in[26] >> 9 | in[27] << 23, in[25] >> 12 | in[26] << 20, 1641 in[24] >> 15 | in[25] << 17, in[23] >> 18 | in[24] << 14, 1642 in[22] >> 21 | in[23] << 11, in[21] >> 24 | in[22] << 8); 1643 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1644 _mm256_storeu_si256((__m256i*)(out), results); 1645 out += 8; 1646 1647 in += 29; 1648 1649 return in; 1650 } 1651 1652 inline static const uint32_t* unpack30_32_avx2(const uint32_t* in, uint32_t* out) { 1653 uint32_t mask = 0x3fffffff; 1654 __m256i reg_shifts, reg_inls, reg_masks; 1655 __m256i results; 1656 1657 reg_masks = _mm256_set1_epi32(mask); 1658 1659 // shift the first 8 outs 1660 reg_shifts = _mm256_set_epi32(0, 0, 0, 0, 1661 0, 0, 0, 0); 1662 reg_inls = _mm256_set_epi32(in[6] >> 18 | in[7] << 14, in[5] >> 20 | in[6] << 12, 1663 in[4] >> 22 | in[5] << 10, in[3] >> 24 | in[4] << 8, 1664 in[2] >> 26 | in[3] << 6, in[1] >> 28 | in[2] << 4, 1665 in[0] >> 30 | in[1] << 2, in[0]); 1666 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1667 _mm256_storeu_si256((__m256i*)(out), results); 1668 out += 8; 1669 1670 // shift the second 8 outs 1671 reg_shifts = _mm256_set_epi32(2, 0, 0, 0, 1672 0, 0, 0, 0); 1673 reg_inls = _mm256_set_epi32(in[14], in[13] >> 4 | in[14] << 28, 1674 in[12] >> 6 | in[13] << 26, in[11] >> 8 | in[12] << 24, 1675 in[10] >> 10 | in[11] << 22, in[9] >> 12 | in[10] << 20, 1676 in[8] >> 14 | in[9] << 18, in[7] >> 16 | in[8] << 16); 1677 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1678 _mm256_storeu_si256((__m256i*)(out), results); 1679 out += 8; 1680 1681 // shift the third 8 outs 1682 reg_shifts = _mm256_set_epi32(0, 0, 0, 0, 1683 0, 0, 0, 0); 1684 reg_inls = _mm256_set_epi32(in[21] >> 18 | in[22] << 14, in[20] >> 20 | in[21] << 12, 1685 in[19] >> 22 | in[20] << 10, in[18] >> 24 | in[19] << 8, 1686 in[17] >> 26 | in[18] << 6, in[16] >> 28 | in[17] << 4, 1687 in[15] >> 30 | in[16] << 2, in[15]); 1688 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1689 _mm256_storeu_si256((__m256i*)(out), results); 1690 out += 8; 1691 1692 // shift the last 8 outs 1693 reg_shifts = _mm256_set_epi32(2, 0, 0, 0, 1694 0, 0, 0, 0); 1695 reg_inls = _mm256_set_epi32(in[29], in[28] >> 4 | in[29] << 28, 1696 in[27] >> 6 | in[28] << 26, in[26] >> 8 | in[27] << 24, 1697 in[25] >> 10 | in[26] << 22, in[24] >> 12 | in[25] << 20, 1698 in[23] >> 14 | in[24] << 18, in[22] >> 16 | in[23] << 16); 1699 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks); 1700 _mm256_storeu_si256((__m256i*)(out), results); 1701 out += 8; 1702 1703 in += 30; 1704 1705 return in; 1706 } 1707 1708 inline static const uint32_t* unpack31_32_avx2(const uint32_t* in, uint32_t* out) { 1709 uint32_t mask = 0x7fffffff; 1710 __m256i reg_shifts, reg_inls, reg_masks; 1711 __m256i results; 1712 1713 reg_masks = _mm256_set1_epi32(mask); 1714 1715 // shift the first 8 outs 1716 reg_shifts = _mm256_set_epi32(0, 0, 0, 0, 1717 0, 0, 0, 0); 1718 reg_inls = _mm256_set_epi32(in[6] >> 25 | in[7] << 7, in[5] >> 26 | in[6] << 6, 1719 in[4] >> 27 | in[5] 
inline static const uint32_t* unpack31_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7fffffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[6] >> 25 | in[7] << 7, in[5] >> 26 | in[6] << 6,
                              in[4] >> 27 | in[5] << 5, in[3] >> 28 | in[4] << 4,
                              in[2] >> 29 | in[3] << 3, in[1] >> 30 | in[2] << 2,
                              in[0] >> 31 | in[1] << 1, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[14] >> 17 | in[15] << 15, in[13] >> 18 | in[14] << 14,
                              in[12] >> 19 | in[13] << 13, in[11] >> 20 | in[12] << 12,
                              in[10] >> 21 | in[11] << 11, in[9] >> 22 | in[10] << 10,
                              in[8] >> 23 | in[9] << 9, in[7] >> 24 | in[8] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[22] >> 9 | in[23] << 23, in[21] >> 10 | in[22] << 22,
                              in[20] >> 11 | in[21] << 21, in[19] >> 12 | in[20] << 20,
                              in[18] >> 13 | in[19] << 19, in[17] >> 14 | in[18] << 18,
                              in[16] >> 15 | in[17] << 17, in[15] >> 16 | in[16] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(1, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[30], in[29] >> 2 | in[30] << 30,
                              in[28] >> 3 | in[29] << 29, in[27] >> 4 | in[28] << 28,
                              in[26] >> 5 | in[27] << 27, in[25] >> 6 | in[26] << 26,
                              in[24] >> 7 | in[25] << 25, in[23] >> 8 | in[24] << 24);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 31;

  return in;
}

inline const uint32_t* unpack32_32_avx2(const uint32_t* in, uint32_t* out) {
  memcpy(out, in, 32 * sizeof(*out));
  in += 32;
  out += 32;

  return in;
}
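
/*
 * Illustrative scalar sketch (not part of the original file): a reference
 * implementation of the same bit-unpacking the AVX2 kernels above perform,
 * handy for cross-checking their output. The function name and the guard
 * macro are assumptions made for this sketch only.
 */
#ifdef UNPACK32_AVX2_SCALAR_REFERENCE
static void unpack_scalar_reference(const uint32_t* in, uint32_t* out,
                                    int count, int num_bits) {
  uint64_t bit_pos = 0;
  for (int i = 0; i < count; ++i) {
    uint64_t word = bit_pos / 32;    // input word holding the value's low bits
    uint64_t offset = bit_pos % 32;  // bit offset of the value inside that word
    uint64_t value = in[word] >> offset;
    if (offset + (uint64_t)num_bits > 32) {
      // the value straddles a word boundary: stitch in the high bits
      value |= (uint64_t)in[word + 1] << (32 - offset);
    }
    out[i] = num_bits == 32 ? (uint32_t)value
                            : (uint32_t)(value & ((1u << num_bits) - 1u));
    bit_pos += (uint64_t)num_bits;
  }
}
#endif  // UNPACK32_AVX2_SCALAR_REFERENCE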
int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
  batch_size = batch_size / 32 * 32;
  int num_loops = batch_size / 32;

  switch (num_bits) {
    case 0:
      for (int i = 0; i < num_loops; ++i) in = unpack0_32_avx2(in, out + i * 32);
      break;
    case 1:
      for (int i = 0; i < num_loops; ++i) in = unpack1_32_avx2(in, out + i * 32);
      break;
    case 2:
      for (int i = 0; i < num_loops; ++i) in = unpack2_32_avx2(in, out + i * 32);
      break;
    case 3:
      for (int i = 0; i < num_loops; ++i) in = unpack3_32_avx2(in, out + i * 32);
      break;
    case 4:
      for (int i = 0; i < num_loops; ++i) in = unpack4_32_avx2(in, out + i * 32);
      break;
    case 5:
      for (int i = 0; i < num_loops; ++i) in = unpack5_32_avx2(in, out + i * 32);
      break;
    case 6:
      for (int i = 0; i < num_loops; ++i) in = unpack6_32_avx2(in, out + i * 32);
      break;
    case 7:
      for (int i = 0; i < num_loops; ++i) in = unpack7_32_avx2(in, out + i * 32);
      break;
    case 8:
      for (int i = 0; i < num_loops; ++i) in = unpack8_32_avx2(in, out + i * 32);
      break;
    case 9:
      for (int i = 0; i < num_loops; ++i) in = unpack9_32_avx2(in, out + i * 32);
      break;
    case 10:
      for (int i = 0; i < num_loops; ++i) in = unpack10_32_avx2(in, out + i * 32);
      break;
    case 11:
      for (int i = 0; i < num_loops; ++i) in = unpack11_32_avx2(in, out + i * 32);
      break;
    case 12:
      for (int i = 0; i < num_loops; ++i) in = unpack12_32_avx2(in, out + i * 32);
      break;
    case 13:
      for (int i = 0; i < num_loops; ++i) in = unpack13_32_avx2(in, out + i * 32);
      break;
    case 14:
      for (int i = 0; i < num_loops; ++i) in = unpack14_32_avx2(in, out + i * 32);
      break;
    case 15:
      for (int i = 0; i < num_loops; ++i) in = unpack15_32_avx2(in, out + i * 32);
      break;
    case 16:
      for (int i = 0; i < num_loops; ++i) in = unpack16_32_avx2(in, out + i * 32);
      break;
    case 17:
      for (int i = 0; i < num_loops; ++i) in = unpack17_32_avx2(in, out + i * 32);
      break;
    case 18:
      for (int i = 0; i < num_loops; ++i) in = unpack18_32_avx2(in, out + i * 32);
      break;
    case 19:
      for (int i = 0; i < num_loops; ++i) in = unpack19_32_avx2(in, out + i * 32);
      break;
    case 20:
      for (int i = 0; i < num_loops; ++i) in = unpack20_32_avx2(in, out + i * 32);
      break;
    case 21:
      for (int i = 0; i < num_loops; ++i) in = unpack21_32_avx2(in, out + i * 32);
      break;
    case 22:
      for (int i = 0; i < num_loops; ++i) in = unpack22_32_avx2(in, out + i * 32);
      break;
    case 23:
      for (int i = 0; i < num_loops; ++i) in = unpack23_32_avx2(in, out + i * 32);
      break;
    case 24:
      for (int i = 0; i < num_loops; ++i) in = unpack24_32_avx2(in, out + i * 32);
      break;
    case 25:
      for (int i = 0; i < num_loops; ++i) in = unpack25_32_avx2(in, out + i * 32);
      break;
    case 26:
      for (int i = 0; i < num_loops; ++i) in = unpack26_32_avx2(in, out + i * 32);
      break;
    case 27:
      for (int i = 0; i < num_loops; ++i) in = unpack27_32_avx2(in, out + i * 32);
      break;
    case 28:
      for (int i = 0; i < num_loops; ++i) in = unpack28_32_avx2(in, out + i * 32);
      break;
    case 29:
      for (int i = 0; i < num_loops; ++i) in = unpack29_32_avx2(in, out + i * 32);
      break;
    case 30:
      for (int i = 0; i < num_loops; ++i) in = unpack30_32_avx2(in, out + i * 32);
      break;
    case 31:
      for (int i = 0; i < num_loops; ++i) in = unpack31_32_avx2(in, out + i * 32);
      break;
    case 32:
      for (int i = 0; i < num_loops; ++i) in = unpack32_32_avx2(in, out + i * 32);
      break;
  }

  return batch_size;
}
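
/*
 * Illustrative usage sketch (not part of the original file): decoding a
 * packed buffer through unpack32_avx2. Buffer names, sizes and the guard
 * macro are assumptions made for this example only. unpack32_avx2 rounds
 * the batch size down to a multiple of 32 and returns the number of values
 * it actually decoded; num_bits is expected to be in [0, 32].
 */
#ifdef UNPACK32_AVX2_USAGE_EXAMPLE
#include <stdio.h>

int main(void) {
  // 64 values packed at 5 bits each occupy 64 * 5 / 32 = 10 input words.
  uint32_t packed[10] = {0};
  uint32_t unpacked[64];

  int decoded = unpack32_avx2(packed, unpacked, 64, 5);
  printf("decoded %d values, first value = %u\n", decoded, (unsigned)unpacked[0]);
  return 0;
}
#endif  // UNPACK32_AVX2_USAGE_EXAMPLE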