github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/utils/_lib/bit_packing_neon.c (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 #include <stdint.h> 18 #include <string.h> 19 20 #include "arm_neon.h" 21 22 inline const uint32_t* unpack0_32_neon(const uint32_t* in, uint32_t* out) { 23 for (const uint32_t* end = out + 32; out != end; out++) { 24 *out = 0; 25 } 26 27 return in; 28 } 29 30 inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) { 31 uint32_t mask = 0x1; 32 uint32_t ind[4]; 33 uint32_t shifts_1st[4] = {0, 1, 2, 3}; 34 uint32_t shifts_2nd[4] = {4, 5, 6, 7}; 35 uint32_t shifts_3rd[4] = {8, 9, 10, 11}; 36 uint32_t shifts_4th[4] = {12, 13, 14, 15}; 37 uint32_t shifts_5th[4] = {16, 17, 18, 19}; 38 uint32_t shifts_6th[4] = {20, 21, 22, 23}; 39 uint32_t shifts_7th[4] = {24, 25, 26, 27}; 40 uint32_t shifts_8th[4] = {28, 29, 30, 31}; 41 uint32x4_t reg_shft, reg_masks; 42 uint32x4_t results; 43 44 reg_masks = vdupq_n_u32(mask); 45 46 // shift the first 4 outs 47 ind[0] = in[0] >> shifts_1st[0]; 48 ind[1] = in[0] >> shifts_1st[1]; 49 ind[2] = in[0] >> shifts_1st[2]; 50 ind[3] = in[0] >> shifts_1st[3]; 51 reg_shft = vld1q_u32(ind); 52 results = vandq_u32(reg_shft, reg_masks); 53 vst1q_u32(out, results); 54 out += 4; 55 56 // shift the 2nd 4 outs 57 ind[0] = in[0] >> shifts_2nd[0]; 58 ind[1] = in[0] >> shifts_2nd[1]; 59 ind[2] = in[0] >> shifts_2nd[2]; 60 ind[3] = in[0] >> shifts_2nd[3]; 61 reg_shft = vld1q_u32(ind); 62 results = vandq_u32(reg_shft, reg_masks); 63 vst1q_u32(out, results); 64 out += 4; 65 66 // shift the 3rd 4 outs 67 ind[0] = in[0] >> shifts_3rd[0]; 68 ind[1] = in[0] >> shifts_3rd[1]; 69 ind[2] = in[0] >> shifts_3rd[2]; 70 ind[3] = in[0] >> shifts_3rd[3]; 71 reg_shft = vld1q_u32(ind); 72 results = vandq_u32(reg_shft, reg_masks); 73 vst1q_u32(out, results); 74 out += 4; 75 76 // shift the 4th 4 outs 77 ind[0] = in[0] >> shifts_4th[0]; 78 ind[1] = in[0] >> shifts_4th[1]; 79 ind[2] = in[0] >> shifts_4th[2]; 80 ind[3] = in[0] >> shifts_4th[3]; 81 reg_shft = vld1q_u32(ind); 82 results = vandq_u32(reg_shft, reg_masks); 83 vst1q_u32(out, results); 84 out += 4; 85 86 // shift the 5th 4 outs 87 ind[0] = in[0] >> shifts_5th[0]; 88 ind[1] = in[0] >> shifts_5th[1]; 89 ind[2] = in[0] >> shifts_5th[2]; 90 ind[3] = in[0] >> shifts_5th[3]; 91 reg_shft = vld1q_u32(ind); 92 results = vandq_u32(reg_shft, reg_masks); 93 vst1q_u32(out, results); 94 out += 4; 95 96 // shift the 6th 4 outs 97 ind[0] = in[0] >> shifts_6th[0]; 98 ind[1] = in[0] >> shifts_6th[1]; 99 ind[2] = in[0] >> shifts_6th[2]; 100 ind[3] = in[0] >> shifts_6th[3]; 101 reg_shft = vld1q_u32(ind); 102 results = vandq_u32(reg_shft, reg_masks); 103 vst1q_u32(out, results); 104 out += 4; 105 106 // shift the 7th 4 outs 107 ind[0] = in[0] >> shifts_7th[0]; 108 ind[1] = in[0] >> shifts_7th[1]; 109 ind[2] = in[0] >> shifts_7th[2]; 110 ind[3] = in[0] >> shifts_7th[3]; 111 reg_shft = vld1q_u32(ind); 112 results = vandq_u32(reg_shft, reg_masks); 113 vst1q_u32(out, results); 114 out += 4; 115 116 // shift the 8th 4 outs 117 ind[0] = in[0] >> shifts_8th[0]; 118 ind[1] = in[0] >> shifts_8th[1]; 119 ind[2] = in[0] >> shifts_8th[2]; 120 ind[3] = in[0] >> shifts_8th[3]; 121 reg_shft = vld1q_u32(ind); 122 results = vandq_u32(reg_shft, reg_masks); 123 vst1q_u32(out, results); 124 out += 4; 125 126 in += 1; 127 128 return in; 129 } 130 131 inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) { 132 uint32_t mask = 0x3; 133 uint32_t ind[4]; 134 uint32_t shifts_1st[4] = {0, 2, 4, 6}; 135 uint32_t shifts_2nd[4] = {8, 10, 12, 14}; 136 uint32_t shifts_3rd[4] = {16, 18, 20, 22}; 137 uint32_t shifts_4th[4] = {24, 26, 28, 30}; 138 139 uint32x4_t reg_shft, reg_masks; 140 uint32x4_t results; 141 142 reg_masks = vdupq_n_u32(mask); 143 144 // shift the first 4 outs 145 ind[0] = in[0] >> shifts_1st[0]; 146 ind[1] = in[0] >> shifts_1st[1]; 147 ind[2] = in[0] >> shifts_1st[2]; 148 ind[3] = in[0] >> shifts_1st[3]; 149 reg_shft = vld1q_u32(ind); 150 results = vandq_u32(reg_shft, reg_masks); 151 vst1q_u32(out, results); 152 out += 4; 153 154 // shift the 2nd 4 outs 155 ind[0] = in[0] >> shifts_2nd[0]; 156 ind[1] = in[0] >> shifts_2nd[1]; 157 ind[2] = in[0] >> shifts_2nd[2]; 158 ind[3] = in[0] >> shifts_2nd[3]; 159 reg_shft = vld1q_u32(ind); 160 results = vandq_u32(reg_shft, reg_masks); 161 vst1q_u32(out, results); 162 out += 4; 163 164 // shift the 3rd 4 outs 165 ind[0] = in[0] >> shifts_3rd[0]; 166 ind[1] = in[0] >> shifts_3rd[1]; 167 ind[2] = in[0] >> shifts_3rd[2]; 168 ind[3] = in[0] >> shifts_3rd[3]; 169 reg_shft = vld1q_u32(ind); 170 results = vandq_u32(reg_shft, reg_masks); 171 vst1q_u32(out, results); 172 out += 4; 173 174 // shift the 4th 4 outs 175 ind[0] = in[0] >> shifts_4th[0]; 176 ind[1] = in[0] >> shifts_4th[1]; 177 ind[2] = in[0] >> shifts_4th[2]; 178 ind[3] = in[0] >> shifts_4th[3]; 179 reg_shft = vld1q_u32(ind); 180 results = vandq_u32(reg_shft, reg_masks); 181 vst1q_u32(out, results); 182 out += 4; 183 184 // shift the 5th 4 outs 185 ind[0] = in[1] >> shifts_1st[0]; 186 ind[1] = in[1] >> shifts_1st[1]; 187 ind[2] = in[1] >> shifts_1st[2]; 188 ind[3] = in[1] >> shifts_1st[3]; 189 reg_shft = vld1q_u32(ind); 190 results = vandq_u32(reg_shft, reg_masks); 191 vst1q_u32(out, results); 192 out += 4; 193 194 // shift the 6th 4 outs 195 ind[0] = in[1] >> shifts_2nd[0]; 196 ind[1] = in[1] >> shifts_2nd[1]; 197 ind[2] = in[1] >> shifts_2nd[2]; 198 ind[3] = in[1] >> shifts_2nd[3]; 199 reg_shft = vld1q_u32(ind); 200 results = vandq_u32(reg_shft, reg_masks); 201 vst1q_u32(out, results); 202 out += 4; 203 204 // shift the 7th 4 outs 205 ind[0] = in[1] >> shifts_3rd[0]; 206 ind[1] = in[1] >> shifts_3rd[1]; 207 ind[2] = in[1] >> shifts_3rd[2]; 208 ind[3] = in[1] >> shifts_3rd[3]; 209 reg_shft = vld1q_u32(ind); 210 results = vandq_u32(reg_shft, reg_masks); 211 vst1q_u32(out, results); 212 out += 4; 213 214 // shift the 8th 4 outs 215 ind[0] = in[1] >> shifts_4th[0]; 216 ind[1] = in[1] >> shifts_4th[1]; 217 ind[2] = in[1] >> shifts_4th[2]; 218 ind[3] = in[1] >> shifts_4th[3]; 219 reg_shft = vld1q_u32(ind); 220 results = vandq_u32(reg_shft, reg_masks); 221 vst1q_u32(out, results); 222 out += 4; 223 224 in += 2; 225 226 return in; 227 } 228 229 inline static const uint32_t* unpack3_32_neon(const uint32_t* in, uint32_t* out) { 230 uint32_t mask = 0x7; 231 uint32_t ind[4]; 232 uint32_t shifts_1st[4] = {0, 3, 6, 9}; 233 uint32_t shifts_2nd[4] = {12, 15, 18, 21}; 234 uint32_t shifts_3rd[4] = {24, 27, 0, 1}; 235 uint32_t shifts_4th[4] = {4, 7, 10, 13}; 236 uint32_t shifts_5th[4] = {16, 19, 22, 25}; 237 uint32_t shifts_6th[4] = {28, 0, 2, 5}; 238 uint32_t shifts_7th[4] = {8, 11, 14, 17}; 239 uint32_t shifts_8th[4] = {20, 23, 26, 29}; 240 uint32x4_t reg_shft, reg_masks; 241 uint32x4_t results; 242 243 reg_masks = vdupq_n_u32(mask); 244 245 // shift the first 4 outs 246 ind[0] = in[0] >> shifts_1st[0]; 247 ind[1] = in[0] >> shifts_1st[1]; 248 ind[2] = in[0] >> shifts_1st[2]; 249 ind[3] = in[0] >> shifts_1st[3]; 250 reg_shft = vld1q_u32(ind); 251 results = vandq_u32(reg_shft, reg_masks); 252 vst1q_u32(out, results); 253 out += 4; 254 255 // shift the 2nd 4 outs 256 ind[0] = in[0] >> shifts_2nd[0]; 257 ind[1] = in[0] >> shifts_2nd[1]; 258 ind[2] = in[0] >> shifts_2nd[2]; 259 ind[3] = in[0] >> shifts_2nd[3]; 260 reg_shft = vld1q_u32(ind); 261 results = vandq_u32(reg_shft, reg_masks); 262 vst1q_u32(out, results); 263 out += 4; 264 265 // shift the 3rd 4 outs 266 ind[0] = in[0] >> shifts_3rd[0]; 267 ind[1] = in[0] >> shifts_3rd[1]; 268 ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_3rd[2]; 269 ind[3] = in[1] >> shifts_3rd[3]; 270 reg_shft = vld1q_u32(ind); 271 results = vandq_u32(reg_shft, reg_masks); 272 vst1q_u32(out, results); 273 out += 4; 274 275 // shift the 4th 4 outs 276 ind[0] = in[1] >> shifts_4th[0]; 277 ind[1] = in[1] >> shifts_4th[1]; 278 ind[2] = in[1] >> shifts_4th[2]; 279 ind[3] = in[1] >> shifts_4th[3]; 280 reg_shft = vld1q_u32(ind); 281 results = vandq_u32(reg_shft, reg_masks); 282 vst1q_u32(out, results); 283 out += 4; 284 285 // shift the 5th 4 outs 286 ind[0] = in[1] >> shifts_5th[0]; 287 ind[1] = in[1] >> shifts_5th[1]; 288 ind[2] = in[1] >> shifts_5th[2]; 289 ind[3] = in[1] >> shifts_5th[3]; 290 reg_shft = vld1q_u32(ind); 291 results = vandq_u32(reg_shft, reg_masks); 292 vst1q_u32(out, results); 293 out += 4; 294 295 // shift the 6th 4 outs 296 ind[0] = in[1] >> shifts_6th[0]; 297 ind[1] = (in[1] >> 31 | in[2] << 1) >> shifts_6th[1]; 298 ind[2] = in[2] >> shifts_6th[2]; 299 ind[3] = in[2] >> shifts_6th[3]; 300 reg_shft = vld1q_u32(ind); 301 results = vandq_u32(reg_shft, reg_masks); 302 vst1q_u32(out, results); 303 out += 4; 304 305 // shift the 7th 4 outs 306 ind[0] = in[2] >> shifts_7th[0]; 307 ind[1] = in[2] >> shifts_7th[1]; 308 ind[2] = in[2] >> shifts_7th[2]; 309 ind[3] = in[2] >> shifts_7th[3]; 310 reg_shft = vld1q_u32(ind); 311 results = vandq_u32(reg_shft, reg_masks); 312 vst1q_u32(out, results); 313 out += 4; 314 315 // shift the 8th 4 outs 316 ind[0] = in[2] >> shifts_8th[0]; 317 ind[1] = in[2] >> shifts_8th[1]; 318 ind[2] = in[2] >> shifts_8th[2]; 319 ind[3] = in[2] >> shifts_8th[3]; 320 reg_shft = vld1q_u32(ind); 321 results = vandq_u32(reg_shft, reg_masks); 322 vst1q_u32(out, results); 323 out += 4; 324 325 in += 3; 326 327 return in; 328 } 329 330 inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) { 331 uint32_t mask = 0xf; 332 uint32_t ind[4]; 333 uint32_t shifts_1st[4] = {0, 4, 8, 12}; 334 uint32_t shifts_2nd[4] = {16, 20, 24, 28}; 335 uint32x4_t reg_shft, reg_masks; 336 uint32x4_t results; 337 338 reg_masks = vdupq_n_u32(mask); 339 340 // shift the first 4 outs 341 ind[0] = in[0] >> shifts_1st[0]; 342 ind[1] = in[0] >> shifts_1st[1]; 343 ind[2] = in[0] >> shifts_1st[2]; 344 ind[3] = in[0] >> shifts_1st[3]; 345 reg_shft = vld1q_u32(ind); 346 results = vandq_u32(reg_shft, reg_masks); 347 vst1q_u32(out, results); 348 out += 4; 349 350 // shift the 2nd 4 outs 351 ind[0] = in[0] >> shifts_2nd[0]; 352 ind[1] = in[0] >> shifts_2nd[1]; 353 ind[2] = in[0] >> shifts_2nd[2]; 354 ind[3] = in[0] >> shifts_2nd[3]; 355 reg_shft = vld1q_u32(ind); 356 results = vandq_u32(reg_shft, reg_masks); 357 vst1q_u32(out, results); 358 out += 4; 359 360 // shift the 3rd 4 outs 361 ind[0] = in[1] >> shifts_1st[0]; 362 ind[1] = in[1] >> shifts_1st[1]; 363 ind[2] = in[1] >> shifts_1st[2]; 364 ind[3] = in[1] >> shifts_1st[3]; 365 reg_shft = vld1q_u32(ind); 366 results = vandq_u32(reg_shft, reg_masks); 367 vst1q_u32(out, results); 368 out += 4; 369 370 // shift the 4th 4 outs 371 ind[0] = in[1] >> shifts_2nd[0]; 372 ind[1] = in[1] >> shifts_2nd[1]; 373 ind[2] = in[1] >> shifts_2nd[2]; 374 ind[3] = in[1] >> shifts_2nd[3]; 375 reg_shft = vld1q_u32(ind); 376 results = vandq_u32(reg_shft, reg_masks); 377 vst1q_u32(out, results); 378 out += 4; 379 380 // shift the 5th 4 outs 381 ind[0] = in[2] >> shifts_1st[0]; 382 ind[1] = in[2] >> shifts_1st[1]; 383 ind[2] = in[2] >> shifts_1st[2]; 384 ind[3] = in[2] >> shifts_1st[3]; 385 reg_shft = vld1q_u32(ind); 386 results = vandq_u32(reg_shft, reg_masks); 387 vst1q_u32(out, results); 388 out += 4; 389 390 // shift the 6th 4 outs 391 ind[0] = in[2] >> shifts_2nd[0]; 392 ind[1] = in[2] >> shifts_2nd[1]; 393 ind[2] = in[2] >> shifts_2nd[2]; 394 ind[3] = in[2] >> shifts_2nd[3]; 395 reg_shft = vld1q_u32(ind); 396 results = vandq_u32(reg_shft, reg_masks); 397 vst1q_u32(out, results); 398 out += 4; 399 400 // shift the 7th 4 outs 401 ind[0] = in[3] >> shifts_1st[0]; 402 ind[1] = in[3] >> shifts_1st[1]; 403 ind[2] = in[3] >> shifts_1st[2]; 404 ind[3] = in[3] >> shifts_1st[3]; 405 reg_shft = vld1q_u32(ind); 406 results = vandq_u32(reg_shft, reg_masks); 407 vst1q_u32(out, results); 408 out += 4; 409 410 // shift the 8th 4 outs 411 ind[0] = in[3] >> shifts_2nd[0]; 412 ind[1] = in[3] >> shifts_2nd[1]; 413 ind[2] = in[3] >> shifts_2nd[2]; 414 ind[3] = in[3] >> shifts_2nd[3]; 415 reg_shft = vld1q_u32(ind); 416 results = vandq_u32(reg_shft, reg_masks); 417 vst1q_u32(out, results); 418 out += 4; 419 420 in += 4; 421 422 return in; 423 } 424 425 inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) { 426 uint32_t mask = 0x1f; 427 uint32_t ind[4]; 428 uint32_t shifts_1st[4] = {0, 5, 10, 15}; 429 uint32_t shifts_2nd[4] = {20, 25, 0, 3}; 430 uint32_t shifts_3rd[4] = {8, 13, 18, 23}; 431 uint32_t shifts_4th[4] = {0, 1, 6, 11}; 432 uint32_t shifts_5th[4] = {16, 21, 26, 0}; 433 uint32_t shifts_6th[4] = {4, 9, 14, 19}; 434 uint32_t shifts_7th[4] = {24, 0, 2, 7}; 435 uint32_t shifts_8th[4] = {12, 17, 22, 27}; 436 uint32x4_t reg_shft, reg_masks; 437 uint32x4_t results; 438 439 reg_masks = vdupq_n_u32(mask); 440 441 // shift the first 4 outs 442 ind[0] = in[0] >> shifts_1st[0]; 443 ind[1] = in[0] >> shifts_1st[1]; 444 ind[2] = in[0] >> shifts_1st[2]; 445 ind[3] = in[0] >> shifts_1st[3]; 446 reg_shft = vld1q_u32(ind); 447 results = vandq_u32(reg_shft, reg_masks); 448 vst1q_u32(out, results); 449 out += 4; 450 451 // shift the 2nd 4 outs 452 ind[0] = in[0] >> shifts_2nd[0]; 453 ind[1] = in[0] >> shifts_2nd[1]; 454 ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_2nd[2]; 455 ind[3] = in[1] >> shifts_2nd[3]; 456 reg_shft = vld1q_u32(ind); 457 results = vandq_u32(reg_shft, reg_masks); 458 vst1q_u32(out, results); 459 out += 4; 460 461 // shift the 3rd 4 outs 462 ind[0] = in[1] >> shifts_3rd[0]; 463 ind[1] = in[1] >> shifts_3rd[1]; 464 ind[2] = in[1] >> shifts_3rd[2]; 465 ind[3] = in[1] >> shifts_3rd[3]; 466 reg_shft = vld1q_u32(ind); 467 results = vandq_u32(reg_shft, reg_masks); 468 vst1q_u32(out, results); 469 out += 4; 470 471 // shift the 4th 4 outs 472 ind[0] = (in[1] >> 28 | in[2] << 4) >> shifts_4th[0]; 473 ind[1] = in[2] >> shifts_4th[1]; 474 ind[2] = in[2] >> shifts_4th[2]; 475 ind[3] = in[2] >> shifts_4th[3]; 476 reg_shft = vld1q_u32(ind); 477 results = vandq_u32(reg_shft, reg_masks); 478 vst1q_u32(out, results); 479 out += 4; 480 481 // shift the 5th 4 outs 482 ind[0] = in[2] >> shifts_5th[0]; 483 ind[1] = in[2] >> shifts_5th[1]; 484 ind[2] = in[2] >> shifts_5th[2]; 485 ind[3] = (in[2] >> 31 | in[3] << 1) >> shifts_5th[3]; 486 reg_shft = vld1q_u32(ind); 487 results = vandq_u32(reg_shft, reg_masks); 488 vst1q_u32(out, results); 489 out += 4; 490 491 // shift the 6th 4 outs 492 ind[0] = in[3] >> shifts_6th[0]; 493 ind[1] = in[3] >> shifts_6th[1]; 494 ind[2] = in[3] >> shifts_6th[2]; 495 ind[3] = in[3] >> shifts_6th[3]; 496 reg_shft = vld1q_u32(ind); 497 results = vandq_u32(reg_shft, reg_masks); 498 vst1q_u32(out, results); 499 out += 4; 500 501 // shift the 7th 4 outs 502 ind[0] = in[3] >> shifts_7th[0]; 503 ind[1] = (in[3] >> 29 | in[4] << 3) >> shifts_7th[1]; 504 ind[2] = in[4] >> shifts_7th[2]; 505 ind[3] = in[4] >> shifts_7th[3]; 506 reg_shft = vld1q_u32(ind); 507 results = vandq_u32(reg_shft, reg_masks); 508 vst1q_u32(out, results); 509 out += 4; 510 511 // shift the 8th 4 outs 512 ind[0] = in[4] >> shifts_8th[0]; 513 ind[1] = in[4] >> shifts_8th[1]; 514 ind[2] = in[4] >> shifts_8th[2]; 515 ind[3] = in[4] >> shifts_8th[3]; 516 reg_shft = vld1q_u32(ind); 517 results = vandq_u32(reg_shft, reg_masks); 518 vst1q_u32(out, results); 519 out += 4; 520 521 in += 5; 522 523 return in; 524 } 525 526 inline static const uint32_t* unpack6_32_neon(const uint32_t* in, uint32_t* out) { 527 uint32_t mask = 0x3f; 528 uint32_t ind[4]; 529 uint32_t shifts_1st[4] = {0, 6, 12, 18}; 530 uint32_t shifts_2nd[4] = {24, 0, 4, 10}; 531 uint32_t shifts_3rd[4] = {16, 22, 0, 2}; 532 uint32_t shifts_4th[4] = {8, 14, 20, 26}; 533 534 uint32x4_t reg_shft, reg_masks; 535 uint32x4_t results; 536 537 reg_masks = vdupq_n_u32(mask); 538 539 // shift the first 4 outs 540 ind[0] = in[0] >> shifts_1st[0]; 541 ind[1] = in[0] >> shifts_1st[1]; 542 ind[2] = in[0] >> shifts_1st[2]; 543 ind[3] = in[0] >> shifts_1st[3]; 544 reg_shft = vld1q_u32(ind); 545 results = vandq_u32(reg_shft, reg_masks); 546 vst1q_u32(out, results); 547 out += 4; 548 549 // shift the 2nd 4 outs 550 ind[0] = in[0] >> shifts_2nd[0]; 551 ind[1] = (in[0] >> 30 | in[1] << 2) >> shifts_2nd[1]; 552 ind[2] = in[1] >> shifts_2nd[2]; 553 ind[3] = in[1] >> shifts_2nd[3]; 554 reg_shft = vld1q_u32(ind); 555 results = vandq_u32(reg_shft, reg_masks); 556 vst1q_u32(out, results); 557 out += 4; 558 559 // shift the 3rd 4 outs 560 ind[0] = in[1] >> shifts_3rd[0]; 561 ind[1] = in[1] >> shifts_3rd[1]; 562 ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_3rd[2]; 563 ind[3] = in[2] >> shifts_3rd[3]; 564 reg_shft = vld1q_u32(ind); 565 results = vandq_u32(reg_shft, reg_masks); 566 vst1q_u32(out, results); 567 out += 4; 568 569 // shift the 4th 4 outs 570 ind[0] = in[2] >> shifts_4th[0]; 571 ind[1] = in[2] >> shifts_4th[1]; 572 ind[2] = in[2] >> shifts_4th[2]; 573 ind[3] = in[2] >> shifts_4th[3]; 574 reg_shft = vld1q_u32(ind); 575 results = vandq_u32(reg_shft, reg_masks); 576 vst1q_u32(out, results); 577 out += 4; 578 579 // shift the 5th 4 outs 580 ind[0] = in[3] >> shifts_1st[0]; 581 ind[1] = in[3] >> shifts_1st[1]; 582 ind[2] = in[3] >> shifts_1st[2]; 583 ind[3] = in[3] >> shifts_1st[3]; 584 reg_shft = vld1q_u32(ind); 585 results = vandq_u32(reg_shft, reg_masks); 586 vst1q_u32(out, results); 587 out += 4; 588 589 // shift the 6th 4 outs 590 ind[0] = in[3] >> shifts_2nd[0]; 591 ind[1] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[1]; 592 ind[2] = in[4] >> shifts_2nd[2]; 593 ind[3] = in[4] >> shifts_2nd[3]; 594 reg_shft = vld1q_u32(ind); 595 results = vandq_u32(reg_shft, reg_masks); 596 vst1q_u32(out, results); 597 out += 4; 598 599 // shift the 7th 4 outs 600 ind[0] = in[4] >> shifts_3rd[0]; 601 ind[1] = in[4] >> shifts_3rd[1]; 602 ind[2] = (in[4] >> 28 | in[5] << 4) >> shifts_3rd[2]; 603 ind[3] = in[5] >> shifts_3rd[3]; 604 reg_shft = vld1q_u32(ind); 605 results = vandq_u32(reg_shft, reg_masks); 606 vst1q_u32(out, results); 607 out += 4; 608 609 // shift the 8th 4 outs 610 ind[0] = in[5] >> shifts_4th[0]; 611 ind[1] = in[5] >> shifts_4th[1]; 612 ind[2] = in[5] >> shifts_4th[2]; 613 ind[3] = in[5] >> shifts_4th[3]; 614 reg_shft = vld1q_u32(ind); 615 results = vandq_u32(reg_shft, reg_masks); 616 vst1q_u32(out, results); 617 out += 4; 618 619 in += 6; 620 621 return in; 622 } 623 624 inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) { 625 uint32_t mask = 0x7f; 626 uint32_t ind[4]; 627 uint32_t shifts_1st[4] = {0, 7, 14, 21}; 628 uint32_t shifts_2nd[4] = {0, 3, 10, 17}; 629 uint32_t shifts_3rd[4] = {24, 0, 6, 13}; 630 uint32_t shifts_4th[4] = {20, 0, 2, 9}; 631 uint32_t shifts_5th[4] = {16, 23, 0, 5}; 632 uint32_t shifts_6th[4] = {12, 19, 0, 1}; 633 uint32_t shifts_7th[4] = {8, 15, 22, 0}; 634 uint32_t shifts_8th[4] = {4, 11, 18, 25}; 635 uint32x4_t reg_shft, reg_masks; 636 uint32x4_t results; 637 638 reg_masks = vdupq_n_u32(mask); 639 640 // shift the first 4 outs 641 ind[0] = in[0] >> shifts_1st[0]; 642 ind[1] = in[0] >> shifts_1st[1]; 643 ind[2] = in[0] >> shifts_1st[2]; 644 ind[3] = in[0] >> shifts_1st[3]; 645 reg_shft = vld1q_u32(ind); 646 results = vandq_u32(reg_shft, reg_masks); 647 vst1q_u32(out, results); 648 out += 4; 649 650 // shift the 2nd 4 outs 651 ind[0] = (in[0] >> 28 | in[1] << 4) >> shifts_2nd[0]; 652 ind[1] = in[1] >> shifts_2nd[1]; 653 ind[2] = in[1] >> shifts_2nd[2]; 654 ind[3] = in[1] >> shifts_2nd[3]; 655 reg_shft = vld1q_u32(ind); 656 results = vandq_u32(reg_shft, reg_masks); 657 vst1q_u32(out, results); 658 out += 4; 659 660 // shift the 3rd 4 outs 661 ind[0] = in[1] >> shifts_3rd[0]; 662 ind[1] = (in[1] >> 31 | in[2] << 1) >> shifts_3rd[1]; 663 ind[2] = in[2] >> shifts_3rd[2]; 664 ind[3] = in[2] >> shifts_3rd[3]; 665 reg_shft = vld1q_u32(ind); 666 results = vandq_u32(reg_shft, reg_masks); 667 vst1q_u32(out, results); 668 out += 4; 669 670 // shift the 4th 4 outs 671 ind[0] = in[2] >> shifts_4th[0]; 672 ind[1] = (in[2] >> 27 | in[3] << 5) >> shifts_4th[1]; 673 ind[2] = in[3] >> shifts_4th[2]; 674 ind[3] = in[3] >> shifts_4th[3]; 675 reg_shft = vld1q_u32(ind); 676 results = vandq_u32(reg_shft, reg_masks); 677 vst1q_u32(out, results); 678 out += 4; 679 680 // shift the 5th 4 outs 681 ind[0] = in[3] >> shifts_5th[0]; 682 ind[1] = in[3] >> shifts_5th[1]; 683 ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_5th[2]; 684 ind[3] = in[4] >> shifts_5th[3]; 685 reg_shft = vld1q_u32(ind); 686 results = vandq_u32(reg_shft, reg_masks); 687 vst1q_u32(out, results); 688 out += 4; 689 690 // shift the 6th 4 outs 691 ind[0] = in[4] >> shifts_6th[0]; 692 ind[1] = in[4] >> shifts_6th[1]; 693 ind[2] = (in[4] >> 26 | in[5] << 6) >> shifts_6th[2]; 694 ind[3] = in[5] >> shifts_6th[3]; 695 reg_shft = vld1q_u32(ind); 696 results = vandq_u32(reg_shft, reg_masks); 697 vst1q_u32(out, results); 698 out += 4; 699 700 // shift the 7th 4 outs 701 ind[0] = in[5] >> shifts_7th[0]; 702 ind[1] = in[5] >> shifts_7th[1]; 703 ind[2] = in[5] >> shifts_7th[2]; 704 ind[3] = (in[5] >> 29 | in[6] << 3) >> shifts_7th[3]; 705 reg_shft = vld1q_u32(ind); 706 results = vandq_u32(reg_shft, reg_masks); 707 vst1q_u32(out, results); 708 out += 4; 709 710 // shift the 8th 4 outs 711 ind[0] = in[6] >> shifts_8th[0]; 712 ind[1] = in[6] >> shifts_8th[1]; 713 ind[2] = in[6] >> shifts_8th[2]; 714 ind[3] = in[6] >> shifts_8th[3]; 715 reg_shft = vld1q_u32(ind); 716 results = vandq_u32(reg_shft, reg_masks); 717 vst1q_u32(out, results); 718 out += 4; 719 720 in += 7; 721 722 return in; 723 } 724 725 inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) { 726 uint32_t mask = 0xff; 727 uint32_t ind[4]; 728 uint32_t shifts_1st[4] = {0, 8, 16, 24}; 729 uint32x4_t reg_shft, reg_masks; 730 uint32x4_t results; 731 732 reg_masks = vdupq_n_u32(mask); 733 734 // shift the first 4 outs 735 ind[0] = in[0] >> shifts_1st[0]; 736 ind[1] = in[0] >> shifts_1st[1]; 737 ind[2] = in[0] >> shifts_1st[2]; 738 ind[3] = in[0] >> shifts_1st[3]; 739 reg_shft = vld1q_u32(ind); 740 results = vandq_u32(reg_shft, reg_masks); 741 vst1q_u32(out, results); 742 out += 4; 743 744 // shift the 2nd 4 outs 745 ind[0] = in[1] >> shifts_1st[0]; 746 ind[1] = in[1] >> shifts_1st[1]; 747 ind[2] = in[1] >> shifts_1st[2]; 748 ind[3] = in[1] >> shifts_1st[3]; 749 reg_shft = vld1q_u32(ind); 750 results = vandq_u32(reg_shft, reg_masks); 751 vst1q_u32(out, results); 752 out += 4; 753 754 // shift the 3rd 4 outs 755 ind[0] = in[2] >> shifts_1st[0]; 756 ind[1] = in[2] >> shifts_1st[1]; 757 ind[2] = in[2] >> shifts_1st[2]; 758 ind[3] = in[2] >> shifts_1st[3]; 759 reg_shft = vld1q_u32(ind); 760 results = vandq_u32(reg_shft, reg_masks); 761 vst1q_u32(out, results); 762 out += 4; 763 764 // shift the 4th 4 outs 765 ind[0] = in[3] >> shifts_1st[0]; 766 ind[1] = in[3] >> shifts_1st[1]; 767 ind[2] = in[3] >> shifts_1st[2]; 768 ind[3] = in[3] >> shifts_1st[3]; 769 reg_shft = vld1q_u32(ind); 770 results = vandq_u32(reg_shft, reg_masks); 771 vst1q_u32(out, results); 772 out += 4; 773 774 // shift the 5th 4 outs 775 ind[0] = in[4] >> shifts_1st[0]; 776 ind[1] = in[4] >> shifts_1st[1]; 777 ind[2] = in[4] >> shifts_1st[2]; 778 ind[3] = in[4] >> shifts_1st[3]; 779 reg_shft = vld1q_u32(ind); 780 results = vandq_u32(reg_shft, reg_masks); 781 vst1q_u32(out, results); 782 out += 4; 783 784 // shift the 6th 4 outs 785 ind[0] = in[5] >> shifts_1st[0]; 786 ind[1] = in[5] >> shifts_1st[1]; 787 ind[2] = in[5] >> shifts_1st[2]; 788 ind[3] = in[5] >> shifts_1st[3]; 789 reg_shft = vld1q_u32(ind); 790 results = vandq_u32(reg_shft, reg_masks); 791 vst1q_u32(out, results); 792 out += 4; 793 794 // shift the 7th 4 outs 795 ind[0] = in[6] >> shifts_1st[0]; 796 ind[1] = in[6] >> shifts_1st[1]; 797 ind[2] = in[6] >> shifts_1st[2]; 798 ind[3] = in[6] >> shifts_1st[3]; 799 reg_shft = vld1q_u32(ind); 800 results = vandq_u32(reg_shft, reg_masks); 801 vst1q_u32(out, results); 802 out += 4; 803 804 // shift the 8th 4 outs 805 ind[0] = in[7] >> shifts_1st[0]; 806 ind[1] = in[7] >> shifts_1st[1]; 807 ind[2] = in[7] >> shifts_1st[2]; 808 ind[3] = in[7] >> shifts_1st[3]; 809 reg_shft = vld1q_u32(ind); 810 results = vandq_u32(reg_shft, reg_masks); 811 vst1q_u32(out, results); 812 out += 4; 813 814 in += 8; 815 816 return in; 817 } 818 819 inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) { 820 uint32_t mask = 0x1ff; 821 uint32_t ind[4]; 822 uint32_t shifts_1st[4] = {0, 9, 18, 0}; 823 uint32_t shifts_2nd[4] = {4, 13, 22, 0}; 824 uint32_t shifts_3rd[4] = {8, 17, 0, 3}; 825 uint32_t shifts_4th[4] = {12, 21, 0, 7}; 826 uint32_t shifts_5th[4] = {16, 0, 2, 11}; 827 uint32_t shifts_6th[4] = {20, 0, 6, 15}; 828 uint32_t shifts_7th[4] = {0, 1, 10, 19}; 829 uint32_t shifts_8th[4] = {0, 5, 14, 23}; 830 uint32x4_t reg_shft, reg_masks; 831 uint32x4_t results; 832 833 reg_masks = vdupq_n_u32(mask); 834 835 // shift the first 4 outs 836 ind[0] = in[0] >> shifts_1st[0]; 837 ind[1] = in[0] >> shifts_1st[1]; 838 ind[2] = in[0] >> shifts_1st[2]; 839 ind[3] = (in[0] >> 27 | in[1] << 5) >> shifts_1st[3]; 840 reg_shft = vld1q_u32(ind); 841 results = vandq_u32(reg_shft, reg_masks); 842 vst1q_u32(out, results); 843 out += 4; 844 845 // shift the 2nd 4 outs 846 ind[0] = in[1] >> shifts_2nd[0]; 847 ind[1] = in[1] >> shifts_2nd[1]; 848 ind[2] = in[1] >> shifts_2nd[2]; 849 ind[3] = (in[1] >> 31 | in[2] << 1) >> shifts_2nd[3]; 850 reg_shft = vld1q_u32(ind); 851 results = vandq_u32(reg_shft, reg_masks); 852 vst1q_u32(out, results); 853 out += 4; 854 855 // shift the 3rd 4 outs 856 ind[0] = in[2] >> shifts_3rd[0]; 857 ind[1] = in[2] >> shifts_3rd[1]; 858 ind[2] = (in[2] >> 26 | in[3] << 6) >> shifts_3rd[2]; 859 ind[3] = in[3] >> shifts_3rd[3]; 860 reg_shft = vld1q_u32(ind); 861 results = vandq_u32(reg_shft, reg_masks); 862 vst1q_u32(out, results); 863 out += 4; 864 865 // shift the 4th 4 outs 866 ind[0] = in[3] >> shifts_4th[0]; 867 ind[1] = in[3] >> shifts_4th[1]; 868 ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_4th[2]; 869 ind[3] = in[4] >> shifts_4th[3]; 870 reg_shft = vld1q_u32(ind); 871 results = vandq_u32(reg_shft, reg_masks); 872 vst1q_u32(out, results); 873 out += 4; 874 875 // shift the 5th 4 outs 876 ind[0] = in[4] >> shifts_5th[0]; 877 ind[1] = (in[4] >> 25 | in[5] << 7) >> shifts_5th[1]; 878 ind[2] = in[5] >> shifts_5th[2]; 879 ind[3] = in[5] >> shifts_5th[3]; 880 reg_shft = vld1q_u32(ind); 881 results = vandq_u32(reg_shft, reg_masks); 882 vst1q_u32(out, results); 883 out += 4; 884 885 // shift the 6th 4 outs 886 ind[0] = in[5] >> shifts_6th[0]; 887 ind[1] = (in[5] >> 29 | in[6] << 3) >> shifts_6th[1]; 888 ind[2] = in[6] >> shifts_6th[2]; 889 ind[3] = in[6] >> shifts_6th[3]; 890 reg_shft = vld1q_u32(ind); 891 results = vandq_u32(reg_shft, reg_masks); 892 vst1q_u32(out, results); 893 out += 4; 894 895 // shift the 7th 4 outs 896 ind[0] = (in[6] >> 24 | in[7] << 8) >> shifts_7th[0]; 897 ind[1] = in[7] >> shifts_7th[1]; 898 ind[2] = in[7] >> shifts_7th[2]; 899 ind[3] = in[7] >> shifts_7th[3]; 900 reg_shft = vld1q_u32(ind); 901 results = vandq_u32(reg_shft, reg_masks); 902 vst1q_u32(out, results); 903 out += 4; 904 905 // shift the 8th 4 outs 906 ind[0] = (in[7] >> 28 | in[8] << 4) >> shifts_8th[0]; 907 ind[1] = in[8] >> shifts_8th[1]; 908 ind[2] = in[8] >> shifts_8th[2]; 909 ind[3] = in[8] >> shifts_8th[3]; 910 reg_shft = vld1q_u32(ind); 911 results = vandq_u32(reg_shft, reg_masks); 912 vst1q_u32(out, results); 913 out += 4; 914 915 in += 9; 916 917 return in; 918 } 919 920 inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out) { 921 uint32_t mask = 0x3ff; 922 uint32_t ind[4]; 923 uint32_t shifts_1st[4] = {0, 10, 20, 0}; 924 uint32_t shifts_2nd[4] = {8, 18, 0, 6}; 925 uint32_t shifts_3rd[4] = {16, 0, 4, 14}; 926 uint32_t shifts_4th[4] = {0, 2, 12, 22}; 927 uint32x4_t reg_shft, reg_masks; 928 uint32x4_t results; 929 930 reg_masks = vdupq_n_u32(mask); 931 932 // shift the first 4 outs 933 ind[0] = in[0] >> shifts_1st[0]; 934 ind[1] = in[0] >> shifts_1st[1]; 935 ind[2] = in[0] >> shifts_1st[2]; 936 ind[3] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[3]; 937 reg_shft = vld1q_u32(ind); 938 results = vandq_u32(reg_shft, reg_masks); 939 vst1q_u32(out, results); 940 out += 4; 941 942 // shift the 2nd 4 outs 943 ind[0] = in[1] >> shifts_2nd[0]; 944 ind[1] = in[1] >> shifts_2nd[1]; 945 ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[2]; 946 ind[3] = in[2] >> shifts_2nd[3]; 947 reg_shft = vld1q_u32(ind); 948 results = vandq_u32(reg_shft, reg_masks); 949 vst1q_u32(out, results); 950 out += 4; 951 952 // shift the 3rd 4 outs 953 ind[0] = in[2] >> shifts_3rd[0]; 954 ind[1] = (in[2] >> 26 | in[3] << 6) >> shifts_3rd[1]; 955 ind[2] = in[3] >> shifts_3rd[2]; 956 ind[3] = in[3] >> shifts_3rd[3]; 957 reg_shft = vld1q_u32(ind); 958 results = vandq_u32(reg_shft, reg_masks); 959 vst1q_u32(out, results); 960 out += 4; 961 962 // shift the 4th 4 outs 963 ind[0] = (in[3] >> 24 | in[4] << 8) >> shifts_4th[0]; 964 ind[1] = in[4] >> shifts_4th[1]; 965 ind[2] = in[4] >> shifts_4th[2]; 966 ind[3] = in[4] >> shifts_4th[3]; 967 reg_shft = vld1q_u32(ind); 968 results = vandq_u32(reg_shft, reg_masks); 969 vst1q_u32(out, results); 970 out += 4; 971 972 // shift the 5th 4 outs 973 ind[0] = in[5] >> shifts_1st[0]; 974 ind[1] = in[5] >> shifts_1st[1]; 975 ind[2] = in[5] >> shifts_1st[2]; 976 ind[3] = (in[5] >> 30 | in[6] << 2) >> shifts_1st[3]; 977 reg_shft = vld1q_u32(ind); 978 results = vandq_u32(reg_shft, reg_masks); 979 vst1q_u32(out, results); 980 out += 4; 981 982 // shift the 6th 4 outs 983 ind[0] = in[6] >> shifts_2nd[0]; 984 ind[1] = in[6] >> shifts_2nd[1]; 985 ind[2] = (in[6] >> 28 | in[7] << 4) >> shifts_2nd[2]; 986 ind[3] = in[7] >> shifts_2nd[3]; 987 reg_shft = vld1q_u32(ind); 988 results = vandq_u32(reg_shft, reg_masks); 989 vst1q_u32(out, results); 990 out += 4; 991 992 // shift the 7th 4 outs 993 ind[0] = in[7] >> shifts_3rd[0]; 994 ind[1] = (in[7] >> 26 | in[8] << 6) >> shifts_3rd[1]; 995 ind[2] = in[8] >> shifts_3rd[2]; 996 ind[3] = in[8] >> shifts_3rd[3]; 997 reg_shft = vld1q_u32(ind); 998 results = vandq_u32(reg_shft, reg_masks); 999 vst1q_u32(out, results); 1000 out += 4; 1001 1002 // shift the 8th 4 outs 1003 ind[0] = (in[8] >> 24 | in[9] << 8) >> shifts_4th[0]; 1004 ind[1] = in[9] >> shifts_4th[1]; 1005 ind[2] = in[9] >> shifts_4th[2]; 1006 ind[3] = in[9] >> shifts_4th[3]; 1007 reg_shft = vld1q_u32(ind); 1008 results = vandq_u32(reg_shft, reg_masks); 1009 vst1q_u32(out, results); 1010 out += 4; 1011 1012 in += 10; 1013 1014 return in; 1015 } 1016 1017 inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out) { 1018 uint32_t mask = 0x7ff; 1019 uint32_t ind[4]; 1020 uint32_t shifts_1st[4] = {0, 11, 0, 1}; 1021 uint32_t shifts_2nd[4] = {12, 0, 2, 13}; 1022 uint32_t shifts_3rd[4] = {0, 3, 14, 0}; 1023 uint32_t shifts_4th[4] = {4, 15, 0, 5}; 1024 uint32_t shifts_5th[4] = {16, 0, 6, 17}; 1025 uint32_t shifts_6th[4] = {0, 7, 18, 0}; 1026 uint32_t shifts_7th[4] = {8, 19, 0, 9}; 1027 uint32_t shifts_8th[4] = {20, 0, 10, 21}; 1028 uint32x4_t reg_shft, reg_masks; 1029 uint32x4_t results; 1030 1031 reg_masks = vdupq_n_u32(mask); 1032 1033 // shift the first 4 outs 1034 ind[0] = in[0] >> shifts_1st[0]; 1035 ind[1] = in[0] >> shifts_1st[1]; 1036 ind[2] = (in[0] >> 22 | in[1] << 10) >> shifts_1st[2]; 1037 ind[3] = in[1] >> shifts_1st[3]; 1038 reg_shft = vld1q_u32(ind); 1039 results = vandq_u32(reg_shft, reg_masks); 1040 vst1q_u32(out, results); 1041 out += 4; 1042 1043 // shift the 2nd 4 outs 1044 ind[0] = in[1] >> shifts_2nd[0]; 1045 ind[1] = (in[1] >> 23 | in[2] << 9) >> shifts_2nd[1]; 1046 ind[2] = in[2] >> shifts_2nd[2]; 1047 ind[3] = in[2] >> shifts_2nd[3]; 1048 reg_shft = vld1q_u32(ind); 1049 results = vandq_u32(reg_shft, reg_masks); 1050 vst1q_u32(out, results); 1051 out += 4; 1052 1053 // shift the 3rd 4 outs 1054 ind[0] = (in[2] >> 24 | in[3] << 8) >> shifts_3rd[0]; 1055 ind[1] = in[3] >> shifts_3rd[1]; 1056 ind[2] = in[3] >> shifts_3rd[2]; 1057 ind[3] = (in[3] >> 25 | in[4] << 7) >> shifts_3rd[3]; 1058 reg_shft = vld1q_u32(ind); 1059 results = vandq_u32(reg_shft, reg_masks); 1060 vst1q_u32(out, results); 1061 out += 4; 1062 1063 // shift the 4th 4 outs 1064 ind[0] = in[4] >> shifts_4th[0]; 1065 ind[1] = in[4] >> shifts_4th[1]; 1066 ind[2] = (in[4] >> 26 | in[5] << 6) >> shifts_4th[2]; 1067 ind[3] = in[5] >> shifts_4th[3]; 1068 reg_shft = vld1q_u32(ind); 1069 results = vandq_u32(reg_shft, reg_masks); 1070 vst1q_u32(out, results); 1071 out += 4; 1072 1073 // shift the 5th 4 outs 1074 ind[0] = in[5] >> shifts_5th[0]; 1075 ind[1] = (in[5] >> 27 | in[6] << 5) >> shifts_5th[1]; 1076 ind[2] = in[6] >> shifts_5th[2]; 1077 ind[3] = in[6] >> shifts_5th[3]; 1078 reg_shft = vld1q_u32(ind); 1079 results = vandq_u32(reg_shft, reg_masks); 1080 vst1q_u32(out, results); 1081 out += 4; 1082 1083 // shift the 6th 4 outs 1084 ind[0] = (in[6] >> 28 | in[7] << 4) >> shifts_6th[0]; 1085 ind[1] = in[7] >> shifts_6th[1]; 1086 ind[2] = in[7] >> shifts_6th[2]; 1087 ind[3] = (in[7] >> 29 | in[8] << 3) >> shifts_6th[3]; 1088 reg_shft = vld1q_u32(ind); 1089 results = vandq_u32(reg_shft, reg_masks); 1090 vst1q_u32(out, results); 1091 out += 4; 1092 1093 // shift the 7th 4 outs 1094 ind[0] = in[8] >> shifts_7th[0]; 1095 ind[1] = in[8] >> shifts_7th[1]; 1096 ind[2] = (in[8] >> 30 | in[9] << 2) >> shifts_7th[2]; 1097 ind[3] = in[9] >> shifts_7th[3]; 1098 reg_shft = vld1q_u32(ind); 1099 results = vandq_u32(reg_shft, reg_masks); 1100 vst1q_u32(out, results); 1101 out += 4; 1102 1103 // shift the 8th 4 outs 1104 ind[0] = in[9] >> shifts_8th[0]; 1105 ind[1] = (in[9] >> 31 | in[10] << 1) >> shifts_8th[1]; 1106 ind[2] = in[10] >> shifts_8th[2]; 1107 ind[3] = in[10] >> shifts_8th[3]; 1108 reg_shft = vld1q_u32(ind); 1109 results = vandq_u32(reg_shft, reg_masks); 1110 vst1q_u32(out, results); 1111 out += 4; 1112 1113 in += 11; 1114 1115 return in; 1116 } 1117 1118 inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out) { 1119 uint32_t mask = 0xfff; 1120 uint32_t ind[4]; 1121 uint32_t shifts_1st[4] = {0, 12, 0, 4}; 1122 uint32_t shifts_2nd[4] = {16, 0, 8, 20}; 1123 uint32x4_t reg_shft, reg_masks; 1124 uint32x4_t results; 1125 1126 reg_masks = vdupq_n_u32(mask); 1127 1128 // shift the first 4 outs 1129 ind[0] = in[0] >> shifts_1st[0]; 1130 ind[1] = in[0] >> shifts_1st[1]; 1131 ind[2] = (in[0] >> 24 | in[1] << 8) >> shifts_1st[2]; 1132 ind[3] = in[1] >> shifts_1st[3]; 1133 reg_shft = vld1q_u32(ind); 1134 results = vandq_u32(reg_shft, reg_masks); 1135 vst1q_u32(out, results); 1136 out += 4; 1137 1138 // shift the 2nd 4 outs 1139 ind[0] = in[1] >> shifts_2nd[0]; 1140 ind[1] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[1]; 1141 ind[2] = in[2] >> shifts_2nd[2]; 1142 ind[3] = in[2] >> shifts_2nd[3]; 1143 reg_shft = vld1q_u32(ind); 1144 results = vandq_u32(reg_shft, reg_masks); 1145 vst1q_u32(out, results); 1146 out += 4; 1147 1148 // shift the 3rd 4 outs 1149 ind[0] = in[3] >> shifts_1st[0]; 1150 ind[1] = in[3] >> shifts_1st[1]; 1151 ind[2] = (in[3] >> 24 | in[4] << 8) >> shifts_1st[2]; 1152 ind[3] = in[4] >> shifts_1st[3]; 1153 reg_shft = vld1q_u32(ind); 1154 results = vandq_u32(reg_shft, reg_masks); 1155 vst1q_u32(out, results); 1156 out += 4; 1157 1158 // shift the 4th 4 outs 1159 ind[0] = in[4] >> shifts_2nd[0]; 1160 ind[1] = (in[4] >> 28 | in[5] << 4) >> shifts_2nd[1]; 1161 ind[2] = in[5] >> shifts_2nd[2]; 1162 ind[3] = in[5] >> shifts_2nd[3]; 1163 reg_shft = vld1q_u32(ind); 1164 results = vandq_u32(reg_shft, reg_masks); 1165 vst1q_u32(out, results); 1166 out += 4; 1167 1168 // shift the 5th 4 outs 1169 ind[0] = in[6] >> shifts_1st[0]; 1170 ind[1] = in[6] >> shifts_1st[1]; 1171 ind[2] = (in[6] >> 24 | in[7] << 8) >> shifts_1st[2]; 1172 ind[3] = in[7] >> shifts_1st[3]; 1173 reg_shft = vld1q_u32(ind); 1174 results = vandq_u32(reg_shft, reg_masks); 1175 vst1q_u32(out, results); 1176 out += 4; 1177 1178 // shift the 6th 4 outs 1179 ind[0] = in[7] >> shifts_2nd[0]; 1180 ind[1] = (in[7] >> 28 | in[8] << 4) >> shifts_2nd[1]; 1181 ind[2] = in[8] >> shifts_2nd[2]; 1182 ind[3] = in[8] >> shifts_2nd[3]; 1183 reg_shft = vld1q_u32(ind); 1184 results = vandq_u32(reg_shft, reg_masks); 1185 vst1q_u32(out, results); 1186 out += 4; 1187 1188 // shift the 7th 4 outs 1189 ind[0] = in[9] >> shifts_1st[0]; 1190 ind[1] = in[9] >> shifts_1st[1]; 1191 ind[2] = (in[9] >> 24 | in[10] << 8) >> shifts_1st[2]; 1192 ind[3] = in[10] >> shifts_1st[3]; 1193 reg_shft = vld1q_u32(ind); 1194 results = vandq_u32(reg_shft, reg_masks); 1195 vst1q_u32(out, results); 1196 out += 4; 1197 1198 // shift the 8th 4 outs 1199 ind[0] = in[10] >> shifts_2nd[0]; 1200 ind[1] = (in[10] >> 28 | in[11] << 4) >> shifts_2nd[1]; 1201 ind[2] = in[11] >> shifts_2nd[2]; 1202 ind[3] = in[11] >> shifts_2nd[3]; 1203 reg_shft = vld1q_u32(ind); 1204 results = vandq_u32(reg_shft, reg_masks); 1205 vst1q_u32(out, results); 1206 out += 4; 1207 1208 in += 12; 1209 1210 return in; 1211 } 1212 1213 inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out) { 1214 uint32_t mask = 0x1fff; 1215 uint32_t ind[4]; 1216 uint32_t shifts_1st[4] = {0, 13, 0, 7}; 1217 uint32_t shifts_2nd[4] = {0, 1, 14, 0}; 1218 uint32_t shifts_3rd[4] = {8, 0, 2, 15}; 1219 uint32_t shifts_4th[4] = {0, 9, 0, 3}; 1220 uint32_t shifts_5th[4] = {16, 0, 10, 0}; 1221 uint32_t shifts_6th[4] = {4, 17, 0, 11}; 1222 uint32_t shifts_7th[4] = {0, 5, 18, 0}; 1223 uint32_t shifts_8th[4] = {12, 0, 6, 19}; 1224 uint32x4_t reg_shft, reg_masks; 1225 uint32x4_t results; 1226 1227 reg_masks = vdupq_n_u32(mask); 1228 1229 // shift the first 4 outs 1230 ind[0] = in[0] >> shifts_1st[0]; 1231 ind[1] = in[0] >> shifts_1st[1]; 1232 ind[2] = (in[0] >> 26 | in[1] << 6) >> shifts_1st[2]; 1233 ind[3] = in[1] >> shifts_1st[3]; 1234 reg_shft = vld1q_u32(ind); 1235 results = vandq_u32(reg_shft, reg_masks); 1236 vst1q_u32(out, results); 1237 out += 4; 1238 1239 // shift the 2nd 4 outs 1240 ind[0] = (in[1] >> 20 | in[2] << 12) >> shifts_2nd[0]; 1241 ind[1] = in[2] >> shifts_2nd[1]; 1242 ind[2] = in[2] >> shifts_2nd[2]; 1243 ind[3] = (in[2] >> 27 | in[3] << 5) >> shifts_2nd[3]; 1244 reg_shft = vld1q_u32(ind); 1245 results = vandq_u32(reg_shft, reg_masks); 1246 vst1q_u32(out, results); 1247 out += 4; 1248 1249 // shift the 3rd 4 outs 1250 ind[0] = in[3] >> shifts_3rd[0]; 1251 ind[1] = (in[3] >> 21 | in[4] << 11) >> shifts_3rd[1]; 1252 ind[2] = in[4] >> shifts_3rd[2]; 1253 ind[3] = in[4] >> shifts_3rd[3]; 1254 reg_shft = vld1q_u32(ind); 1255 results = vandq_u32(reg_shft, reg_masks); 1256 vst1q_u32(out, results); 1257 out += 4; 1258 1259 // shift the 4th 4 outs 1260 ind[0] = (in[4] >> 28 | in[5] << 4) >> shifts_4th[0]; 1261 ind[1] = in[5] >> shifts_4th[1]; 1262 ind[2] = (in[5] >> 22 | in[6] << 10) >> shifts_4th[2]; 1263 ind[3] = in[6] >> shifts_4th[3]; 1264 reg_shft = vld1q_u32(ind); 1265 results = vandq_u32(reg_shft, reg_masks); 1266 vst1q_u32(out, results); 1267 out += 4; 1268 1269 // shift the 5th 4 outs 1270 ind[0] = in[6] >> shifts_5th[0]; 1271 ind[1] = (in[6] >> 29 | in[7] << 3) >> shifts_5th[1]; 1272 ind[2] = in[7] >> shifts_5th[2]; 1273 ind[3] = (in[7] >> 23 | in[8] << 9) >> shifts_5th[3]; 1274 reg_shft = vld1q_u32(ind); 1275 results = vandq_u32(reg_shft, reg_masks); 1276 vst1q_u32(out, results); 1277 out += 4; 1278 1279 // shift the 6th 4 outs 1280 ind[0] = in[8] >> shifts_6th[0]; 1281 ind[1] = in[8] >> shifts_6th[1]; 1282 ind[2] = (in[8] >> 30 | in[9] << 2) >> shifts_6th[2]; 1283 ind[3] = in[9] >> shifts_6th[3]; 1284 reg_shft = vld1q_u32(ind); 1285 results = vandq_u32(reg_shft, reg_masks); 1286 vst1q_u32(out, results); 1287 out += 4; 1288 1289 // shift the 7th 4 outs 1290 ind[0] = (in[9] >> 24 | in[10] << 8) >> shifts_7th[0]; 1291 ind[1] = in[10] >> shifts_7th[1]; 1292 ind[2] = in[10] >> shifts_7th[2]; 1293 ind[3] = (in[10] >> 31 | in[11] << 1) >> shifts_7th[3]; 1294 reg_shft = vld1q_u32(ind); 1295 results = vandq_u32(reg_shft, reg_masks); 1296 vst1q_u32(out, results); 1297 out += 4; 1298 1299 // shift the 8th 4 outs 1300 ind[0] = in[11] >> shifts_8th[0]; 1301 ind[1] = (in[11] >> 25 | in[12] << 7) >> shifts_8th[1]; 1302 ind[2] = in[12] >> shifts_8th[2]; 1303 ind[3] = in[12] >> shifts_8th[3]; 1304 reg_shft = vld1q_u32(ind); 1305 results = vandq_u32(reg_shft, reg_masks); 1306 vst1q_u32(out, results); 1307 out += 4; 1308 1309 in += 13; 1310 1311 return in; 1312 } 1313 1314 inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out) { 1315 uint32_t mask = 0x3fff; 1316 uint32_t ind[4]; 1317 uint32_t shifts_1st[4] = {0, 14, 0, 10}; 1318 uint32_t shifts_2nd[4] = {0, 6, 0, 2}; 1319 uint32_t shifts_3rd[4] = {16, 0, 12, 0}; 1320 uint32_t shifts_4th[4] = {8, 0, 4, 18}; 1321 uint32x4_t reg_shft, reg_masks; 1322 uint32x4_t results; 1323 1324 reg_masks = vdupq_n_u32(mask); 1325 1326 // shift the first 4 outs 1327 ind[0] = in[0] >> shifts_1st[0]; 1328 ind[1] = in[0] >> shifts_1st[1]; 1329 ind[2] = (in[0] >> 28 | in[1] << 4) >> shifts_1st[2]; 1330 ind[3] = in[1] >> shifts_1st[3]; 1331 reg_shft = vld1q_u32(ind); 1332 results = vandq_u32(reg_shft, reg_masks); 1333 vst1q_u32(out, results); 1334 out += 4; 1335 1336 // shift the 2nd 4 outs 1337 ind[0] = (in[1] >> 24 | in[2] << 8) >> shifts_2nd[0]; 1338 ind[1] = in[2] >> shifts_2nd[1]; 1339 ind[2] = (in[2] >> 20 | in[3] << 12) >> shifts_2nd[2]; 1340 ind[3] = in[3] >> shifts_2nd[3]; 1341 reg_shft = vld1q_u32(ind); 1342 results = vandq_u32(reg_shft, reg_masks); 1343 vst1q_u32(out, results); 1344 out += 4; 1345 1346 // shift the 3rd 4 outs 1347 ind[0] = in[3] >> shifts_3rd[0]; 1348 ind[1] = (in[3] >> 30 | in[4] << 2) >> shifts_3rd[1]; 1349 ind[2] = in[4] >> shifts_3rd[2]; 1350 ind[3] = (in[4] >> 26 | in[5] << 6) >> shifts_3rd[3]; 1351 reg_shft = vld1q_u32(ind); 1352 results = vandq_u32(reg_shft, reg_masks); 1353 vst1q_u32(out, results); 1354 out += 4; 1355 1356 // shift the 4th 4 outs 1357 ind[0] = in[5] >> shifts_4th[0]; 1358 ind[1] = (in[5] >> 22 | in[6] << 10) >> shifts_4th[1]; 1359 ind[2] = in[6] >> shifts_4th[2]; 1360 ind[3] = in[6] >> shifts_4th[3]; 1361 reg_shft = vld1q_u32(ind); 1362 results = vandq_u32(reg_shft, reg_masks); 1363 vst1q_u32(out, results); 1364 out += 4; 1365 1366 // shift the 5th 4 outs 1367 ind[0] = in[7] >> shifts_1st[0]; 1368 ind[1] = in[7] >> shifts_1st[1]; 1369 ind[2] = (in[7] >> 28 | in[8] << 4) >> shifts_1st[2]; 1370 ind[3] = in[8] >> shifts_1st[3]; 1371 reg_shft = vld1q_u32(ind); 1372 results = vandq_u32(reg_shft, reg_masks); 1373 vst1q_u32(out, results); 1374 out += 4; 1375 1376 // shift the 6th 4 outs 1377 ind[0] = (in[8] >> 24 | in[9] << 8) >> shifts_2nd[0]; 1378 ind[1] = in[9] >> shifts_2nd[1]; 1379 ind[2] = (in[9] >> 20 | in[10] << 12) >> shifts_2nd[2]; 1380 ind[3] = in[10] >> shifts_2nd[3]; 1381 reg_shft = vld1q_u32(ind); 1382 results = vandq_u32(reg_shft, reg_masks); 1383 vst1q_u32(out, results); 1384 out += 4; 1385 1386 // shift the 7th 4 outs 1387 ind[0] = in[10] >> shifts_3rd[0]; 1388 ind[1] = (in[10] >> 30 | in[11] << 2) >> shifts_3rd[1]; 1389 ind[2] = in[11] >> shifts_3rd[2]; 1390 ind[3] = (in[11] >> 26 | in[12] << 6) >> shifts_3rd[3]; 1391 reg_shft = vld1q_u32(ind); 1392 results = vandq_u32(reg_shft, reg_masks); 1393 vst1q_u32(out, results); 1394 out += 4; 1395 1396 // shift the 8th 4 outs 1397 ind[0] = in[12] >> shifts_4th[0]; 1398 ind[1] = (in[12] >> 22 | in[13] << 10) >> shifts_4th[1]; 1399 ind[2] = in[13] >> shifts_4th[2]; 1400 ind[3] = in[13] >> shifts_4th[3]; 1401 reg_shft = vld1q_u32(ind); 1402 results = vandq_u32(reg_shft, reg_masks); 1403 vst1q_u32(out, results); 1404 out += 4; 1405 1406 in += 14; 1407 1408 return in; 1409 } 1410 1411 inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out) { 1412 uint32_t mask = 0x7fff; 1413 uint32_t ind[4]; 1414 uint32_t shifts_1st[4] = {0, 15, 0, 13}; 1415 uint32_t shifts_2nd[4] = {0, 11, 0, 9}; 1416 uint32_t shifts_3rd[4] = {0, 7, 0, 5}; 1417 uint32_t shifts_4th[4] = {0, 3, 0, 1}; 1418 uint32_t shifts_5th[4] = {16, 0, 14, 0}; 1419 uint32_t shifts_6th[4] = {12, 0, 10, 0}; 1420 uint32_t shifts_7th[4] = {8, 0, 6, 0}; 1421 uint32_t shifts_8th[4] = {4, 0, 2, 17}; 1422 uint32x4_t reg_shft, reg_masks; 1423 uint32x4_t results; 1424 1425 reg_masks = vdupq_n_u32(mask); 1426 1427 // shift the first 4 outs 1428 ind[0] = in[0] >> shifts_1st[0]; 1429 ind[1] = in[0] >> shifts_1st[1]; 1430 ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[2]; 1431 ind[3] = in[1] >> shifts_1st[3]; 1432 reg_shft = vld1q_u32(ind); 1433 results = vandq_u32(reg_shft, reg_masks); 1434 vst1q_u32(out, results); 1435 out += 4; 1436 1437 // shift the 2nd 4 outs 1438 ind[0] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[0]; 1439 ind[1] = in[2] >> shifts_2nd[1]; 1440 ind[2] = (in[2] >> 26 | in[3] << 6) >> shifts_2nd[2]; 1441 ind[3] = in[3] >> shifts_2nd[3]; 1442 reg_shft = vld1q_u32(ind); 1443 results = vandq_u32(reg_shft, reg_masks); 1444 vst1q_u32(out, results); 1445 out += 4; 1446 1447 // shift the 3rd 4 outs 1448 ind[0] = (in[3] >> 24 | in[4] << 8) >> shifts_3rd[0]; 1449 ind[1] = in[4] >> shifts_3rd[1]; 1450 ind[2] = (in[4] >> 22 | in[5] << 10) >> shifts_3rd[2]; 1451 ind[3] = in[5] >> shifts_3rd[3]; 1452 reg_shft = vld1q_u32(ind); 1453 results = vandq_u32(reg_shft, reg_masks); 1454 vst1q_u32(out, results); 1455 out += 4; 1456 1457 // shift the 4th 4 outs 1458 ind[0] = (in[5] >> 20 | in[6] << 12) >> shifts_4th[0]; 1459 ind[1] = in[6] >> shifts_4th[1]; 1460 ind[2] = (in[6] >> 18 | in[7] << 14) >> shifts_4th[2]; 1461 ind[3] = in[7] >> shifts_4th[3]; 1462 reg_shft = vld1q_u32(ind); 1463 results = vandq_u32(reg_shft, reg_masks); 1464 vst1q_u32(out, results); 1465 out += 4; 1466 1467 // shift the 5th 4 outs 1468 ind[0] = in[7] >> shifts_5th[0]; 1469 ind[1] = (in[7] >> 31 | in[8] << 1) >> shifts_5th[1]; 1470 ind[2] = in[8] >> shifts_5th[2]; 1471 ind[3] = (in[8] >> 29 | in[9] << 3) >> shifts_5th[3]; 1472 reg_shft = vld1q_u32(ind); 1473 results = vandq_u32(reg_shft, reg_masks); 1474 vst1q_u32(out, results); 1475 out += 4; 1476 1477 // shift the 6th 4 outs 1478 ind[0] = in[9] >> shifts_6th[0]; 1479 ind[1] = (in[9] >> 27 | in[10] << 5) >> shifts_6th[1]; 1480 ind[2] = in[10] >> shifts_6th[2]; 1481 ind[3] = (in[10] >> 25 | in[11] << 7) >> shifts_6th[3]; 1482 reg_shft = vld1q_u32(ind); 1483 results = vandq_u32(reg_shft, reg_masks); 1484 vst1q_u32(out, results); 1485 out += 4; 1486 1487 // shift the 7th 4 outs 1488 ind[0] = in[11] >> shifts_7th[0]; 1489 ind[1] = (in[11] >> 23 | in[12] << 9) >> shifts_7th[1]; 1490 ind[2] = in[12] >> shifts_7th[2]; 1491 ind[3] = (in[12] >> 21 | in[13] << 11) >> shifts_7th[3]; 1492 reg_shft = vld1q_u32(ind); 1493 results = vandq_u32(reg_shft, reg_masks); 1494 vst1q_u32(out, results); 1495 out += 4; 1496 1497 // shift the 8th 4 outs 1498 ind[0] = in[13] >> shifts_8th[0]; 1499 ind[1] = (in[13] >> 19 | in[14] << 13) >> shifts_8th[1]; 1500 ind[2] = in[14] >> shifts_8th[2]; 1501 ind[3] = in[14] >> shifts_8th[3]; 1502 reg_shft = vld1q_u32(ind); 1503 results = vandq_u32(reg_shft, reg_masks); 1504 vst1q_u32(out, results); 1505 out += 4; 1506 1507 in += 15; 1508 1509 return in; 1510 } 1511 1512 inline static const uint32_t* unpack16_32_neon(const uint32_t* in, uint32_t* out) { 1513 uint32_t mask = 0xffff; 1514 uint32_t ind[4]; 1515 uint32_t shifts_1st[4] = {0, 16, 0, 16}; 1516 uint32x4_t reg_shft, reg_masks; 1517 uint32x4_t results; 1518 1519 reg_masks = vdupq_n_u32(mask); 1520 1521 // shift the first 4 outs 1522 ind[0] = in[0] >> shifts_1st[0]; 1523 ind[1] = in[0] >> shifts_1st[1]; 1524 ind[2] = in[1] >> shifts_1st[2]; 1525 ind[3] = in[1] >> shifts_1st[3]; 1526 reg_shft = vld1q_u32(ind); 1527 results = vandq_u32(reg_shft, reg_masks); 1528 vst1q_u32(out, results); 1529 out += 4; 1530 1531 // shift the 2nd 4 outs 1532 ind[0] = in[2] >> shifts_1st[0]; 1533 ind[1] = in[2] >> shifts_1st[1]; 1534 ind[2] = in[3] >> shifts_1st[2]; 1535 ind[3] = in[3] >> shifts_1st[3]; 1536 reg_shft = vld1q_u32(ind); 1537 results = vandq_u32(reg_shft, reg_masks); 1538 vst1q_u32(out, results); 1539 out += 4; 1540 1541 // shift the 3rd 4 outs 1542 ind[0] = in[4] >> shifts_1st[0]; 1543 ind[1] = in[4] >> shifts_1st[1]; 1544 ind[2] = in[5] >> shifts_1st[2]; 1545 ind[3] = in[5] >> shifts_1st[3]; 1546 reg_shft = vld1q_u32(ind); 1547 results = vandq_u32(reg_shft, reg_masks); 1548 vst1q_u32(out, results); 1549 out += 4; 1550 1551 // shift the 4th 4 outs 1552 ind[0] = in[6] >> shifts_1st[0]; 1553 ind[1] = in[6] >> shifts_1st[1]; 1554 ind[2] = in[7] >> shifts_1st[2]; 1555 ind[3] = in[7] >> shifts_1st[3]; 1556 reg_shft = vld1q_u32(ind); 1557 results = vandq_u32(reg_shft, reg_masks); 1558 vst1q_u32(out, results); 1559 out += 4; 1560 1561 // shift the 5th 4 outs 1562 ind[0] = in[8] >> shifts_1st[0]; 1563 ind[1] = in[8] >> shifts_1st[1]; 1564 ind[2] = in[9] >> shifts_1st[2]; 1565 ind[3] = in[9] >> shifts_1st[3]; 1566 reg_shft = vld1q_u32(ind); 1567 results = vandq_u32(reg_shft, reg_masks); 1568 vst1q_u32(out, results); 1569 out += 4; 1570 1571 // shift the 6th 4 outs 1572 ind[0] = in[10] >> shifts_1st[0]; 1573 ind[1] = in[10] >> shifts_1st[1]; 1574 ind[2] = in[11] >> shifts_1st[2]; 1575 ind[3] = in[11] >> shifts_1st[3]; 1576 reg_shft = vld1q_u32(ind); 1577 results = vandq_u32(reg_shft, reg_masks); 1578 vst1q_u32(out, results); 1579 out += 4; 1580 1581 // shift the 7th 4 outs 1582 ind[0] = in[12] >> shifts_1st[0]; 1583 ind[1] = in[12] >> shifts_1st[1]; 1584 ind[2] = in[13] >> shifts_1st[2]; 1585 ind[3] = in[13] >> shifts_1st[3]; 1586 reg_shft = vld1q_u32(ind); 1587 results = vandq_u32(reg_shft, reg_masks); 1588 vst1q_u32(out, results); 1589 out += 4; 1590 1591 // shift the 8th 4 outs 1592 ind[0] = in[14] >> shifts_1st[0]; 1593 ind[1] = in[14] >> shifts_1st[1]; 1594 ind[2] = in[15] >> shifts_1st[2]; 1595 ind[3] = in[15] >> shifts_1st[3]; 1596 reg_shft = vld1q_u32(ind); 1597 results = vandq_u32(reg_shft, reg_masks); 1598 vst1q_u32(out, results); 1599 out += 4; 1600 1601 in += 16; 1602 1603 return in; 1604 } 1605 1606 inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out) { 1607 uint32_t mask = 0x1ffff; 1608 uint32_t ind[4]; 1609 uint32_t shifts_1st[4] = {0, 0, 2, 0}; 1610 uint32_t shifts_2nd[4] = {4, 0, 6, 0}; 1611 uint32_t shifts_3rd[4] = {8, 0, 10, 0}; 1612 uint32_t shifts_4th[4] = {12, 0, 14, 0}; 1613 uint32_t shifts_5th[4] = {0, 1, 0, 3}; 1614 uint32_t shifts_6th[4] = {0, 5, 0, 7}; 1615 uint32_t shifts_7th[4] = {0, 9, 0, 11}; 1616 uint32_t shifts_8th[4] = {0, 13, 0, 15}; 1617 uint32x4_t reg_shft, reg_masks; 1618 uint32x4_t results; 1619 1620 reg_masks = vdupq_n_u32(mask); 1621 1622 // shift the first 4 outs 1623 ind[0] = in[0] >> shifts_1st[0]; 1624 ind[1] = (in[0] >> 17 | in[1] << 15) >> shifts_1st[1]; 1625 ind[2] = in[1] >> shifts_1st[2]; 1626 ind[3] = (in[1] >> 19 | in[2] << 13) >> shifts_1st[3]; 1627 reg_shft = vld1q_u32(ind); 1628 results = vandq_u32(reg_shft, reg_masks); 1629 vst1q_u32(out, results); 1630 out += 4; 1631 1632 // shift the 2nd 4 outs 1633 ind[0] = in[2] >> shifts_2nd[0]; 1634 ind[1] = (in[2] >> 21 | in[3] << 11) >> shifts_2nd[1]; 1635 ind[2] = in[3] >> shifts_2nd[2]; 1636 ind[3] = (in[3] >> 23 | in[4] << 9) >> shifts_2nd[3]; 1637 reg_shft = vld1q_u32(ind); 1638 results = vandq_u32(reg_shft, reg_masks); 1639 vst1q_u32(out, results); 1640 out += 4; 1641 1642 // shift the 3rd 4 outs 1643 ind[0] = in[4] >> shifts_3rd[0]; 1644 ind[1] = (in[4] >> 25 | in[5] << 7) >> shifts_3rd[1]; 1645 ind[2] = in[5] >> shifts_3rd[2]; 1646 ind[3] = (in[5] >> 27 | in[6] << 5) >> shifts_3rd[3]; 1647 reg_shft = vld1q_u32(ind); 1648 results = vandq_u32(reg_shft, reg_masks); 1649 vst1q_u32(out, results); 1650 out += 4; 1651 1652 // shift the 4th 4 outs 1653 ind[0] = in[6] >> shifts_4th[0]; 1654 ind[1] = (in[6] >> 29 | in[7] << 3) >> shifts_4th[1]; 1655 ind[2] = in[7] >> shifts_4th[2]; 1656 ind[3] = (in[7] >> 31 | in[8] << 1) >> shifts_4th[3]; 1657 reg_shft = vld1q_u32(ind); 1658 results = vandq_u32(reg_shft, reg_masks); 1659 vst1q_u32(out, results); 1660 out += 4; 1661 1662 // shift the 5th 4 outs 1663 ind[0] = (in[8] >> 16 | in[9] << 16) >> shifts_5th[0]; 1664 ind[1] = in[9] >> shifts_5th[1]; 1665 ind[2] = (in[9] >> 18 | in[10] << 14) >> shifts_5th[2]; 1666 ind[3] = in[10] >> shifts_5th[3]; 1667 reg_shft = vld1q_u32(ind); 1668 results = vandq_u32(reg_shft, reg_masks); 1669 vst1q_u32(out, results); 1670 out += 4; 1671 1672 // shift the 6th 4 outs 1673 ind[0] = (in[10] >> 20 | in[11] << 12) >> shifts_6th[0]; 1674 ind[1] = in[11] >> shifts_6th[1]; 1675 ind[2] = (in[11] >> 22 | in[12] << 10) >> shifts_6th[2]; 1676 ind[3] = in[12] >> shifts_6th[3]; 1677 reg_shft = vld1q_u32(ind); 1678 results = vandq_u32(reg_shft, reg_masks); 1679 vst1q_u32(out, results); 1680 out += 4; 1681 1682 // shift the 7th 4 outs 1683 ind[0] = (in[12] >> 24 | in[13] << 8) >> shifts_7th[0]; 1684 ind[1] = in[13] >> shifts_7th[1]; 1685 ind[2] = (in[13] >> 26 | in[14] << 6) >> shifts_7th[2]; 1686 ind[3] = in[14] >> shifts_7th[3]; 1687 reg_shft = vld1q_u32(ind); 1688 results = vandq_u32(reg_shft, reg_masks); 1689 vst1q_u32(out, results); 1690 out += 4; 1691 1692 // shift the 8th 4 outs 1693 ind[0] = (in[14] >> 28 | in[15] << 4) >> shifts_8th[0]; 1694 ind[1] = in[15] >> shifts_8th[1]; 1695 ind[2] = (in[15] >> 30 | in[16] << 2) >> shifts_8th[2]; 1696 ind[3] = in[16] >> shifts_8th[3]; 1697 reg_shft = vld1q_u32(ind); 1698 results = vandq_u32(reg_shft, reg_masks); 1699 vst1q_u32(out, results); 1700 out += 4; 1701 1702 in += 17; 1703 1704 return in; 1705 } 1706 1707 inline static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out) { 1708 uint32_t mask = 0x3ffff; 1709 uint32_t ind[4]; 1710 uint32_t shifts_1st[4] = {0, 0, 4, 0}; 1711 uint32_t shifts_2nd[4] = {8, 0, 12, 0}; 1712 uint32_t shifts_3rd[4] = {0, 2, 0, 6}; 1713 uint32_t shifts_4th[4] = {0, 10, 0, 14}; 1714 uint32x4_t reg_shft, reg_masks; 1715 uint32x4_t results; 1716 1717 reg_masks = vdupq_n_u32(mask); 1718 1719 // shift the first 4 outs 1720 ind[0] = in[0] >> shifts_1st[0]; 1721 ind[1] = (in[0] >> 18 | in[1] << 14) >> shifts_1st[1]; 1722 ind[2] = in[1] >> shifts_1st[2]; 1723 ind[3] = (in[1] >> 22 | in[2] << 10) >> shifts_1st[3]; 1724 reg_shft = vld1q_u32(ind); 1725 results = vandq_u32(reg_shft, reg_masks); 1726 vst1q_u32(out, results); 1727 out += 4; 1728 1729 // shift the 2nd 4 outs 1730 ind[0] = in[2] >> shifts_2nd[0]; 1731 ind[1] = (in[2] >> 26 | in[3] << 6) >> shifts_2nd[1]; 1732 ind[2] = in[3] >> shifts_2nd[2]; 1733 ind[3] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[3]; 1734 reg_shft = vld1q_u32(ind); 1735 results = vandq_u32(reg_shft, reg_masks); 1736 vst1q_u32(out, results); 1737 out += 4; 1738 1739 // shift the 3rd 4 outs 1740 ind[0] = (in[4] >> 16 | in[5] << 16) >> shifts_3rd[0]; 1741 ind[1] = in[5] >> shifts_3rd[1]; 1742 ind[2] = (in[5] >> 20 | in[6] << 12) >> shifts_3rd[2]; 1743 ind[3] = in[6] >> shifts_3rd[3]; 1744 reg_shft = vld1q_u32(ind); 1745 results = vandq_u32(reg_shft, reg_masks); 1746 vst1q_u32(out, results); 1747 out += 4; 1748 1749 // shift the 4th 4 outs 1750 ind[0] = (in[6] >> 24 | in[7] << 8) >> shifts_4th[0]; 1751 ind[1] = in[7] >> shifts_4th[1]; 1752 ind[2] = (in[7] >> 28 | in[8] << 4) >> shifts_4th[2]; 1753 ind[3] = in[8] >> shifts_4th[3]; 1754 reg_shft = vld1q_u32(ind); 1755 results = vandq_u32(reg_shft, reg_masks); 1756 vst1q_u32(out, results); 1757 out += 4; 1758 1759 // shift the 5th 4 outs 1760 ind[0] = in[9] >> shifts_1st[0]; 1761 ind[1] = (in[9] >> 18 | in[10] << 14) >> shifts_1st[1]; 1762 ind[2] = in[10] >> shifts_1st[2]; 1763 ind[3] = (in[10] >> 22 | in[11] << 10) >> shifts_1st[3]; 1764 reg_shft = vld1q_u32(ind); 1765 results = vandq_u32(reg_shft, reg_masks); 1766 vst1q_u32(out, results); 1767 out += 4; 1768 1769 // shift the 6th 4 outs 1770 ind[0] = in[11] >> shifts_2nd[0]; 1771 ind[1] = (in[11] >> 26 | in[12] << 6) >> shifts_2nd[1]; 1772 ind[2] = in[12] >> shifts_2nd[2]; 1773 ind[3] = (in[12] >> 30 | in[13] << 2) >> shifts_2nd[3]; 1774 reg_shft = vld1q_u32(ind); 1775 results = vandq_u32(reg_shft, reg_masks); 1776 vst1q_u32(out, results); 1777 out += 4; 1778 1779 // shift the 7th 4 outs 1780 ind[0] = (in[13] >> 16 | in[14] << 16) >> shifts_3rd[0]; 1781 ind[1] = in[14] >> shifts_3rd[1]; 1782 ind[2] = (in[14] >> 20 | in[15] << 12) >> shifts_3rd[2]; 1783 ind[3] = in[15] >> shifts_3rd[3]; 1784 reg_shft = vld1q_u32(ind); 1785 results = vandq_u32(reg_shft, reg_masks); 1786 vst1q_u32(out, results); 1787 out += 4; 1788 1789 // shift the 8th 4 outs 1790 ind[0] = (in[15] >> 24 | in[16] << 8) >> shifts_4th[0]; 1791 ind[1] = in[16] >> shifts_4th[1]; 1792 ind[2] = (in[16] >> 28 | in[17] << 4) >> shifts_4th[2]; 1793 ind[3] = in[17] >> shifts_4th[3]; 1794 reg_shft = vld1q_u32(ind); 1795 results = vandq_u32(reg_shft, reg_masks); 1796 vst1q_u32(out, results); 1797 out += 4; 1798 1799 in += 18; 1800 1801 return in; 1802 } 1803 1804 inline static const uint32_t* unpack19_32_neon(const uint32_t* in, uint32_t* out) { 1805 uint32_t mask = 0x7ffff; 1806 uint32_t ind[4]; 1807 uint32_t shifts_1st[4] = {0, 0, 6, 0}; 1808 uint32_t shifts_2nd[4] = {12, 0, 0, 5}; 1809 uint32_t shifts_3rd[4] = {0, 11, 0, 0}; 1810 uint32_t shifts_4th[4] = {4, 0, 10, 0}; 1811 uint32_t shifts_5th[4] = {0, 3, 0, 9}; 1812 uint32_t shifts_6th[4] = {0, 0, 2, 0}; 1813 uint32_t shifts_7th[4] = {8, 0, 0, 1}; 1814 uint32_t shifts_8th[4] = {0, 7, 0, 13}; 1815 uint32x4_t reg_shft, reg_masks; 1816 uint32x4_t results; 1817 1818 reg_masks = vdupq_n_u32(mask); 1819 1820 // shift the first 4 outs 1821 ind[0] = in[0] >> shifts_1st[0]; 1822 ind[1] = (in[0] >> 19 | in[1] << 13) >> shifts_1st[1]; 1823 ind[2] = in[1] >> shifts_1st[2]; 1824 ind[3] = (in[1] >> 25 | in[2] << 7) >> shifts_1st[3]; 1825 reg_shft = vld1q_u32(ind); 1826 results = vandq_u32(reg_shft, reg_masks); 1827 vst1q_u32(out, results); 1828 out += 4; 1829 1830 // shift the 2nd 4 outs 1831 ind[0] = in[2] >> shifts_2nd[0]; 1832 ind[1] = (in[2] >> 31 | in[3] << 1) >> shifts_2nd[1]; 1833 ind[2] = (in[3] >> 18 | in[4] << 14) >> shifts_2nd[2]; 1834 ind[3] = in[4] >> shifts_2nd[3]; 1835 reg_shft = vld1q_u32(ind); 1836 results = vandq_u32(reg_shft, reg_masks); 1837 vst1q_u32(out, results); 1838 out += 4; 1839 1840 // shift the 3rd 4 outs 1841 ind[0] = (in[4] >> 24 | in[5] << 8) >> shifts_3rd[0]; 1842 ind[1] = in[5] >> shifts_3rd[1]; 1843 ind[2] = (in[5] >> 30 | in[6] << 2) >> shifts_3rd[2]; 1844 ind[3] = (in[6] >> 17 | in[7] << 15) >> shifts_3rd[3]; 1845 reg_shft = vld1q_u32(ind); 1846 results = vandq_u32(reg_shft, reg_masks); 1847 vst1q_u32(out, results); 1848 out += 4; 1849 1850 // shift the 4th 4 outs 1851 ind[0] = in[7] >> shifts_4th[0]; 1852 ind[1] = (in[7] >> 23 | in[8] << 9) >> shifts_4th[1]; 1853 ind[2] = in[8] >> shifts_4th[2]; 1854 ind[3] = (in[8] >> 29 | in[9] << 3) >> shifts_4th[3]; 1855 reg_shft = vld1q_u32(ind); 1856 results = vandq_u32(reg_shft, reg_masks); 1857 vst1q_u32(out, results); 1858 out += 4; 1859 1860 // shift the 5th 4 outs 1861 ind[0] = (in[9] >> 16 | in[10] << 16) >> shifts_5th[0]; 1862 ind[1] = in[10] >> shifts_5th[1]; 1863 ind[2] = (in[10] >> 22 | in[11] << 10) >> shifts_5th[2]; 1864 ind[3] = in[11] >> shifts_5th[3]; 1865 reg_shft = vld1q_u32(ind); 1866 results = vandq_u32(reg_shft, reg_masks); 1867 vst1q_u32(out, results); 1868 out += 4; 1869 1870 // shift the 6th 4 outs 1871 ind[0] = (in[11] >> 28 | in[12] << 4) >> shifts_6th[0]; 1872 ind[1] = (in[12] >> 15 | in[13] << 17) >> shifts_6th[1]; 1873 ind[2] = in[13] >> shifts_6th[2]; 1874 ind[3] = (in[13] >> 21 | in[14] << 11) >> shifts_6th[3]; 1875 reg_shft = vld1q_u32(ind); 1876 results = vandq_u32(reg_shft, reg_masks); 1877 vst1q_u32(out, results); 1878 out += 4; 1879 1880 // shift the 7th 4 outs 1881 ind[0] = in[14] >> shifts_7th[0]; 1882 ind[1] = (in[14] >> 27 | in[15] << 5) >> shifts_7th[1]; 1883 ind[2] = (in[15] >> 14 | in[16] << 18) >> shifts_7th[2]; 1884 ind[3] = in[16] >> shifts_7th[3]; 1885 reg_shft = vld1q_u32(ind); 1886 results = vandq_u32(reg_shft, reg_masks); 1887 vst1q_u32(out, results); 1888 out += 4; 1889 1890 // shift the 8th 4 outs 1891 ind[0] = (in[16] >> 20 | in[17] << 12) >> shifts_8th[0]; 1892 ind[1] = in[17] >> shifts_8th[1]; 1893 ind[2] = (in[17] >> 26 | in[18] << 6) >> shifts_8th[2]; 1894 ind[3] = in[18] >> shifts_8th[3]; 1895 reg_shft = vld1q_u32(ind); 1896 results = vandq_u32(reg_shft, reg_masks); 1897 vst1q_u32(out, results); 1898 out += 4; 1899 1900 in += 19; 1901 1902 return in; 1903 } 1904 1905 inline static const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out) { 1906 uint32_t mask = 0xfffff; 1907 uint32_t ind[4]; 1908 uint32_t shifts_1st[4] = {0, 0, 8, 0}; 1909 uint32_t shifts_2nd[4] = {0, 4, 0, 12}; 1910 uint32x4_t reg_shft, reg_masks; 1911 uint32x4_t results; 1912 1913 reg_masks = vdupq_n_u32(mask); 1914 1915 // shift the first 4 outs 1916 ind[0] = in[0] >> shifts_1st[0]; 1917 ind[1] = (in[0] >> 20 | in[1] << 12) >> shifts_1st[1]; 1918 ind[2] = in[1] >> shifts_1st[2]; 1919 ind[3] = (in[1] >> 28 | in[2] << 4) >> shifts_1st[3]; 1920 reg_shft = vld1q_u32(ind); 1921 results = vandq_u32(reg_shft, reg_masks); 1922 vst1q_u32(out, results); 1923 out += 4; 1924 1925 // shift the 2nd 4 outs 1926 ind[0] = (in[2] >> 16 | in[3] << 16) >> shifts_2nd[0]; 1927 ind[1] = in[3] >> shifts_2nd[1]; 1928 ind[2] = (in[3] >> 24 | in[4] << 8) >> shifts_2nd[2]; 1929 ind[3] = in[4] >> shifts_2nd[3]; 1930 reg_shft = vld1q_u32(ind); 1931 results = vandq_u32(reg_shft, reg_masks); 1932 vst1q_u32(out, results); 1933 out += 4; 1934 1935 // shift the 3rd 4 outs 1936 ind[0] = in[5] >> shifts_1st[0]; 1937 ind[1] = (in[5] >> 20 | in[6] << 12) >> shifts_1st[1]; 1938 ind[2] = in[6] >> shifts_1st[2]; 1939 ind[3] = (in[6] >> 28 | in[7] << 4) >> shifts_1st[3]; 1940 reg_shft = vld1q_u32(ind); 1941 results = vandq_u32(reg_shft, reg_masks); 1942 vst1q_u32(out, results); 1943 out += 4; 1944 1945 // shift the 4th 4 outs 1946 ind[0] = (in[7] >> 16 | in[8] << 16) >> shifts_2nd[0]; 1947 ind[1] = in[8] >> shifts_2nd[1]; 1948 ind[2] = (in[8] >> 24 | in[9] << 8) >> shifts_2nd[2]; 1949 ind[3] = in[9] >> shifts_2nd[3]; 1950 reg_shft = vld1q_u32(ind); 1951 results = vandq_u32(reg_shft, reg_masks); 1952 vst1q_u32(out, results); 1953 out += 4; 1954 1955 // shift the 5th 4 outs 1956 ind[0] = in[10] >> shifts_1st[0]; 1957 ind[1] = (in[10] >> 20 | in[11] << 12) >> shifts_1st[1]; 1958 ind[2] = in[11] >> shifts_1st[2]; 1959 ind[3] = (in[11] >> 28 | in[12] << 4) >> shifts_1st[3]; 1960 reg_shft = vld1q_u32(ind); 1961 results = vandq_u32(reg_shft, reg_masks); 1962 vst1q_u32(out, results); 1963 out += 4; 1964 1965 // shift the 6th 4 outs 1966 ind[0] = (in[12] >> 16 | in[13] << 16) >> shifts_2nd[0]; 1967 ind[1] = in[13] >> shifts_2nd[1]; 1968 ind[2] = (in[13] >> 24 | in[14] << 8) >> shifts_2nd[2]; 1969 ind[3] = in[14] >> shifts_2nd[3]; 1970 reg_shft = vld1q_u32(ind); 1971 results = vandq_u32(reg_shft, reg_masks); 1972 vst1q_u32(out, results); 1973 out += 4; 1974 1975 // shift the 7th 4 outs 1976 ind[0] = in[15] >> shifts_1st[0]; 1977 ind[1] = (in[15] >> 20 | in[16] << 12) >> shifts_1st[1]; 1978 ind[2] = in[16] >> shifts_1st[2]; 1979 ind[3] = (in[16] >> 28 | in[17] << 4) >> shifts_1st[3]; 1980 reg_shft = vld1q_u32(ind); 1981 results = vandq_u32(reg_shft, reg_masks); 1982 vst1q_u32(out, results); 1983 out += 4; 1984 1985 // shift the 8th 4 outs 1986 ind[0] = (in[17] >> 16 | in[18] << 16) >> shifts_2nd[0]; 1987 ind[1] = in[18] >> shifts_2nd[1]; 1988 ind[2] = (in[18] >> 24 | in[19] << 8) >> shifts_2nd[2]; 1989 ind[3] = in[19] >> shifts_2nd[3]; 1990 reg_shft = vld1q_u32(ind); 1991 results = vandq_u32(reg_shft, reg_masks); 1992 vst1q_u32(out, results); 1993 out += 4; 1994 1995 in += 20; 1996 1997 return in; 1998 } 1999 2000 inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out) { 2001 uint32_t mask = 0x1fffff; 2002 uint32_t ind[4]; 2003 uint32_t shifts_1st[4] = {0, 0, 10, 0}; 2004 uint32_t shifts_2nd[4] = {0, 9, 0, 0}; 2005 uint32_t shifts_3rd[4] = {8, 0, 0, 7}; 2006 uint32_t shifts_4th[4] = {0, 0, 6, 0}; 2007 uint32_t shifts_5th[4] = {0, 5, 0, 0}; 2008 uint32_t shifts_6th[4] = {4, 0, 0, 3}; 2009 uint32_t shifts_7th[4] = {0, 0, 2, 0}; 2010 uint32_t shifts_8th[4] = {0, 1, 0, 11}; 2011 uint32x4_t reg_shft, reg_masks; 2012 uint32x4_t results; 2013 2014 reg_masks = vdupq_n_u32(mask); 2015 2016 // shift the first 4 outs 2017 ind[0] = in[0] >> shifts_1st[0]; 2018 ind[1] = (in[0] >> 21 | in[1] << 11) >> shifts_1st[1]; 2019 ind[2] = in[1] >> shifts_1st[2]; 2020 ind[3] = (in[1] >> 31 | in[2] << 1) >> shifts_1st[3]; 2021 reg_shft = vld1q_u32(ind); 2022 results = vandq_u32(reg_shft, reg_masks); 2023 vst1q_u32(out, results); 2024 out += 4; 2025 2026 // shift the 2nd 4 outs 2027 ind[0] = (in[2] >> 20 | in[3] << 12) >> shifts_2nd[0]; 2028 ind[1] = in[3] >> shifts_2nd[1]; 2029 ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[2]; 2030 ind[3] = (in[4] >> 19 | in[5] << 13) >> shifts_2nd[3]; 2031 reg_shft = vld1q_u32(ind); 2032 results = vandq_u32(reg_shft, reg_masks); 2033 vst1q_u32(out, results); 2034 out += 4; 2035 2036 // shift the 3rd 4 outs 2037 ind[0] = in[5] >> shifts_3rd[0]; 2038 ind[1] = (in[5] >> 29 | in[6] << 3) >> shifts_3rd[1]; 2039 ind[2] = (in[6] >> 18 | in[7] << 14) >> shifts_3rd[2]; 2040 ind[3] = in[7] >> shifts_3rd[3]; 2041 reg_shft = vld1q_u32(ind); 2042 results = vandq_u32(reg_shft, reg_masks); 2043 vst1q_u32(out, results); 2044 out += 4; 2045 2046 // shift the 4th 4 outs 2047 ind[0] = (in[7] >> 28 | in[8] << 4) >> shifts_4th[0]; 2048 ind[1] = (in[8] >> 17 | in[9] << 15) >> shifts_4th[1]; 2049 ind[2] = in[9] >> shifts_4th[2]; 2050 ind[3] = (in[9] >> 27 | in[10] << 5) >> shifts_4th[3]; 2051 reg_shft = vld1q_u32(ind); 2052 results = vandq_u32(reg_shft, reg_masks); 2053 vst1q_u32(out, results); 2054 out += 4; 2055 2056 // shift the 5th 4 outs 2057 ind[0] = (in[10] >> 16 | in[11] << 16) >> shifts_5th[0]; 2058 ind[1] = in[11] >> shifts_5th[1]; 2059 ind[2] = (in[11] >> 26 | in[12] << 6) >> shifts_5th[2]; 2060 ind[3] = (in[12] >> 15 | in[13] << 17) >> shifts_5th[3]; 2061 reg_shft = vld1q_u32(ind); 2062 results = vandq_u32(reg_shft, reg_masks); 2063 vst1q_u32(out, results); 2064 out += 4; 2065 2066 // shift the 6th 4 outs 2067 ind[0] = in[13] >> shifts_6th[0]; 2068 ind[1] = (in[13] >> 25 | in[14] << 7) >> shifts_6th[1]; 2069 ind[2] = (in[14] >> 14 | in[15] << 18) >> shifts_6th[2]; 2070 ind[3] = in[15] >> shifts_6th[3]; 2071 reg_shft = vld1q_u32(ind); 2072 results = vandq_u32(reg_shft, reg_masks); 2073 vst1q_u32(out, results); 2074 out += 4; 2075 2076 // shift the 7th 4 outs 2077 ind[0] = (in[15] >> 24 | in[16] << 8) >> shifts_7th[0]; 2078 ind[1] = (in[16] >> 13 | in[17] << 19) >> shifts_7th[1]; 2079 ind[2] = in[17] >> shifts_7th[2]; 2080 ind[3] = (in[17] >> 23 | in[18] << 9) >> shifts_7th[3]; 2081 reg_shft = vld1q_u32(ind); 2082 results = vandq_u32(reg_shft, reg_masks); 2083 vst1q_u32(out, results); 2084 out += 4; 2085 2086 // shift the 8th 4 outs 2087 ind[0] = (in[18] >> 12 | in[19] << 20) >> shifts_8th[0]; 2088 ind[1] = in[19] >> shifts_8th[1]; 2089 ind[2] = (in[19] >> 22 | in[20] << 10) >> shifts_8th[2]; 2090 ind[3] = in[20] >> shifts_8th[3]; 2091 reg_shft = vld1q_u32(ind); 2092 results = vandq_u32(reg_shft, reg_masks); 2093 vst1q_u32(out, results); 2094 out += 4; 2095 2096 in += 21; 2097 2098 return in; 2099 } 2100 2101 inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out) { 2102 uint32_t mask = 0x3fffff; 2103 uint32_t ind[4]; 2104 uint32_t shifts_1st[4] = {0, 0, 0, 2}; 2105 uint32_t shifts_2nd[4] = {0, 0, 4, 0}; 2106 uint32_t shifts_3rd[4] = {0, 6, 0, 0}; 2107 uint32_t shifts_4th[4] = {8, 0, 0, 10}; 2108 uint32x4_t reg_shft, reg_masks; 2109 uint32x4_t results; 2110 2111 reg_masks = vdupq_n_u32(mask); 2112 2113 // shift the first 4 outs 2114 ind[0] = in[0] >> shifts_1st[0]; 2115 ind[1] = (in[0] >> 22 | in[1] << 10) >> shifts_1st[1]; 2116 ind[2] = (in[1] >> 12 | in[2] << 20) >> shifts_1st[2]; 2117 ind[3] = in[2] >> shifts_1st[3]; 2118 reg_shft = vld1q_u32(ind); 2119 results = vandq_u32(reg_shft, reg_masks); 2120 vst1q_u32(out, results); 2121 out += 4; 2122 2123 // shift the 2nd 4 outs 2124 ind[0] = (in[2] >> 24 | in[3] << 8) >> shifts_2nd[0]; 2125 ind[1] = (in[3] >> 14 | in[4] << 18) >> shifts_2nd[1]; 2126 ind[2] = in[4] >> shifts_2nd[2]; 2127 ind[3] = (in[4] >> 26 | in[5] << 6) >> shifts_2nd[3]; 2128 reg_shft = vld1q_u32(ind); 2129 results = vandq_u32(reg_shft, reg_masks); 2130 vst1q_u32(out, results); 2131 out += 4; 2132 2133 // shift the 3rd 4 outs 2134 ind[0] = (in[5] >> 16 | in[6] << 16) >> shifts_3rd[0]; 2135 ind[1] = in[6] >> shifts_3rd[1]; 2136 ind[2] = (in[6] >> 28 | in[7] << 4) >> shifts_3rd[2]; 2137 ind[3] = (in[7] >> 18 | in[8] << 14) >> shifts_3rd[3]; 2138 reg_shft = vld1q_u32(ind); 2139 results = vandq_u32(reg_shft, reg_masks); 2140 vst1q_u32(out, results); 2141 out += 4; 2142 2143 // shift the 4th 4 outs 2144 ind[0] = in[8] >> shifts_4th[0]; 2145 ind[1] = (in[8] >> 30 | in[9] << 2) >> shifts_4th[1]; 2146 ind[2] = (in[9] >> 20 | in[10] << 12) >> shifts_4th[2]; 2147 ind[3] = in[10] >> shifts_4th[3]; 2148 reg_shft = vld1q_u32(ind); 2149 results = vandq_u32(reg_shft, reg_masks); 2150 vst1q_u32(out, results); 2151 out += 4; 2152 2153 // shift the 5th 4 outs 2154 ind[0] = in[11] >> shifts_1st[0]; 2155 ind[1] = (in[11] >> 22 | in[12] << 10) >> shifts_1st[1]; 2156 ind[2] = (in[12] >> 12 | in[13] << 20) >> shifts_1st[2]; 2157 ind[3] = in[13] >> shifts_1st[3]; 2158 reg_shft = vld1q_u32(ind); 2159 results = vandq_u32(reg_shft, reg_masks); 2160 vst1q_u32(out, results); 2161 out += 4; 2162 2163 // shift the 6th 4 outs 2164 ind[0] = (in[13] >> 24 | in[14] << 8) >> shifts_2nd[0]; 2165 ind[1] = (in[14] >> 14 | in[15] << 18) >> shifts_2nd[1]; 2166 ind[2] = in[15] >> shifts_2nd[2]; 2167 ind[3] = (in[15] >> 26 | in[16] << 6) >> shifts_2nd[3]; 2168 reg_shft = vld1q_u32(ind); 2169 results = vandq_u32(reg_shft, reg_masks); 2170 vst1q_u32(out, results); 2171 out += 4; 2172 2173 // shift the 7th 4 outs 2174 ind[0] = (in[16] >> 16 | in[17] << 16) >> shifts_3rd[0]; 2175 ind[1] = in[17] >> shifts_3rd[1]; 2176 ind[2] = (in[17] >> 28 | in[18] << 4) >> shifts_3rd[2]; 2177 ind[3] = (in[18] >> 18 | in[19] << 14) >> shifts_3rd[3]; 2178 reg_shft = vld1q_u32(ind); 2179 results = vandq_u32(reg_shft, reg_masks); 2180 vst1q_u32(out, results); 2181 out += 4; 2182 2183 // shift the 8th 4 outs 2184 ind[0] = in[19] >> shifts_4th[0]; 2185 ind[1] = (in[19] >> 30 | in[20] << 2) >> shifts_4th[1]; 2186 ind[2] = (in[20] >> 20 | in[21] << 12) >> shifts_4th[2]; 2187 ind[3] = in[21] >> shifts_4th[3]; 2188 reg_shft = vld1q_u32(ind); 2189 results = vandq_u32(reg_shft, reg_masks); 2190 vst1q_u32(out, results); 2191 out += 4; 2192 2193 in += 22; 2194 2195 return in; 2196 } 2197 2198 inline static const uint32_t* unpack23_32_neon(const uint32_t* in, uint32_t* out) { 2199 uint32_t mask = 0x7fffff; 2200 uint32_t ind[4]; 2201 uint32_t shifts_1st[4] = {0, 0, 0, 5}; 2202 uint32_t shifts_2nd[4] = {0, 0, 0, 1}; 2203 uint32_t shifts_3rd[4] = {0, 0, 6, 0}; 2204 uint32_t shifts_4th[4] = {0, 0, 2, 0}; 2205 uint32_t shifts_5th[4] = {0, 7, 0, 0}; 2206 uint32_t shifts_6th[4] = {0, 3, 0, 0}; 2207 uint32_t shifts_7th[4] = {8, 0, 0, 0}; 2208 uint32_t shifts_8th[4] = {4, 0, 0, 9}; 2209 uint32x4_t reg_shft, reg_masks; 2210 uint32x4_t results; 2211 2212 reg_masks = vdupq_n_u32(mask); 2213 2214 // shift the first 4 outs 2215 ind[0] = in[0] >> shifts_1st[0]; 2216 ind[1] = (in[0] >> 23 | in[1] << 9) >> shifts_1st[1]; 2217 ind[2] = (in[1] >> 14 | in[2] << 18) >> shifts_1st[2]; 2218 ind[3] = in[2] >> shifts_1st[3]; 2219 reg_shft = vld1q_u32(ind); 2220 results = vandq_u32(reg_shft, reg_masks); 2221 vst1q_u32(out, results); 2222 out += 4; 2223 2224 // shift the 2nd 4 outs 2225 ind[0] = (in[2] >> 28 | in[3] << 4) >> shifts_2nd[0]; 2226 ind[1] = (in[3] >> 19 | in[4] << 13) >> shifts_2nd[1]; 2227 ind[2] = (in[4] >> 10 | in[5] << 22) >> shifts_2nd[2]; 2228 ind[3] = in[5] >> shifts_2nd[3]; 2229 reg_shft = vld1q_u32(ind); 2230 results = vandq_u32(reg_shft, reg_masks); 2231 vst1q_u32(out, results); 2232 out += 4; 2233 2234 // shift the 3rd 4 outs 2235 ind[0] = (in[5] >> 24 | in[6] << 8) >> shifts_3rd[0]; 2236 ind[1] = (in[6] >> 15 | in[7] << 17) >> shifts_3rd[1]; 2237 ind[2] = in[7] >> shifts_3rd[2]; 2238 ind[3] = (in[7] >> 29 | in[8] << 3) >> shifts_3rd[3]; 2239 reg_shft = vld1q_u32(ind); 2240 results = vandq_u32(reg_shft, reg_masks); 2241 vst1q_u32(out, results); 2242 out += 4; 2243 2244 // shift the 4th 4 outs 2245 ind[0] = (in[8] >> 20 | in[9] << 12) >> shifts_4th[0]; 2246 ind[1] = (in[9] >> 11 | in[10] << 21) >> shifts_4th[1]; 2247 ind[2] = in[10] >> shifts_4th[2]; 2248 ind[3] = (in[10] >> 25 | in[11] << 7) >> shifts_4th[3]; 2249 reg_shft = vld1q_u32(ind); 2250 results = vandq_u32(reg_shft, reg_masks); 2251 vst1q_u32(out, results); 2252 out += 4; 2253 2254 // shift the 5th 4 outs 2255 ind[0] = (in[11] >> 16 | in[12] << 16) >> shifts_5th[0]; 2256 ind[1] = in[12] >> shifts_5th[1]; 2257 ind[2] = (in[12] >> 30 | in[13] << 2) >> shifts_5th[2]; 2258 ind[3] = (in[13] >> 21 | in[14] << 11) >> shifts_5th[3]; 2259 reg_shft = vld1q_u32(ind); 2260 results = vandq_u32(reg_shft, reg_masks); 2261 vst1q_u32(out, results); 2262 out += 4; 2263 2264 // shift the 6th 4 outs 2265 ind[0] = (in[14] >> 12 | in[15] << 20) >> shifts_6th[0]; 2266 ind[1] = in[15] >> shifts_6th[1]; 2267 ind[2] = (in[15] >> 26 | in[16] << 6) >> shifts_6th[2]; 2268 ind[3] = (in[16] >> 17 | in[17] << 15) >> shifts_6th[3]; 2269 reg_shft = vld1q_u32(ind); 2270 results = vandq_u32(reg_shft, reg_masks); 2271 vst1q_u32(out, results); 2272 out += 4; 2273 2274 // shift the 7th 4 outs 2275 ind[0] = in[17] >> shifts_7th[0]; 2276 ind[1] = (in[17] >> 31 | in[18] << 1) >> shifts_7th[1]; 2277 ind[2] = (in[18] >> 22 | in[19] << 10) >> shifts_7th[2]; 2278 ind[3] = (in[19] >> 13 | in[20] << 19) >> shifts_7th[3]; 2279 reg_shft = vld1q_u32(ind); 2280 results = vandq_u32(reg_shft, reg_masks); 2281 vst1q_u32(out, results); 2282 out += 4; 2283 2284 // shift the 8th 4 outs 2285 ind[0] = in[20] >> shifts_8th[0]; 2286 ind[1] = (in[20] >> 27 | in[21] << 5) >> shifts_8th[1]; 2287 ind[2] = (in[21] >> 18 | in[22] << 14) >> shifts_8th[2]; 2288 ind[3] = in[22] >> shifts_8th[3]; 2289 reg_shft = vld1q_u32(ind); 2290 results = vandq_u32(reg_shft, reg_masks); 2291 vst1q_u32(out, results); 2292 out += 4; 2293 2294 in += 23; 2295 2296 return in; 2297 } 2298 2299 inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out) { 2300 uint32_t mask = 0xffffff; 2301 uint32_t ind[4]; 2302 uint32_t shifts_1st[4] = {0, 0, 0, 8}; 2303 uint32x4_t reg_shft, reg_masks; 2304 uint32x4_t results; 2305 2306 reg_masks = vdupq_n_u32(mask); 2307 2308 // shift the first 4 outs 2309 ind[0] = in[0] >> shifts_1st[0]; 2310 ind[1] = (in[0] >> 24 | in[1] << 8) >> shifts_1st[1]; 2311 ind[2] = (in[1] >> 16 | in[2] << 16) >> shifts_1st[2]; 2312 ind[3] = in[2] >> shifts_1st[3]; 2313 reg_shft = vld1q_u32(ind); 2314 results = vandq_u32(reg_shft, reg_masks); 2315 vst1q_u32(out, results); 2316 out += 4; 2317 2318 // shift the 2nd 4 outs 2319 ind[0] = in[3] >> shifts_1st[0]; 2320 ind[1] = (in[3] >> 24 | in[4] << 8) >> shifts_1st[1]; 2321 ind[2] = (in[4] >> 16 | in[5] << 16) >> shifts_1st[2]; 2322 ind[3] = in[5] >> shifts_1st[3]; 2323 reg_shft = vld1q_u32(ind); 2324 results = vandq_u32(reg_shft, reg_masks); 2325 vst1q_u32(out, results); 2326 out += 4; 2327 2328 // shift the 3rd 4 outs 2329 ind[0] = in[6] >> shifts_1st[0]; 2330 ind[1] = (in[6] >> 24 | in[7] << 8) >> shifts_1st[1]; 2331 ind[2] = (in[7] >> 16 | in[8] << 16) >> shifts_1st[2]; 2332 ind[3] = in[8] >> shifts_1st[3]; 2333 reg_shft = vld1q_u32(ind); 2334 results = vandq_u32(reg_shft, reg_masks); 2335 vst1q_u32(out, results); 2336 out += 4; 2337 2338 // shift the 4th 4 outs 2339 ind[0] = in[9] >> shifts_1st[0]; 2340 ind[1] = (in[9] >> 24 | in[10] << 8) >> shifts_1st[1]; 2341 ind[2] = (in[10] >> 16 | in[11] << 16) >> shifts_1st[2]; 2342 ind[3] = in[11] >> shifts_1st[3]; 2343 reg_shft = vld1q_u32(ind); 2344 results = vandq_u32(reg_shft, reg_masks); 2345 vst1q_u32(out, results); 2346 out += 4; 2347 2348 // shift the 5th 4 outs 2349 ind[0] = in[12] >> shifts_1st[0]; 2350 ind[1] = (in[12] >> 24 | in[13] << 8) >> shifts_1st[1]; 2351 ind[2] = (in[13] >> 16 | in[14] << 16) >> shifts_1st[2]; 2352 ind[3] = in[14] >> shifts_1st[3]; 2353 reg_shft = vld1q_u32(ind); 2354 results = vandq_u32(reg_shft, reg_masks); 2355 vst1q_u32(out, results); 2356 out += 4; 2357 2358 // shift the 6th 4 outs 2359 ind[0] = in[15] >> shifts_1st[0]; 2360 ind[1] = (in[15] >> 24 | in[16] << 8) >> shifts_1st[1]; 2361 ind[2] = (in[16] >> 16 | in[17] << 16) >> shifts_1st[2]; 2362 ind[3] = in[17] >> shifts_1st[3]; 2363 reg_shft = vld1q_u32(ind); 2364 results = vandq_u32(reg_shft, reg_masks); 2365 vst1q_u32(out, results); 2366 out += 4; 2367 2368 // shift the 7th 4 outs 2369 ind[0] = in[18] >> shifts_1st[0]; 2370 ind[1] = (in[18] >> 24 | in[19] << 8) >> shifts_1st[1]; 2371 ind[2] = (in[19] >> 16 | in[20] << 16) >> shifts_1st[2]; 2372 ind[3] = in[20] >> shifts_1st[3]; 2373 reg_shft = vld1q_u32(ind); 2374 results = vandq_u32(reg_shft, reg_masks); 2375 vst1q_u32(out, results); 2376 out += 4; 2377 2378 // shift the 8th 4 outs 2379 ind[0] = in[21] >> shifts_1st[0]; 2380 ind[1] = (in[21] >> 24 | in[22] << 8) >> shifts_1st[1]; 2381 ind[2] = (in[22] >> 16 | in[23] << 16) >> shifts_1st[2]; 2382 ind[3] = in[23] >> shifts_1st[3]; 2383 reg_shft = vld1q_u32(ind); 2384 results = vandq_u32(reg_shft, reg_masks); 2385 vst1q_u32(out, results); 2386 out += 4; 2387 2388 in += 24; 2389 2390 return in; 2391 } 2392 2393 inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out) { 2394 uint32_t mask = 0x1ffffff; 2395 uint32_t ind[4]; 2396 uint32_t shifts_1st[4] = {0, 0, 0, 0}; 2397 uint32_t shifts_2nd[4] = {4, 0, 0, 0}; 2398 uint32_t shifts_3rd[4] = {0, 1, 0, 0}; 2399 uint32_t shifts_4th[4] = {0, 5, 0, 0}; 2400 uint32_t shifts_5th[4] = {0, 0, 2, 0}; 2401 uint32_t shifts_6th[4] = {0, 0, 6, 0}; 2402 uint32_t shifts_7th[4] = {0, 0, 0, 3}; 2403 uint32_t shifts_8th[4] = {0, 0, 0, 7}; 2404 uint32x4_t reg_shft, reg_masks; 2405 uint32x4_t results; 2406 2407 reg_masks = vdupq_n_u32(mask); 2408 2409 // shift the first 4 outs 2410 ind[0] = in[0] >> shifts_1st[0]; 2411 ind[1] = (in[0] >> 25 | in[1] << 7) >> shifts_1st[1]; 2412 ind[2] = (in[1] >> 18 | in[2] << 14) >> shifts_1st[2]; 2413 ind[3] = (in[2] >> 11 | in[3] << 21) >> shifts_1st[3]; 2414 reg_shft = vld1q_u32(ind); 2415 results = vandq_u32(reg_shft, reg_masks); 2416 vst1q_u32(out, results); 2417 out += 4; 2418 2419 // shift the 2nd 4 outs 2420 ind[0] = in[3] >> shifts_2nd[0]; 2421 ind[1] = (in[3] >> 29 | in[4] << 3) >> shifts_2nd[1]; 2422 ind[2] = (in[4] >> 22 | in[5] << 10) >> shifts_2nd[2]; 2423 ind[3] = (in[5] >> 15 | in[6] << 17) >> shifts_2nd[3]; 2424 reg_shft = vld1q_u32(ind); 2425 results = vandq_u32(reg_shft, reg_masks); 2426 vst1q_u32(out, results); 2427 out += 4; 2428 2429 // shift the 3rd 4 outs 2430 ind[0] = (in[6] >> 8 | in[7] << 24) >> shifts_3rd[0]; 2431 ind[1] = in[7] >> shifts_3rd[1]; 2432 ind[2] = (in[7] >> 26 | in[8] << 6) >> shifts_3rd[2]; 2433 ind[3] = (in[8] >> 19 | in[9] << 13) >> shifts_3rd[3]; 2434 reg_shft = vld1q_u32(ind); 2435 results = vandq_u32(reg_shft, reg_masks); 2436 vst1q_u32(out, results); 2437 out += 4; 2438 2439 // shift the 4th 4 outs 2440 ind[0] = (in[9] >> 12 | in[10] << 20) >> shifts_4th[0]; 2441 ind[1] = in[10] >> shifts_4th[1]; 2442 ind[2] = (in[10] >> 30 | in[11] << 2) >> shifts_4th[2]; 2443 ind[3] = (in[11] >> 23 | in[12] << 9) >> shifts_4th[3]; 2444 reg_shft = vld1q_u32(ind); 2445 results = vandq_u32(reg_shft, reg_masks); 2446 vst1q_u32(out, results); 2447 out += 4; 2448 2449 // shift the 5th 4 outs 2450 ind[0] = (in[12] >> 16 | in[13] << 16) >> shifts_5th[0]; 2451 ind[1] = (in[13] >> 9 | in[14] << 23) >> shifts_5th[1]; 2452 ind[2] = in[14] >> shifts_5th[2]; 2453 ind[3] = (in[14] >> 27 | in[15] << 5) >> shifts_5th[3]; 2454 reg_shft = vld1q_u32(ind); 2455 results = vandq_u32(reg_shft, reg_masks); 2456 vst1q_u32(out, results); 2457 out += 4; 2458 2459 // shift the 6th 4 outs 2460 ind[0] = (in[15] >> 20 | in[16] << 12) >> shifts_6th[0]; 2461 ind[1] = (in[16] >> 13 | in[17] << 19) >> shifts_6th[1]; 2462 ind[2] = in[17] >> shifts_6th[2]; 2463 ind[3] = (in[17] >> 31 | in[18] << 1) >> shifts_6th[3]; 2464 reg_shft = vld1q_u32(ind); 2465 results = vandq_u32(reg_shft, reg_masks); 2466 vst1q_u32(out, results); 2467 out += 4; 2468 2469 // shift the 7th 4 outs 2470 ind[0] = (in[18] >> 24 | in[19] << 8) >> shifts_7th[0]; 2471 ind[1] = (in[19] >> 17 | in[20] << 15) >> shifts_7th[1]; 2472 ind[2] = (in[20] >> 10 | in[21] << 22) >> shifts_7th[2]; 2473 ind[3] = in[21] >> shifts_7th[3]; 2474 reg_shft = vld1q_u32(ind); 2475 results = vandq_u32(reg_shft, reg_masks); 2476 vst1q_u32(out, results); 2477 out += 4; 2478 2479 // shift the 8th 4 outs 2480 ind[0] = (in[21] >> 28 | in[22] << 4) >> shifts_8th[0]; 2481 ind[1] = (in[22] >> 21 | in[23] << 11) >> shifts_8th[1]; 2482 ind[2] = (in[23] >> 14 | in[24] << 18) >> shifts_8th[2]; 2483 ind[3] = in[24] >> shifts_8th[3]; 2484 reg_shft = vld1q_u32(ind); 2485 results = vandq_u32(reg_shft, reg_masks); 2486 vst1q_u32(out, results); 2487 out += 4; 2488 2489 in += 25; 2490 2491 return in; 2492 } 2493 2494 inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out) { 2495 uint32_t mask = 0x3ffffff; 2496 uint32_t ind[4]; 2497 uint32_t shifts_1st[4] = {0, 0, 0, 0}; 2498 uint32_t shifts_2nd[4] = {0, 2, 0, 0}; 2499 uint32_t shifts_3rd[4] = {0, 0, 4, 0}; 2500 uint32_t shifts_4th[4] = {0, 0, 0, 6}; 2501 uint32x4_t reg_shft, reg_masks; 2502 uint32x4_t results; 2503 2504 reg_masks = vdupq_n_u32(mask); 2505 2506 // shift the first 4 outs 2507 ind[0] = in[0] >> shifts_1st[0]; 2508 ind[1] = (in[0] >> 26 | in[1] << 6) >> shifts_1st[1]; 2509 ind[2] = (in[1] >> 20 | in[2] << 12) >> shifts_1st[2]; 2510 ind[3] = (in[2] >> 14 | in[3] << 18) >> shifts_1st[3]; 2511 reg_shft = vld1q_u32(ind); 2512 results = vandq_u32(reg_shft, reg_masks); 2513 vst1q_u32(out, results); 2514 out += 4; 2515 2516 // shift the 2nd 4 outs 2517 ind[0] = (in[3] >> 8 | in[4] << 24) >> shifts_2nd[0]; 2518 ind[1] = in[4] >> shifts_2nd[1]; 2519 ind[2] = (in[4] >> 28 | in[5] << 4) >> shifts_2nd[2]; 2520 ind[3] = (in[5] >> 22 | in[6] << 10) >> shifts_2nd[3]; 2521 reg_shft = vld1q_u32(ind); 2522 results = vandq_u32(reg_shft, reg_masks); 2523 vst1q_u32(out, results); 2524 out += 4; 2525 2526 // shift the 3rd 4 outs 2527 ind[0] = (in[6] >> 16 | in[7] << 16) >> shifts_3rd[0]; 2528 ind[1] = (in[7] >> 10 | in[8] << 22) >> shifts_3rd[1]; 2529 ind[2] = in[8] >> shifts_3rd[2]; 2530 ind[3] = (in[8] >> 30 | in[9] << 2) >> shifts_3rd[3]; 2531 reg_shft = vld1q_u32(ind); 2532 results = vandq_u32(reg_shft, reg_masks); 2533 vst1q_u32(out, results); 2534 out += 4; 2535 2536 // shift the 4th 4 outs 2537 ind[0] = (in[9] >> 24 | in[10] << 8) >> shifts_4th[0]; 2538 ind[1] = (in[10] >> 18 | in[11] << 14) >> shifts_4th[1]; 2539 ind[2] = (in[11] >> 12 | in[12] << 20) >> shifts_4th[2]; 2540 ind[3] = in[12] >> shifts_4th[3]; 2541 reg_shft = vld1q_u32(ind); 2542 results = vandq_u32(reg_shft, reg_masks); 2543 vst1q_u32(out, results); 2544 out += 4; 2545 2546 // shift the 5th 4 outs 2547 ind[0] = in[13] >> shifts_1st[0]; 2548 ind[1] = (in[13] >> 26 | in[14] << 6) >> shifts_1st[1]; 2549 ind[2] = (in[14] >> 20 | in[15] << 12) >> shifts_1st[2]; 2550 ind[3] = (in[15] >> 14 | in[16] << 18) >> shifts_1st[3]; 2551 reg_shft = vld1q_u32(ind); 2552 results = vandq_u32(reg_shft, reg_masks); 2553 vst1q_u32(out, results); 2554 out += 4; 2555 2556 // shift the 6th 4 outs 2557 ind[0] = (in[16] >> 8 | in[17] << 24) >> shifts_2nd[0]; 2558 ind[1] = in[17] >> shifts_2nd[1]; 2559 ind[2] = (in[17] >> 28 | in[18] << 4) >> shifts_2nd[2]; 2560 ind[3] = (in[18] >> 22 | in[19] << 10) >> shifts_2nd[3]; 2561 reg_shft = vld1q_u32(ind); 2562 results = vandq_u32(reg_shft, reg_masks); 2563 vst1q_u32(out, results); 2564 out += 4; 2565 2566 // shift the 7th 4 outs 2567 ind[0] = (in[19] >> 16 | in[20] << 16) >> shifts_3rd[0]; 2568 ind[1] = (in[20] >> 10 | in[21] << 22) >> shifts_3rd[1]; 2569 ind[2] = in[21] >> shifts_3rd[2]; 2570 ind[3] = (in[21] >> 30 | in[22] << 2) >> shifts_3rd[3]; 2571 reg_shft = vld1q_u32(ind); 2572 results = vandq_u32(reg_shft, reg_masks); 2573 vst1q_u32(out, results); 2574 out += 4; 2575 2576 // shift the 8th 4 outs 2577 ind[0] = (in[22] >> 24 | in[23] << 8) >> shifts_4th[0]; 2578 ind[1] = (in[23] >> 18 | in[24] << 14) >> shifts_4th[1]; 2579 ind[2] = (in[24] >> 12 | in[25] << 20) >> shifts_4th[2]; 2580 ind[3] = in[25] >> shifts_4th[3]; 2581 reg_shft = vld1q_u32(ind); 2582 results = vandq_u32(reg_shft, reg_masks); 2583 vst1q_u32(out, results); 2584 out += 4; 2585 2586 in += 26; 2587 2588 return in; 2589 } 2590 2591 inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out) { 2592 uint32_t mask = 0x7ffffff; 2593 uint32_t ind[4]; 2594 uint32_t shifts_1st[4] = {0, 0, 0, 0}; 2595 uint32_t shifts_2nd[4] = {0, 0, 2, 0}; 2596 uint32_t shifts_3rd[4] = {0, 0, 0, 0}; 2597 uint32_t shifts_4th[4] = {4, 0, 0, 0}; 2598 uint32_t shifts_5th[4] = {0, 0, 0, 1}; 2599 uint32_t shifts_6th[4] = {0, 0, 0, 0}; 2600 uint32_t shifts_7th[4] = {0, 3, 0, 0}; 2601 uint32_t shifts_8th[4] = {0, 0, 0, 5}; 2602 uint32x4_t reg_shft, reg_masks; 2603 uint32x4_t results; 2604 2605 reg_masks = vdupq_n_u32(mask); 2606 2607 // shift the first 4 outs 2608 ind[0] = in[0] >> shifts_1st[0]; 2609 ind[1] = (in[0] >> 27 | in[1] << 5) >> shifts_1st[1]; 2610 ind[2] = (in[1] >> 22 | in[2] << 10) >> shifts_1st[2]; 2611 ind[3] = (in[2] >> 17 | in[3] << 15) >> shifts_1st[3]; 2612 reg_shft = vld1q_u32(ind); 2613 results = vandq_u32(reg_shft, reg_masks); 2614 vst1q_u32(out, results); 2615 out += 4; 2616 2617 // shift the 2nd 4 outs 2618 ind[0] = (in[3] >> 12 | in[4] << 20) >> shifts_2nd[0]; 2619 ind[1] = (in[4] >> 7 | in[5] << 25) >> shifts_2nd[1]; 2620 ind[2] = in[5] >> shifts_2nd[2]; 2621 ind[3] = (in[5] >> 29 | in[6] << 3) >> shifts_2nd[3]; 2622 reg_shft = vld1q_u32(ind); 2623 results = vandq_u32(reg_shft, reg_masks); 2624 vst1q_u32(out, results); 2625 out += 4; 2626 2627 // shift the 3rd 4 outs 2628 ind[0] = (in[6] >> 24 | in[7] << 8) >> shifts_3rd[0]; 2629 ind[1] = (in[7] >> 19 | in[8] << 13) >> shifts_3rd[1]; 2630 ind[2] = (in[8] >> 14 | in[9] << 18) >> shifts_3rd[2]; 2631 ind[3] = (in[9] >> 9 | in[10] << 23) >> shifts_3rd[3]; 2632 reg_shft = vld1q_u32(ind); 2633 results = vandq_u32(reg_shft, reg_masks); 2634 vst1q_u32(out, results); 2635 out += 4; 2636 2637 // shift the 4th 4 outs 2638 ind[0] = in[10] >> shifts_4th[0]; 2639 ind[1] = (in[10] >> 31 | in[11] << 1) >> shifts_4th[1]; 2640 ind[2] = (in[11] >> 26 | in[12] << 6) >> shifts_4th[2]; 2641 ind[3] = (in[12] >> 21 | in[13] << 11) >> shifts_4th[3]; 2642 reg_shft = vld1q_u32(ind); 2643 results = vandq_u32(reg_shft, reg_masks); 2644 vst1q_u32(out, results); 2645 out += 4; 2646 2647 // shift the 5th 4 outs 2648 ind[0] = (in[13] >> 16 | in[14] << 16) >> shifts_5th[0]; 2649 ind[1] = (in[14] >> 11 | in[15] << 21) >> shifts_5th[1]; 2650 ind[2] = (in[15] >> 6 | in[16] << 26) >> shifts_5th[2]; 2651 ind[3] = in[16] >> shifts_5th[3]; 2652 reg_shft = vld1q_u32(ind); 2653 results = vandq_u32(reg_shft, reg_masks); 2654 vst1q_u32(out, results); 2655 out += 4; 2656 2657 // shift the 6th 4 outs 2658 ind[0] = (in[16] >> 28 | in[17] << 4) >> shifts_6th[0]; 2659 ind[1] = (in[17] >> 23 | in[18] << 9) >> shifts_6th[1]; 2660 ind[2] = (in[18] >> 18 | in[19] << 14) >> shifts_6th[2]; 2661 ind[3] = (in[19] >> 13 | in[20] << 19) >> shifts_6th[3]; 2662 reg_shft = vld1q_u32(ind); 2663 results = vandq_u32(reg_shft, reg_masks); 2664 vst1q_u32(out, results); 2665 out += 4; 2666 2667 // shift the 7th 4 outs 2668 ind[0] = (in[20] >> 8 | in[21] << 24) >> shifts_7th[0]; 2669 ind[1] = in[21] >> shifts_7th[1]; 2670 ind[2] = (in[21] >> 30 | in[22] << 2) >> shifts_7th[2]; 2671 ind[3] = (in[22] >> 25 | in[23] << 7) >> shifts_7th[3]; 2672 reg_shft = vld1q_u32(ind); 2673 results = vandq_u32(reg_shft, reg_masks); 2674 vst1q_u32(out, results); 2675 out += 4; 2676 2677 // shift the 8th 4 outs 2678 ind[0] = (in[23] >> 20 | in[24] << 12) >> shifts_8th[0]; 2679 ind[1] = (in[24] >> 15 | in[25] << 17) >> shifts_8th[1]; 2680 ind[2] = (in[25] >> 10 | in[26] << 22) >> shifts_8th[2]; 2681 ind[3] = in[26] >> shifts_8th[3]; 2682 reg_shft = vld1q_u32(ind); 2683 results = vandq_u32(reg_shft, reg_masks); 2684 vst1q_u32(out, results); 2685 out += 4; 2686 2687 in += 27; 2688 2689 return in; 2690 } 2691 2692 inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out) { 2693 uint32_t mask = 0xfffffff; 2694 uint32_t ind[4]; 2695 uint32_t shifts_1st[4] = {0, 0, 0, 0}; 2696 uint32_t shifts_2nd[4] = {0, 0, 0, 4}; 2697 uint32x4_t reg_shft, reg_masks; 2698 uint32x4_t results; 2699 2700 reg_masks = vdupq_n_u32(mask); 2701 2702 // shift the first 4 outs 2703 ind[0] = in[0] >> shifts_1st[0]; 2704 ind[1] = (in[0] >> 28 | in[1] << 4) >> shifts_1st[1]; 2705 ind[2] = (in[1] >> 24 | in[2] << 8) >> shifts_1st[2]; 2706 ind[3] = (in[2] >> 20 | in[3] << 12) >> shifts_1st[3]; 2707 reg_shft = vld1q_u32(ind); 2708 results = vandq_u32(reg_shft, reg_masks); 2709 vst1q_u32(out, results); 2710 out += 4; 2711 2712 // shift the 2nd 4 outs 2713 ind[0] = (in[3] >> 16 | in[4] << 16) >> shifts_2nd[0]; 2714 ind[1] = (in[4] >> 12 | in[5] << 20) >> shifts_2nd[1]; 2715 ind[2] = (in[5] >> 8 | in[6] << 24) >> shifts_2nd[2]; 2716 ind[3] = in[6] >> shifts_2nd[3]; 2717 reg_shft = vld1q_u32(ind); 2718 results = vandq_u32(reg_shft, reg_masks); 2719 vst1q_u32(out, results); 2720 out += 4; 2721 2722 // shift the 3rd 4 outs 2723 ind[0] = in[7] >> shifts_1st[0]; 2724 ind[1] = (in[7] >> 28 | in[8] << 4) >> shifts_1st[1]; 2725 ind[2] = (in[8] >> 24 | in[9] << 8) >> shifts_1st[2]; 2726 ind[3] = (in[9] >> 20 | in[10] << 12) >> shifts_1st[3]; 2727 reg_shft = vld1q_u32(ind); 2728 results = vandq_u32(reg_shft, reg_masks); 2729 vst1q_u32(out, results); 2730 out += 4; 2731 2732 // shift the 4th 4 outs 2733 ind[0] = (in[10] >> 16 | in[11] << 16) >> shifts_2nd[0]; 2734 ind[1] = (in[11] >> 12 | in[12] << 20) >> shifts_2nd[1]; 2735 ind[2] = (in[12] >> 8 | in[13] << 24) >> shifts_2nd[2]; 2736 ind[3] = in[13] >> shifts_2nd[3]; 2737 reg_shft = vld1q_u32(ind); 2738 results = vandq_u32(reg_shft, reg_masks); 2739 vst1q_u32(out, results); 2740 out += 4; 2741 2742 // shift the 5th 4 outs 2743 ind[0] = in[14] >> shifts_1st[0]; 2744 ind[1] = (in[14] >> 28 | in[15] << 4) >> shifts_1st[1]; 2745 ind[2] = (in[15] >> 24 | in[16] << 8) >> shifts_1st[2]; 2746 ind[3] = (in[16] >> 20 | in[17] << 12) >> shifts_1st[3]; 2747 reg_shft = vld1q_u32(ind); 2748 results = vandq_u32(reg_shft, reg_masks); 2749 vst1q_u32(out, results); 2750 out += 4; 2751 2752 // shift the 6th 4 outs 2753 ind[0] = (in[17] >> 16 | in[18] << 16) >> shifts_2nd[0]; 2754 ind[1] = (in[18] >> 12 | in[19] << 20) >> shifts_2nd[1]; 2755 ind[2] = (in[19] >> 8 | in[20] << 24) >> shifts_2nd[2]; 2756 ind[3] = in[20] >> shifts_2nd[3]; 2757 reg_shft = vld1q_u32(ind); 2758 results = vandq_u32(reg_shft, reg_masks); 2759 vst1q_u32(out, results); 2760 out += 4; 2761 2762 // shift the 7th 4 outs 2763 ind[0] = in[21] >> shifts_1st[0]; 2764 ind[1] = (in[21] >> 28 | in[22] << 4) >> shifts_1st[1]; 2765 ind[2] = (in[22] >> 24 | in[23] << 8) >> shifts_1st[2]; 2766 ind[3] = (in[23] >> 20 | in[24] << 12) >> shifts_1st[3]; 2767 reg_shft = vld1q_u32(ind); 2768 results = vandq_u32(reg_shft, reg_masks); 2769 vst1q_u32(out, results); 2770 out += 4; 2771 2772 // shift the 8th 4 outs 2773 ind[0] = (in[24] >> 16 | in[25] << 16) >> shifts_2nd[0]; 2774 ind[1] = (in[25] >> 12 | in[26] << 20) >> shifts_2nd[1]; 2775 ind[2] = (in[26] >> 8 | in[27] << 24) >> shifts_2nd[2]; 2776 ind[3] = in[27] >> shifts_2nd[3]; 2777 reg_shft = vld1q_u32(ind); 2778 results = vandq_u32(reg_shft, reg_masks); 2779 vst1q_u32(out, results); 2780 out += 4; 2781 2782 in += 28; 2783 2784 return in; 2785 } 2786 2787 inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out) { 2788 uint32_t mask = 0x1fffffff; 2789 uint32_t ind[4]; 2790 uint32_t shifts_1st[4] = {0, 0, 0, 0}; 2791 uint32_t shifts_2nd[4] = {0, 0, 0, 0}; 2792 uint32_t shifts_3rd[4] = {0, 0, 2, 0}; 2793 uint32_t shifts_4th[4] = {0, 0, 0, 0}; 2794 uint32_t shifts_5th[4] = {0, 0, 0, 0}; 2795 uint32_t shifts_6th[4] = {0, 1, 0, 0}; 2796 uint32_t shifts_7th[4] = {0, 0, 0, 0}; 2797 uint32_t shifts_8th[4] = {0, 0, 0, 3}; 2798 uint32x4_t reg_shft, reg_masks; 2799 uint32x4_t results; 2800 2801 reg_masks = vdupq_n_u32(mask); 2802 2803 // shift the first 4 outs 2804 ind[0] = in[0] >> shifts_1st[0]; 2805 ind[1] = (in[0] >> 29 | in[1] << 3) >> shifts_1st[1]; 2806 ind[2] = (in[1] >> 26 | in[2] << 6) >> shifts_1st[2]; 2807 ind[3] = (in[2] >> 23 | in[3] << 9) >> shifts_1st[3]; 2808 reg_shft = vld1q_u32(ind); 2809 results = vandq_u32(reg_shft, reg_masks); 2810 vst1q_u32(out, results); 2811 out += 4; 2812 2813 // shift the 2nd 4 outs 2814 ind[0] = (in[3] >> 20 | in[4] << 12) >> shifts_2nd[0]; 2815 ind[1] = (in[4] >> 17 | in[5] << 15) >> shifts_2nd[1]; 2816 ind[2] = (in[5] >> 14 | in[6] << 18) >> shifts_2nd[2]; 2817 ind[3] = (in[6] >> 11 | in[7] << 21) >> shifts_2nd[3]; 2818 reg_shft = vld1q_u32(ind); 2819 results = vandq_u32(reg_shft, reg_masks); 2820 vst1q_u32(out, results); 2821 out += 4; 2822 2823 // shift the 3rd 4 outs 2824 ind[0] = (in[7] >> 8 | in[8] << 24) >> shifts_3rd[0]; 2825 ind[1] = (in[8] >> 5 | in[9] << 27) >> shifts_3rd[1]; 2826 ind[2] = in[9] >> shifts_3rd[2]; 2827 ind[3] = (in[9] >> 31 | in[10] << 1) >> shifts_3rd[3]; 2828 reg_shft = vld1q_u32(ind); 2829 results = vandq_u32(reg_shft, reg_masks); 2830 vst1q_u32(out, results); 2831 out += 4; 2832 2833 // shift the 4th 4 outs 2834 ind[0] = (in[10] >> 28 | in[11] << 4) >> shifts_4th[0]; 2835 ind[1] = (in[11] >> 25 | in[12] << 7) >> shifts_4th[1]; 2836 ind[2] = (in[12] >> 22 | in[13] << 10) >> shifts_4th[2]; 2837 ind[3] = (in[13] >> 19 | in[14] << 13) >> shifts_4th[3]; 2838 reg_shft = vld1q_u32(ind); 2839 results = vandq_u32(reg_shft, reg_masks); 2840 vst1q_u32(out, results); 2841 out += 4; 2842 2843 // shift the 5th 4 outs 2844 ind[0] = (in[14] >> 16 | in[15] << 16) >> shifts_5th[0]; 2845 ind[1] = (in[15] >> 13 | in[16] << 19) >> shifts_5th[1]; 2846 ind[2] = (in[16] >> 10 | in[17] << 22) >> shifts_5th[2]; 2847 ind[3] = (in[17] >> 7 | in[18] << 25) >> shifts_5th[3]; 2848 reg_shft = vld1q_u32(ind); 2849 results = vandq_u32(reg_shft, reg_masks); 2850 vst1q_u32(out, results); 2851 out += 4; 2852 2853 // shift the 6th 4 outs 2854 ind[0] = (in[18] >> 4 | in[19] << 28) >> shifts_6th[0]; 2855 ind[1] = in[19] >> shifts_6th[1]; 2856 ind[2] = (in[19] >> 30 | in[20] << 2) >> shifts_6th[2]; 2857 ind[3] = (in[20] >> 27 | in[21] << 5) >> shifts_6th[3]; 2858 reg_shft = vld1q_u32(ind); 2859 results = vandq_u32(reg_shft, reg_masks); 2860 vst1q_u32(out, results); 2861 out += 4; 2862 2863 // shift the 7th 4 outs 2864 ind[0] = (in[21] >> 24 | in[22] << 8) >> shifts_7th[0]; 2865 ind[1] = (in[22] >> 21 | in[23] << 11) >> shifts_7th[1]; 2866 ind[2] = (in[23] >> 18 | in[24] << 14) >> shifts_7th[2]; 2867 ind[3] = (in[24] >> 15 | in[25] << 17) >> shifts_7th[3]; 2868 reg_shft = vld1q_u32(ind); 2869 results = vandq_u32(reg_shft, reg_masks); 2870 vst1q_u32(out, results); 2871 out += 4; 2872 2873 // shift the 8th 4 outs 2874 ind[0] = (in[25] >> 12 | in[26] << 20) >> shifts_8th[0]; 2875 ind[1] = (in[26] >> 9 | in[27] << 23) >> shifts_8th[1]; 2876 ind[2] = (in[27] >> 6 | in[28] << 26) >> shifts_8th[2]; 2877 ind[3] = in[28] >> shifts_8th[3]; 2878 reg_shft = vld1q_u32(ind); 2879 results = vandq_u32(reg_shft, reg_masks); 2880 vst1q_u32(out, results); 2881 out += 4; 2882 2883 in += 29; 2884 2885 return in; 2886 } 2887 2888 inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out) { 2889 uint32_t mask = 0x3fffffff; 2890 uint32_t ind[4]; 2891 uint32_t shifts_1st[4] = {0, 0, 0, 0}; 2892 uint32_t shifts_2nd[4] = {0, 0, 0, 0}; 2893 uint32_t shifts_3rd[4] = {0, 0, 0, 0}; 2894 uint32_t shifts_4th[4] = {0, 0, 0, 2}; 2895 uint32x4_t reg_shft, reg_masks; 2896 uint32x4_t results; 2897 2898 reg_masks = vdupq_n_u32(mask); 2899 2900 // shift the first 4 outs 2901 ind[0] = in[0] >> shifts_1st[0]; 2902 ind[1] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[1]; 2903 ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_1st[2]; 2904 ind[3] = (in[2] >> 26 | in[3] << 6) >> shifts_1st[3]; 2905 reg_shft = vld1q_u32(ind); 2906 results = vandq_u32(reg_shft, reg_masks); 2907 vst1q_u32(out, results); 2908 out += 4; 2909 2910 // shift the 2nd 4 outs 2911 ind[0] = (in[3] >> 24 | in[4] << 8) >> shifts_2nd[0]; 2912 ind[1] = (in[4] >> 22 | in[5] << 10) >> shifts_2nd[1]; 2913 ind[2] = (in[5] >> 20 | in[6] << 12) >> shifts_2nd[2]; 2914 ind[3] = (in[6] >> 18 | in[7] << 14) >> shifts_2nd[3]; 2915 reg_shft = vld1q_u32(ind); 2916 results = vandq_u32(reg_shft, reg_masks); 2917 vst1q_u32(out, results); 2918 out += 4; 2919 2920 // shift the 3rd 4 outs 2921 ind[0] = (in[7] >> 16 | in[8] << 16) >> shifts_3rd[0]; 2922 ind[1] = (in[8] >> 14 | in[9] << 18) >> shifts_3rd[1]; 2923 ind[2] = (in[9] >> 12 | in[10] << 20) >> shifts_3rd[2]; 2924 ind[3] = (in[10] >> 10 | in[11] << 22) >> shifts_3rd[3]; 2925 reg_shft = vld1q_u32(ind); 2926 results = vandq_u32(reg_shft, reg_masks); 2927 vst1q_u32(out, results); 2928 out += 4; 2929 2930 // shift the 4th 4 outs 2931 ind[0] = (in[11] >> 8 | in[12] << 24) >> shifts_4th[0]; 2932 ind[1] = (in[12] >> 6 | in[13] << 26) >> shifts_4th[1]; 2933 ind[2] = (in[13] >> 4 | in[14] << 28) >> shifts_4th[2]; 2934 ind[3] = in[14] >> shifts_4th[3]; 2935 reg_shft = vld1q_u32(ind); 2936 results = vandq_u32(reg_shft, reg_masks); 2937 vst1q_u32(out, results); 2938 out += 4; 2939 2940 // shift the 5th 4 outs 2941 ind[0] = in[15] >> shifts_1st[0]; 2942 ind[1] = (in[15] >> 30 | in[16] << 2) >> shifts_1st[1]; 2943 ind[2] = (in[16] >> 28 | in[17] << 4) >> shifts_1st[2]; 2944 ind[3] = (in[17] >> 26 | in[18] << 6) >> shifts_1st[3]; 2945 reg_shft = vld1q_u32(ind); 2946 results = vandq_u32(reg_shft, reg_masks); 2947 vst1q_u32(out, results); 2948 out += 4; 2949 2950 // shift the 6th 4 outs 2951 ind[0] = (in[18] >> 24 | in[19] << 8) >> shifts_2nd[0]; 2952 ind[1] = (in[19] >> 22 | in[20] << 10) >> shifts_2nd[1]; 2953 ind[2] = (in[20] >> 20 | in[21] << 12) >> shifts_2nd[2]; 2954 ind[3] = (in[21] >> 18 | in[22] << 14) >> shifts_2nd[3]; 2955 reg_shft = vld1q_u32(ind); 2956 results = vandq_u32(reg_shft, reg_masks); 2957 vst1q_u32(out, results); 2958 out += 4; 2959 2960 // shift the 7th 4 outs 2961 ind[0] = (in[22] >> 16 | in[23] << 16) >> shifts_3rd[0]; 2962 ind[1] = (in[23] >> 14 | in[24] << 18) >> shifts_3rd[1]; 2963 ind[2] = (in[24] >> 12 | in[25] << 20) >> shifts_3rd[2]; 2964 ind[3] = (in[25] >> 10 | in[26] << 22) >> shifts_3rd[3]; 2965 reg_shft = vld1q_u32(ind); 2966 results = vandq_u32(reg_shft, reg_masks); 2967 vst1q_u32(out, results); 2968 out += 4; 2969 2970 // shift the 8th 4 outs 2971 ind[0] = (in[26] >> 8 | in[27] << 24) >> shifts_4th[0]; 2972 ind[1] = (in[27] >> 6 | in[28] << 26) >> shifts_4th[1]; 2973 ind[2] = (in[28] >> 4 | in[29] << 28) >> shifts_4th[2]; 2974 ind[3] = in[29] >> shifts_4th[3]; 2975 reg_shft = vld1q_u32(ind); 2976 results = vandq_u32(reg_shft, reg_masks); 2977 vst1q_u32(out, results); 2978 out += 4; 2979 2980 in += 30; 2981 2982 return in; 2983 } 2984 2985 inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out) { 2986 uint32_t mask = 0x7fffffff; 2987 uint32_t ind[4]; 2988 uint32_t shifts_1st[4] = {0, 0, 0, 0}; 2989 uint32_t shifts_2nd[4] = {0, 0, 0, 1}; 2990 uint32x4_t reg_shft, reg_masks; 2991 uint32x4_t results; 2992 2993 reg_masks = vdupq_n_u32(mask); 2994 2995 // shift the first 4 outs 2996 ind[0] = in[0] >> shifts_1st[0]; 2997 ind[1] = (in[0] >> 31 | in[1] << 1) >> shifts_1st[1]; 2998 ind[2] = (in[1] >> 30 | in[2] << 2) >> shifts_1st[2]; 2999 ind[3] = (in[2] >> 29 | in[3] << 3) >> shifts_1st[3]; 3000 reg_shft = vld1q_u32(ind); 3001 results = vandq_u32(reg_shft, reg_masks); 3002 vst1q_u32(out, results); 3003 out += 4; 3004 3005 // shift the 2nd 4 outs 3006 ind[0] = (in[3] >> 28 | in[4] << 4) >> shifts_1st[0]; 3007 ind[1] = (in[4] >> 27 | in[5] << 5) >> shifts_1st[1]; 3008 ind[2] = (in[5] >> 26 | in[6] << 6) >> shifts_1st[2]; 3009 ind[3] = (in[6] >> 25 | in[7] << 7) >> shifts_1st[3]; 3010 reg_shft = vld1q_u32(ind); 3011 results = vandq_u32(reg_shft, reg_masks); 3012 vst1q_u32(out, results); 3013 out += 4; 3014 3015 // shift the 3rd 4 outs 3016 ind[0] = (in[7] >> 24 | in[8] << 8) >> shifts_1st[0]; 3017 ind[1] = (in[8] >> 23 | in[9] << 9) >> shifts_1st[1]; 3018 ind[2] = (in[9] >> 22 | in[10] << 10) >> shifts_1st[2]; 3019 ind[3] = (in[10] >> 21 | in[11] << 11) >> shifts_1st[3]; 3020 reg_shft = vld1q_u32(ind); 3021 results = vandq_u32(reg_shft, reg_masks); 3022 vst1q_u32(out, results); 3023 out += 4; 3024 3025 // shift the 4th 4 outs 3026 ind[0] = (in[11] >> 20 | in[12] << 12) >> shifts_1st[0]; 3027 ind[1] = (in[12] >> 19 | in[13] << 13) >> shifts_1st[1]; 3028 ind[2] = (in[13] >> 18 | in[14] << 14) >> shifts_1st[2]; 3029 ind[3] = (in[14] >> 17 | in[15] << 15) >> shifts_1st[3]; 3030 reg_shft = vld1q_u32(ind); 3031 results = vandq_u32(reg_shft, reg_masks); 3032 vst1q_u32(out, results); 3033 out += 4; 3034 3035 // shift the 5th 4 outs 3036 ind[0] = (in[15] >> 16 | in[16] << 16) >> shifts_1st[0]; 3037 ind[1] = (in[16] >> 15 | in[17] << 17) >> shifts_1st[1]; 3038 ind[2] = (in[17] >> 14 | in[18] << 18) >> shifts_1st[2]; 3039 ind[3] = (in[18] >> 13 | in[19] << 19) >> shifts_1st[3]; 3040 reg_shft = vld1q_u32(ind); 3041 results = vandq_u32(reg_shft, reg_masks); 3042 vst1q_u32(out, results); 3043 out += 4; 3044 3045 // shift the 6th 4 outs 3046 ind[0] = (in[19] >> 12 | in[20] << 20) >> shifts_1st[0]; 3047 ind[1] = (in[20] >> 11 | in[21] << 21) >> shifts_1st[1]; 3048 ind[2] = (in[21] >> 10 | in[22] << 22) >> shifts_1st[2]; 3049 ind[3] = (in[22] >> 9 | in[23] << 23) >> shifts_1st[3]; 3050 reg_shft = vld1q_u32(ind); 3051 results = vandq_u32(reg_shft, reg_masks); 3052 vst1q_u32(out, results); 3053 out += 4; 3054 3055 // shift the 7th 4 outs 3056 ind[0] = (in[23] >> 8 | in[24] << 24) >> shifts_1st[0]; 3057 ind[1] = (in[24] >> 7 | in[25] << 25) >> shifts_1st[1]; 3058 ind[2] = (in[25] >> 6 | in[26] << 26) >> shifts_1st[2]; 3059 ind[3] = (in[26] >> 5 | in[27] << 27) >> shifts_1st[3]; 3060 reg_shft = vld1q_u32(ind); 3061 results = vandq_u32(reg_shft, reg_masks); 3062 vst1q_u32(out, results); 3063 out += 4; 3064 3065 // shift the 8th 4 outs 3066 ind[0] = (in[27] >> 4 | in[28] << 28) >> shifts_2nd[0]; 3067 ind[1] = (in[28] >> 3 | in[29] << 29) >> shifts_2nd[1]; 3068 ind[2] = (in[29] >> 2 | in[30] << 30) >> shifts_2nd[2]; 3069 ind[3] = in[30] >> shifts_2nd[3]; 3070 reg_shft = vld1q_u32(ind); 3071 results = vandq_u32(reg_shft, reg_masks); 3072 vst1q_u32(out, results); 3073 out += 4; 3074 3075 in += 31; 3076 3077 return in; 3078 } 3079 3080 inline const uint32_t* unpack32_32_neon(const uint32_t* in, uint32_t* out) { 3081 for (const uint32_t* end = out + 32; out != end; out++) { 3082 *out = *in; 3083 in++; 3084 } 3085 3086 return in; 3087 } 3088 3089 int unpack32_neon(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { 3090 batch_size = batch_size / 32 * 32; 3091 int num_loops = batch_size / 32; 3092 3093 switch (num_bits) { 3094 case 0: 3095 for (int i = 0; i < num_loops; ++i) in = unpack0_32_neon(in, out + i * 32); 3096 break; 3097 case 1: 3098 for (int i = 0; i < num_loops; ++i) in = unpack1_32_neon(in, out + i * 32); 3099 break; 3100 case 2: 3101 for (int i = 0; i < num_loops; ++i) in = unpack2_32_neon(in, out + i * 32); 3102 break; 3103 case 3: 3104 for (int i = 0; i < num_loops; ++i) in = unpack3_32_neon(in, out + i * 32); 3105 break; 3106 case 4: 3107 for (int i = 0; i < num_loops; ++i) in = unpack4_32_neon(in, out + i * 32); 3108 break; 3109 case 5: 3110 for (int i = 0; i < num_loops; ++i) in = unpack5_32_neon(in, out + i * 32); 3111 break; 3112 case 6: 3113 for (int i = 0; i < num_loops; ++i) in = unpack6_32_neon(in, out + i * 32); 3114 break; 3115 case 7: 3116 for (int i = 0; i < num_loops; ++i) in = unpack7_32_neon(in, out + i * 32); 3117 break; 3118 case 8: 3119 for (int i = 0; i < num_loops; ++i) in = unpack8_32_neon(in, out + i * 32); 3120 break; 3121 case 9: 3122 for (int i = 0; i < num_loops; ++i) in = unpack9_32_neon(in, out + i * 32); 3123 break; 3124 case 10: 3125 for (int i = 0; i < num_loops; ++i) in = unpack10_32_neon(in, out + i * 32); 3126 break; 3127 case 11: 3128 for (int i = 0; i < num_loops; ++i) in = unpack11_32_neon(in, out + i * 32); 3129 break; 3130 case 12: 3131 for (int i = 0; i < num_loops; ++i) in = unpack12_32_neon(in, out + i * 32); 3132 break; 3133 case 13: 3134 for (int i = 0; i < num_loops; ++i) in = unpack13_32_neon(in, out + i * 32); 3135 break; 3136 case 14: 3137 for (int i = 0; i < num_loops; ++i) in = unpack14_32_neon(in, out + i * 32); 3138 break; 3139 case 15: 3140 for (int i = 0; i < num_loops; ++i) in = unpack15_32_neon(in, out + i * 32); 3141 break; 3142 case 16: 3143 for (int i = 0; i < num_loops; ++i) in = unpack16_32_neon(in, out + i * 32); 3144 break; 3145 case 17: 3146 for (int i = 0; i < num_loops; ++i) in = unpack17_32_neon(in, out + i * 32); 3147 break; 3148 case 18: 3149 for (int i = 0; i < num_loops; ++i) in = unpack18_32_neon(in, out + i * 32); 3150 break; 3151 case 19: 3152 for (int i = 0; i < num_loops; ++i) in = unpack19_32_neon(in, out + i * 32); 3153 break; 3154 case 20: 3155 for (int i = 0; i < num_loops; ++i) in = unpack20_32_neon(in, out + i * 32); 3156 break; 3157 case 21: 3158 for (int i = 0; i < num_loops; ++i) in = unpack21_32_neon(in, out + i * 32); 3159 break; 3160 case 22: 3161 for (int i = 0; i < num_loops; ++i) in = unpack22_32_neon(in, out + i * 32); 3162 break; 3163 case 23: 3164 for (int i = 0; i < num_loops; ++i) in = unpack23_32_neon(in, out + i * 32); 3165 break; 3166 case 24: 3167 for (int i = 0; i < num_loops; ++i) in = unpack24_32_neon(in, out + i * 32); 3168 break; 3169 case 25: 3170 for (int i = 0; i < num_loops; ++i) in = unpack25_32_neon(in, out + i * 32); 3171 break; 3172 case 26: 3173 for (int i = 0; i < num_loops; ++i) in = unpack26_32_neon(in, out + i * 32); 3174 break; 3175 case 27: 3176 for (int i = 0; i < num_loops; ++i) in = unpack27_32_neon(in, out + i * 32); 3177 break; 3178 case 28: 3179 for (int i = 0; i < num_loops; ++i) in = unpack28_32_neon(in, out + i * 32); 3180 break; 3181 case 29: 3182 for (int i = 0; i < num_loops; ++i) in = unpack29_32_neon(in, out + i * 32); 3183 break; 3184 case 30: 3185 for (int i = 0; i < num_loops; ++i) in = unpack30_32_neon(in, out + i * 32); 3186 break; 3187 case 31: 3188 for (int i = 0; i < num_loops; ++i) in = unpack31_32_neon(in, out + i * 32); 3189 break; 3190 case 32: 3191 for (int i = 0; i < num_loops; ++i) in = unpack32_32_neon(in, out + i * 32); 3192 break; 3193 } 3194 3195 return batch_size; 3196 }