github.com/apache/arrow/go/v10@v10.0.1/parquet/internal/utils/_lib/bit_packing_neon.c

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdint.h>
#include <string.h>

#include "arm_neon.h"

inline const uint32_t* unpack0_32_neon(const uint32_t* in, uint32_t* out) {
  memset(out, 0x0, 32 * sizeof(*out));
  out += 32;

  return in;
}

inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 1, 2, 3};
  uint32_t shifts_2nd[4] = {4, 5, 6, 7};
  uint32_t shifts_3rd[4] = {8, 9, 10, 11};
  uint32_t shifts_4th[4] = {12, 13, 14, 15};
  uint32_t shifts_5th[4] = {16, 17, 18, 19};
  uint32_t shifts_6th[4] = {20, 21, 22, 23};
  uint32_t shifts_7th[4] = {24, 25, 26, 27};
  uint32_t shifts_8th[4] = {28, 29, 30, 31};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = in[0] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[0] >> shifts_2nd[0];
  ind[1] = in[0] >> shifts_2nd[1];
  ind[2] = in[0] >> shifts_2nd[2];
  ind[3] = in[0] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[0] >> shifts_3rd[0];
  ind[1] = in[0] >> shifts_3rd[1];
  ind[2] = in[0] >> shifts_3rd[2];
  ind[3] = in[0] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[0] >> shifts_4th[0];
  ind[1] = in[0] >> shifts_4th[1];
  ind[2] = in[0] >> shifts_4th[2];
  ind[3] = in[0] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[0] >> shifts_5th[0];
  ind[1] = in[0] >> shifts_5th[1];
  ind[2] = in[0] >> shifts_5th[2];
  ind[3] = in[0] >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[0] >> shifts_6th[0];
  ind[1] = in[0] >> shifts_6th[1];
  ind[2] = in[0] >> shifts_6th[2];
  ind[3] = in[0] >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[0] >> shifts_7th[0];
  ind[1] = in[0] >> shifts_7th[1];
  ind[2] = in[0] >> shifts_7th[2];
  ind[3] = in[0] >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[0] >> shifts_8th[0];
  ind[1] = in[0] >> shifts_8th[1];
  ind[2] = in[0] >> shifts_8th[2];
  ind[3] = in[0] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 1;

  return in;
}
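// Each unpackN_32_neon routine in this file follows the same recipe: decode
// 32 N-bit values in eight groups of four lanes.  Four right-shifted copies
// of the packed input words are staged in `ind`, loaded into a NEON register
// with vld1q_u32, masked down to N bits with vandq_u32, and stored with
// vst1q_u32.  As a point of reference only (not part of the original file;
// `unpack1_32_scalar` is a hypothetical name), this scalar sketch computes
// the same 32 outputs as unpack1_32_neon above:
static const uint32_t* unpack1_32_scalar(const uint32_t* in, uint32_t* out) {
  for (int i = 0; i < 32; ++i) {
    out[i] = (in[0] >> i) & 0x1;  // bit i of the word is output value i
  }
  return in + 1;
}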
inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 2, 4, 6};
  uint32_t shifts_2nd[4] = {8, 10, 12, 14};
  uint32_t shifts_3rd[4] = {16, 18, 20, 22};
  uint32_t shifts_4th[4] = {24, 26, 28, 30};

  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = in[0] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[0] >> shifts_2nd[0];
  ind[1] = in[0] >> shifts_2nd[1];
  ind[2] = in[0] >> shifts_2nd[2];
  ind[3] = in[0] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[0] >> shifts_3rd[0];
  ind[1] = in[0] >> shifts_3rd[1];
  ind[2] = in[0] >> shifts_3rd[2];
  ind[3] = in[0] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[0] >> shifts_4th[0];
  ind[1] = in[0] >> shifts_4th[1];
  ind[2] = in[0] >> shifts_4th[2];
  ind[3] = in[0] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[1] >> shifts_1st[0];
  ind[1] = in[1] >> shifts_1st[1];
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = in[1] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[1] >> shifts_2nd[0];
  ind[1] = in[1] >> shifts_2nd[1];
  ind[2] = in[1] >> shifts_2nd[2];
  ind[3] = in[1] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[1] >> shifts_3rd[0];
  ind[1] = in[1] >> shifts_3rd[1];
  ind[2] = in[1] >> shifts_3rd[2];
  ind[3] = in[1] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[1] >> shifts_4th[0];
  ind[1] = in[1] >> shifts_4th[1];
  ind[2] = in[1] >> shifts_4th[2];
  ind[3] = in[1] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 2;

  return in;
}
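// Starting with unpack3_32_neon, a value can straddle a 32-bit word
// boundary.  The stitch used throughout this file reassembles such a value
// before masking: if a value starts at bit p of in[i] and spills into
// in[i + 1], then (in[i] >> p | in[i + 1] << (32 - p)) brings its low bits
// down to bit 0 and splices the high bits on top.  For bit width 3, for
// example, the 11th value occupies bits 30-31 of in[0] plus bit 0 of in[1],
// giving (in[0] >> 30 | in[1] << 2).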
inline static const uint32_t* unpack3_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 3, 6, 9};
  uint32_t shifts_2nd[4] = {12, 15, 18, 21};
  uint32_t shifts_3rd[4] = {24, 27, 0, 1};
  uint32_t shifts_4th[4] = {4, 7, 10, 13};
  uint32_t shifts_5th[4] = {16, 19, 22, 25};
  uint32_t shifts_6th[4] = {28, 0, 2, 5};
  uint32_t shifts_7th[4] = {8, 11, 14, 17};
  uint32_t shifts_8th[4] = {20, 23, 26, 29};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = in[0] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[0] >> shifts_2nd[0];
  ind[1] = in[0] >> shifts_2nd[1];
  ind[2] = in[0] >> shifts_2nd[2];
  ind[3] = in[0] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[0] >> shifts_3rd[0];
  ind[1] = in[0] >> shifts_3rd[1];
  ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_3rd[2];
  ind[3] = in[1] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[1] >> shifts_4th[0];
  ind[1] = in[1] >> shifts_4th[1];
  ind[2] = in[1] >> shifts_4th[2];
  ind[3] = in[1] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[1] >> shifts_5th[0];
  ind[1] = in[1] >> shifts_5th[1];
  ind[2] = in[1] >> shifts_5th[2];
  ind[3] = in[1] >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[1] >> shifts_6th[0];
  ind[1] = (in[1] >> 31 | in[2] << 1) >> shifts_6th[1];
  ind[2] = in[2] >> shifts_6th[2];
  ind[3] = in[2] >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[2] >> shifts_7th[0];
  ind[1] = in[2] >> shifts_7th[1];
  ind[2] = in[2] >> shifts_7th[2];
  ind[3] = in[2] >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[2] >> shifts_8th[0];
  ind[1] = in[2] >> shifts_8th[1];
  ind[2] = in[2] >> shifts_8th[2];
  ind[3] = in[2] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 3;

  return in;
}

inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xf;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 4, 8, 12};
  uint32_t shifts_2nd[4] = {16, 20, 24, 28};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = in[0] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[0] >> shifts_2nd[0];
  ind[1] = in[0] >> shifts_2nd[1];
  ind[2] = in[0] >> shifts_2nd[2];
  ind[3] = in[0] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[1] >> shifts_1st[0];
  ind[1] = in[1] >> shifts_1st[1];
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = in[1] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[1] >> shifts_2nd[0];
  ind[1] = in[1] >> shifts_2nd[1];
  ind[2] = in[1] >> shifts_2nd[2];
  ind[3] = in[1] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[2] >> shifts_1st[0];
  ind[1] = in[2] >> shifts_1st[1];
  ind[2] = in[2] >> shifts_1st[2];
  ind[3] = in[2] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[2] >> shifts_2nd[0];
  ind[1] = in[2] >> shifts_2nd[1];
  ind[2] = in[2] >> shifts_2nd[2];
  ind[3] = in[2] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[3] >> shifts_1st[0];
  ind[1] = in[3] >> shifts_1st[1];
  ind[2] = in[3] >> shifts_1st[2];
  ind[3] = in[3] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[3] >> shifts_2nd[0];
  ind[1] = in[3] >> shifts_2nd[1];
  ind[2] = in[3] >> shifts_2nd[2];
  ind[3] = in[3] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 4;

  return in;
}
inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1f;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 5, 10, 15};
  uint32_t shifts_2nd[4] = {20, 25, 0, 3};
  uint32_t shifts_3rd[4] = {8, 13, 18, 23};
  uint32_t shifts_4th[4] = {0, 1, 6, 11};
  uint32_t shifts_5th[4] = {16, 21, 26, 0};
  uint32_t shifts_6th[4] = {4, 9, 14, 19};
  uint32_t shifts_7th[4] = {24, 0, 2, 7};
  uint32_t shifts_8th[4] = {12, 17, 22, 27};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = in[0] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[0] >> shifts_2nd[0];
  ind[1] = in[0] >> shifts_2nd[1];
  ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_2nd[2];
  ind[3] = in[1] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[1] >> shifts_3rd[0];
  ind[1] = in[1] >> shifts_3rd[1];
  ind[2] = in[1] >> shifts_3rd[2];
  ind[3] = in[1] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[1] >> 28 | in[2] << 4) >> shifts_4th[0];
  ind[1] = in[2] >> shifts_4th[1];
  ind[2] = in[2] >> shifts_4th[2];
  ind[3] = in[2] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[2] >> shifts_5th[0];
  ind[1] = in[2] >> shifts_5th[1];
  ind[2] = in[2] >> shifts_5th[2];
  ind[3] = (in[2] >> 31 | in[3] << 1) >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[3] >> shifts_6th[0];
  ind[1] = in[3] >> shifts_6th[1];
  ind[2] = in[3] >> shifts_6th[2];
  ind[3] = in[3] >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[3] >> shifts_7th[0];
  ind[1] = (in[3] >> 29 | in[4] << 3) >> shifts_7th[1];
  ind[2] = in[4] >> shifts_7th[2];
  ind[3] = in[4] >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[4] >> shifts_8th[0];
  ind[1] = in[4] >> shifts_8th[1];
  ind[2] = in[4] >> shifts_8th[2];
  ind[3] = in[4] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 5;

  return in;
}
inline static const uint32_t* unpack6_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3f;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 6, 12, 18};
  uint32_t shifts_2nd[4] = {24, 0, 4, 10};
  uint32_t shifts_3rd[4] = {16, 22, 0, 2};
  uint32_t shifts_4th[4] = {8, 14, 20, 26};

  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = in[0] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[0] >> shifts_2nd[0];
  ind[1] = (in[0] >> 30 | in[1] << 2) >> shifts_2nd[1];
  ind[2] = in[1] >> shifts_2nd[2];
  ind[3] = in[1] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[1] >> shifts_3rd[0];
  ind[1] = in[1] >> shifts_3rd[1];
  ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_3rd[2];
  ind[3] = in[2] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[2] >> shifts_4th[0];
  ind[1] = in[2] >> shifts_4th[1];
  ind[2] = in[2] >> shifts_4th[2];
  ind[3] = in[2] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[3] >> shifts_1st[0];
  ind[1] = in[3] >> shifts_1st[1];
  ind[2] = in[3] >> shifts_1st[2];
  ind[3] = in[3] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[3] >> shifts_2nd[0];
  ind[1] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[1];
  ind[2] = in[4] >> shifts_2nd[2];
  ind[3] = in[4] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[4] >> shifts_3rd[0];
  ind[1] = in[4] >> shifts_3rd[1];
  ind[2] = (in[4] >> 28 | in[5] << 4) >> shifts_3rd[2];
  ind[3] = in[5] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[5] >> shifts_4th[0];
  ind[1] = in[5] >> shifts_4th[1];
  ind[2] = in[5] >> shifts_4th[2];
  ind[3] = in[5] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 6;

  return in;
}
inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7f;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 7, 14, 21};
  uint32_t shifts_2nd[4] = {0, 3, 10, 17};
  uint32_t shifts_3rd[4] = {24, 0, 6, 13};
  uint32_t shifts_4th[4] = {20, 0, 2, 9};
  uint32_t shifts_5th[4] = {16, 23, 0, 5};
  uint32_t shifts_6th[4] = {12, 19, 0, 1};
  uint32_t shifts_7th[4] = {8, 15, 22, 0};
  uint32_t shifts_8th[4] = {4, 11, 18, 25};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = in[0] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[0] >> 28 | in[1] << 4) >> shifts_2nd[0];
  ind[1] = in[1] >> shifts_2nd[1];
  ind[2] = in[1] >> shifts_2nd[2];
  ind[3] = in[1] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[1] >> shifts_3rd[0];
  ind[1] = (in[1] >> 31 | in[2] << 1) >> shifts_3rd[1];
  ind[2] = in[2] >> shifts_3rd[2];
  ind[3] = in[2] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[2] >> shifts_4th[0];
  ind[1] = (in[2] >> 27 | in[3] << 5) >> shifts_4th[1];
  ind[2] = in[3] >> shifts_4th[2];
  ind[3] = in[3] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[3] >> shifts_5th[0];
  ind[1] = in[3] >> shifts_5th[1];
  ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_5th[2];
  ind[3] = in[4] >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[4] >> shifts_6th[0];
  ind[1] = in[4] >> shifts_6th[1];
  ind[2] = (in[4] >> 26 | in[5] << 6) >> shifts_6th[2];
  ind[3] = in[5] >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[5] >> shifts_7th[0];
  ind[1] = in[5] >> shifts_7th[1];
  ind[2] = in[5] >> shifts_7th[2];
  ind[3] = (in[5] >> 29 | in[6] << 3) >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[6] >> shifts_8th[0];
  ind[1] = in[6] >> shifts_8th[1];
  ind[2] = in[6] >> shifts_8th[2];
  ind[3] = in[6] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 7;

  return in;
}
inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 8, 16, 24};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = in[0] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[1] >> shifts_1st[0];
  ind[1] = in[1] >> shifts_1st[1];
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = in[1] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[2] >> shifts_1st[0];
  ind[1] = in[2] >> shifts_1st[1];
  ind[2] = in[2] >> shifts_1st[2];
  ind[3] = in[2] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[3] >> shifts_1st[0];
  ind[1] = in[3] >> shifts_1st[1];
  ind[2] = in[3] >> shifts_1st[2];
  ind[3] = in[3] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[4] >> shifts_1st[0];
  ind[1] = in[4] >> shifts_1st[1];
  ind[2] = in[4] >> shifts_1st[2];
  ind[3] = in[4] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[5] >> shifts_1st[0];
  ind[1] = in[5] >> shifts_1st[1];
  ind[2] = in[5] >> shifts_1st[2];
  ind[3] = in[5] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[6] >> shifts_1st[0];
  ind[1] = in[6] >> shifts_1st[1];
  ind[2] = in[6] >> shifts_1st[2];
  ind[3] = in[6] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[7] >> shifts_1st[0];
  ind[1] = in[7] >> shifts_1st[1];
  ind[2] = in[7] >> shifts_1st[2];
  ind[3] = in[7] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 8;

  return in;
}
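// Byte-aligned widths never straddle a word, so unpack8_32_neon above needs
// no stitching: word k holds exactly outputs 4k..4k+3.  A hypothetical
// alternative (not used by this file) is to widen the packed bytes with
// NEON widening moves instead of scalar shifts; a minimal sketch, assuming
// at least 8 readable bytes at `bytes`:
static void unpack8_4_widen(const uint8_t* bytes, uint32_t* out) {
  // load 8 bytes, widen the low 4 to uint16 and then to uint32
  uint16x4_t half = vget_low_u16(vmovl_u8(vld1_u8(bytes)));
  vst1q_u32(out, vmovl_u16(half));
}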
inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1ff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 9, 18, 0};
  uint32_t shifts_2nd[4] = {4, 13, 22, 0};
  uint32_t shifts_3rd[4] = {8, 17, 0, 3};
  uint32_t shifts_4th[4] = {12, 21, 0, 7};
  uint32_t shifts_5th[4] = {16, 0, 2, 11};
  uint32_t shifts_6th[4] = {20, 0, 6, 15};
  uint32_t shifts_7th[4] = {0, 1, 10, 19};
  uint32_t shifts_8th[4] = {0, 5, 14, 23};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = (in[0] >> 27 | in[1] << 5) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[1] >> shifts_2nd[0];
  ind[1] = in[1] >> shifts_2nd[1];
  ind[2] = in[1] >> shifts_2nd[2];
  ind[3] = (in[1] >> 31 | in[2] << 1) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[2] >> shifts_3rd[0];
  ind[1] = in[2] >> shifts_3rd[1];
  ind[2] = (in[2] >> 26 | in[3] << 6) >> shifts_3rd[2];
  ind[3] = in[3] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[3] >> shifts_4th[0];
  ind[1] = in[3] >> shifts_4th[1];
  ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_4th[2];
  ind[3] = in[4] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[4] >> shifts_5th[0];
  ind[1] = (in[4] >> 25 | in[5] << 7) >> shifts_5th[1];
  ind[2] = in[5] >> shifts_5th[2];
  ind[3] = in[5] >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[5] >> shifts_6th[0];
  ind[1] = (in[5] >> 29 | in[6] << 3) >> shifts_6th[1];
  ind[2] = in[6] >> shifts_6th[2];
  ind[3] = in[6] >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[6] >> 24 | in[7] << 8) >> shifts_7th[0];
  ind[1] = in[7] >> shifts_7th[1];
  ind[2] = in[7] >> shifts_7th[2];
  ind[3] = in[7] >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[7] >> 28 | in[8] << 4) >> shifts_8th[0];
  ind[1] = in[8] >> shifts_8th[1];
  ind[2] = in[8] >> shifts_8th[2];
  ind[3] = in[8] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 9;

  return in;
}
inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3ff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 10, 20, 0};
  uint32_t shifts_2nd[4] = {8, 18, 0, 6};
  uint32_t shifts_3rd[4] = {16, 0, 4, 14};
  uint32_t shifts_4th[4] = {0, 2, 12, 22};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[1] >> shifts_2nd[0];
  ind[1] = in[1] >> shifts_2nd[1];
  ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[2];
  ind[3] = in[2] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[2] >> shifts_3rd[0];
  ind[1] = (in[2] >> 26 | in[3] << 6) >> shifts_3rd[1];
  ind[2] = in[3] >> shifts_3rd[2];
  ind[3] = in[3] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[3] >> 24 | in[4] << 8) >> shifts_4th[0];
  ind[1] = in[4] >> shifts_4th[1];
  ind[2] = in[4] >> shifts_4th[2];
  ind[3] = in[4] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[5] >> shifts_1st[0];
  ind[1] = in[5] >> shifts_1st[1];
  ind[2] = in[5] >> shifts_1st[2];
  ind[3] = (in[5] >> 30 | in[6] << 2) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[6] >> shifts_2nd[0];
  ind[1] = in[6] >> shifts_2nd[1];
  ind[2] = (in[6] >> 28 | in[7] << 4) >> shifts_2nd[2];
  ind[3] = in[7] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[7] >> shifts_3rd[0];
  ind[1] = (in[7] >> 26 | in[8] << 6) >> shifts_3rd[1];
  ind[2] = in[8] >> shifts_3rd[2];
  ind[3] = in[8] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[8] >> 24 | in[9] << 8) >> shifts_4th[0];
  ind[1] = in[9] >> shifts_4th[1];
  ind[2] = in[9] >> shifts_4th[2];
  ind[3] = in[9] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 10;

  return in;
}
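// Each call decodes exactly 32 values, i.e. 32 * N bits = N 32-bit words,
// which is why every unpackN_32_neon ends with `in += N`.  unpack10_32_neon
// above, for instance, consumes ten words for its 320 packed bits and
// always writes 32 uint32_t outputs.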
inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7ff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 11, 0, 1};
  uint32_t shifts_2nd[4] = {12, 0, 2, 13};
  uint32_t shifts_3rd[4] = {0, 3, 14, 0};
  uint32_t shifts_4th[4] = {4, 15, 0, 5};
  uint32_t shifts_5th[4] = {16, 0, 6, 17};
  uint32_t shifts_6th[4] = {0, 7, 18, 0};
  uint32_t shifts_7th[4] = {8, 19, 0, 9};
  uint32_t shifts_8th[4] = {20, 0, 10, 21};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = (in[0] >> 22 | in[1] << 10) >> shifts_1st[2];
  ind[3] = in[1] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[1] >> shifts_2nd[0];
  ind[1] = (in[1] >> 23 | in[2] << 9) >> shifts_2nd[1];
  ind[2] = in[2] >> shifts_2nd[2];
  ind[3] = in[2] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[2] >> 24 | in[3] << 8) >> shifts_3rd[0];
  ind[1] = in[3] >> shifts_3rd[1];
  ind[2] = in[3] >> shifts_3rd[2];
  ind[3] = (in[3] >> 25 | in[4] << 7) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[4] >> shifts_4th[0];
  ind[1] = in[4] >> shifts_4th[1];
  ind[2] = (in[4] >> 26 | in[5] << 6) >> shifts_4th[2];
  ind[3] = in[5] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[5] >> shifts_5th[0];
  ind[1] = (in[5] >> 27 | in[6] << 5) >> shifts_5th[1];
  ind[2] = in[6] >> shifts_5th[2];
  ind[3] = in[6] >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[6] >> 28 | in[7] << 4) >> shifts_6th[0];
  ind[1] = in[7] >> shifts_6th[1];
  ind[2] = in[7] >> shifts_6th[2];
  ind[3] = (in[7] >> 29 | in[8] << 3) >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[8] >> shifts_7th[0];
  ind[1] = in[8] >> shifts_7th[1];
  ind[2] = (in[8] >> 30 | in[9] << 2) >> shifts_7th[2];
  ind[3] = in[9] >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[9] >> shifts_8th[0];
  ind[1] = (in[9] >> 31 | in[10] << 1) >> shifts_8th[1];
  ind[2] = in[10] >> shifts_8th[2];
  ind[3] = in[10] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 11;

  return in;
}
inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xfff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 12, 0, 4};
  uint32_t shifts_2nd[4] = {16, 0, 8, 20};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = (in[0] >> 24 | in[1] << 8) >> shifts_1st[2];
  ind[3] = in[1] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[1] >> shifts_2nd[0];
  ind[1] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[1];
  ind[2] = in[2] >> shifts_2nd[2];
  ind[3] = in[2] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[3] >> shifts_1st[0];
  ind[1] = in[3] >> shifts_1st[1];
  ind[2] = (in[3] >> 24 | in[4] << 8) >> shifts_1st[2];
  ind[3] = in[4] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[4] >> shifts_2nd[0];
  ind[1] = (in[4] >> 28 | in[5] << 4) >> shifts_2nd[1];
  ind[2] = in[5] >> shifts_2nd[2];
  ind[3] = in[5] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[6] >> shifts_1st[0];
  ind[1] = in[6] >> shifts_1st[1];
  ind[2] = (in[6] >> 24 | in[7] << 8) >> shifts_1st[2];
  ind[3] = in[7] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[7] >> shifts_2nd[0];
  ind[1] = (in[7] >> 28 | in[8] << 4) >> shifts_2nd[1];
  ind[2] = in[8] >> shifts_2nd[2];
  ind[3] = in[8] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[9] >> shifts_1st[0];
  ind[1] = in[9] >> shifts_1st[1];
  ind[2] = (in[9] >> 24 | in[10] << 8) >> shifts_1st[2];
  ind[3] = in[10] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[10] >> shifts_2nd[0];
  ind[1] = (in[10] >> 28 | in[11] << 4) >> shifts_2nd[1];
  ind[2] = in[11] >> shifts_2nd[2];
  ind[3] = in[11] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 12;

  return in;
}
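// For widths whose packing phase repeats quickly (12 bits: eight values per
// three words), two shift tables cover all eight groups; unpack12_32_neon
// above simply re-applies shifts_1st/shifts_2nd at word offsets 0, 3, 6
// and 9.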
inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1fff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 13, 0, 7};
  uint32_t shifts_2nd[4] = {0, 1, 14, 0};
  uint32_t shifts_3rd[4] = {8, 0, 2, 15};
  uint32_t shifts_4th[4] = {0, 9, 0, 3};
  uint32_t shifts_5th[4] = {16, 0, 10, 0};
  uint32_t shifts_6th[4] = {4, 17, 0, 11};
  uint32_t shifts_7th[4] = {0, 5, 18, 0};
  uint32_t shifts_8th[4] = {12, 0, 6, 19};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = (in[0] >> 26 | in[1] << 6) >> shifts_1st[2];
  ind[3] = in[1] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[1] >> 20 | in[2] << 12) >> shifts_2nd[0];
  ind[1] = in[2] >> shifts_2nd[1];
  ind[2] = in[2] >> shifts_2nd[2];
  ind[3] = (in[2] >> 27 | in[3] << 5) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[3] >> shifts_3rd[0];
  ind[1] = (in[3] >> 21 | in[4] << 11) >> shifts_3rd[1];
  ind[2] = in[4] >> shifts_3rd[2];
  ind[3] = in[4] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[4] >> 28 | in[5] << 4) >> shifts_4th[0];
  ind[1] = in[5] >> shifts_4th[1];
  ind[2] = (in[5] >> 22 | in[6] << 10) >> shifts_4th[2];
  ind[3] = in[6] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[6] >> shifts_5th[0];
  ind[1] = (in[6] >> 29 | in[7] << 3) >> shifts_5th[1];
  ind[2] = in[7] >> shifts_5th[2];
  ind[3] = (in[7] >> 23 | in[8] << 9) >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[8] >> shifts_6th[0];
  ind[1] = in[8] >> shifts_6th[1];
  ind[2] = (in[8] >> 30 | in[9] << 2) >> shifts_6th[2];
  ind[3] = in[9] >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[9] >> 24 | in[10] << 8) >> shifts_7th[0];
  ind[1] = in[10] >> shifts_7th[1];
  ind[2] = in[10] >> shifts_7th[2];
  ind[3] = (in[10] >> 31 | in[11] << 1) >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[11] >> shifts_8th[0];
  ind[1] = (in[11] >> 25 | in[12] << 7) >> shifts_8th[1];
  ind[2] = in[12] >> shifts_8th[2];
  ind[3] = in[12] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 13;

  return in;
}
inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3fff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 14, 0, 10};
  uint32_t shifts_2nd[4] = {0, 6, 0, 2};
  uint32_t shifts_3rd[4] = {16, 0, 12, 0};
  uint32_t shifts_4th[4] = {8, 0, 4, 18};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = (in[0] >> 28 | in[1] << 4) >> shifts_1st[2];
  ind[3] = in[1] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[1] >> 24 | in[2] << 8) >> shifts_2nd[0];
  ind[1] = in[2] >> shifts_2nd[1];
  ind[2] = (in[2] >> 20 | in[3] << 12) >> shifts_2nd[2];
  ind[3] = in[3] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[3] >> shifts_3rd[0];
  ind[1] = (in[3] >> 30 | in[4] << 2) >> shifts_3rd[1];
  ind[2] = in[4] >> shifts_3rd[2];
  ind[3] = (in[4] >> 26 | in[5] << 6) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[5] >> shifts_4th[0];
  ind[1] = (in[5] >> 22 | in[6] << 10) >> shifts_4th[1];
  ind[2] = in[6] >> shifts_4th[2];
  ind[3] = in[6] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[7] >> shifts_1st[0];
  ind[1] = in[7] >> shifts_1st[1];
  ind[2] = (in[7] >> 28 | in[8] << 4) >> shifts_1st[2];
  ind[3] = in[8] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[8] >> 24 | in[9] << 8) >> shifts_2nd[0];
  ind[1] = in[9] >> shifts_2nd[1];
  ind[2] = (in[9] >> 20 | in[10] << 12) >> shifts_2nd[2];
  ind[3] = in[10] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[10] >> shifts_3rd[0];
  ind[1] = (in[10] >> 30 | in[11] << 2) >> shifts_3rd[1];
  ind[2] = in[11] >> shifts_3rd[2];
  ind[3] = (in[11] >> 26 | in[12] << 6) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[12] >> shifts_4th[0];
  ind[1] = (in[12] >> 22 | in[13] << 10) >> shifts_4th[1];
  ind[2] = in[13] >> shifts_4th[2];
  ind[3] = in[13] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 14;

  return in;
}
inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7fff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 15, 0, 13};
  uint32_t shifts_2nd[4] = {0, 11, 0, 9};
  uint32_t shifts_3rd[4] = {0, 7, 0, 5};
  uint32_t shifts_4th[4] = {0, 3, 0, 1};
  uint32_t shifts_5th[4] = {16, 0, 14, 0};
  uint32_t shifts_6th[4] = {12, 0, 10, 0};
  uint32_t shifts_7th[4] = {8, 0, 6, 0};
  uint32_t shifts_8th[4] = {4, 0, 2, 17};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[2];
  ind[3] = in[1] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[0];
  ind[1] = in[2] >> shifts_2nd[1];
  ind[2] = (in[2] >> 26 | in[3] << 6) >> shifts_2nd[2];
  ind[3] = in[3] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[3] >> 24 | in[4] << 8) >> shifts_3rd[0];
  ind[1] = in[4] >> shifts_3rd[1];
  ind[2] = (in[4] >> 22 | in[5] << 10) >> shifts_3rd[2];
  ind[3] = in[5] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[5] >> 20 | in[6] << 12) >> shifts_4th[0];
  ind[1] = in[6] >> shifts_4th[1];
  ind[2] = (in[6] >> 18 | in[7] << 14) >> shifts_4th[2];
  ind[3] = in[7] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[7] >> shifts_5th[0];
  ind[1] = (in[7] >> 31 | in[8] << 1) >> shifts_5th[1];
  ind[2] = in[8] >> shifts_5th[2];
  ind[3] = (in[8] >> 29 | in[9] << 3) >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[9] >> shifts_6th[0];
  ind[1] = (in[9] >> 27 | in[10] << 5) >> shifts_6th[1];
  ind[2] = in[10] >> shifts_6th[2];
  ind[3] = (in[10] >> 25 | in[11] << 7) >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[11] >> shifts_7th[0];
  ind[1] = (in[11] >> 23 | in[12] << 9) >> shifts_7th[1];
  ind[2] = in[12] >> shifts_7th[2];
  ind[3] = (in[12] >> 21 | in[13] << 11) >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[13] >> shifts_8th[0];
  ind[1] = (in[13] >> 19 | in[14] << 13) >> shifts_8th[1];
  ind[2] = in[14] >> shifts_8th[2];
  ind[3] = in[14] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 15;

  return in;
}
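// A caller decoding a long bit-packed run chains these routines in blocks
// of 32 values.  A minimal driver sketch (hypothetical, not part of this
// file) for a fixed width of 15 bits:
static void unpack15_many(const uint32_t* in, uint32_t* out, int num_blocks) {
  for (int b = 0; b < num_blocks; ++b) {
    in = unpack15_32_neon(in, out);  // consumes 15 words, emits 32 values
    out += 32;
  }
}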
inline static const uint32_t* unpack16_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 16, 0, 16};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = in[1] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[2] >> shifts_1st[0];
  ind[1] = in[2] >> shifts_1st[1];
  ind[2] = in[3] >> shifts_1st[2];
  ind[3] = in[3] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[4] >> shifts_1st[0];
  ind[1] = in[4] >> shifts_1st[1];
  ind[2] = in[5] >> shifts_1st[2];
  ind[3] = in[5] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[6] >> shifts_1st[0];
  ind[1] = in[6] >> shifts_1st[1];
  ind[2] = in[7] >> shifts_1st[2];
  ind[3] = in[7] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[8] >> shifts_1st[0];
  ind[1] = in[8] >> shifts_1st[1];
  ind[2] = in[9] >> shifts_1st[2];
  ind[3] = in[9] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[10] >> shifts_1st[0];
  ind[1] = in[10] >> shifts_1st[1];
  ind[2] = in[11] >> shifts_1st[2];
  ind[3] = in[11] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[12] >> shifts_1st[0];
  ind[1] = in[12] >> shifts_1st[1];
  ind[2] = in[13] >> shifts_1st[2];
  ind[3] = in[13] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[14] >> shifts_1st[0];
  ind[1] = in[14] >> shifts_1st[1];
  ind[2] = in[15] >> shifts_1st[2];
  ind[3] = in[15] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 16;

  return in;
}
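// Above 16 bits a word can no longer hold two complete values, so from
// unpack17_32_neon onward roughly every other lane needs the two-word
// stitch.  As elsewhere in this file, a stitched lane always uses a
// per-lane shift of zero, because the stitch itself aligns the value.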
inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1ffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 2, 0};
  uint32_t shifts_2nd[4] = {4, 0, 6, 0};
  uint32_t shifts_3rd[4] = {8, 0, 10, 0};
  uint32_t shifts_4th[4] = {12, 0, 14, 0};
  uint32_t shifts_5th[4] = {0, 1, 0, 3};
  uint32_t shifts_6th[4] = {0, 5, 0, 7};
  uint32_t shifts_7th[4] = {0, 9, 0, 11};
  uint32_t shifts_8th[4] = {0, 13, 0, 15};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 17 | in[1] << 15) >> shifts_1st[1];
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = (in[1] >> 19 | in[2] << 13) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[2] >> shifts_2nd[0];
  ind[1] = (in[2] >> 21 | in[3] << 11) >> shifts_2nd[1];
  ind[2] = in[3] >> shifts_2nd[2];
  ind[3] = (in[3] >> 23 | in[4] << 9) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[4] >> shifts_3rd[0];
  ind[1] = (in[4] >> 25 | in[5] << 7) >> shifts_3rd[1];
  ind[2] = in[5] >> shifts_3rd[2];
  ind[3] = (in[5] >> 27 | in[6] << 5) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[6] >> shifts_4th[0];
  ind[1] = (in[6] >> 29 | in[7] << 3) >> shifts_4th[1];
  ind[2] = in[7] >> shifts_4th[2];
  ind[3] = (in[7] >> 31 | in[8] << 1) >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[8] >> 16 | in[9] << 16) >> shifts_5th[0];
  ind[1] = in[9] >> shifts_5th[1];
  ind[2] = (in[9] >> 18 | in[10] << 14) >> shifts_5th[2];
  ind[3] = in[10] >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[10] >> 20 | in[11] << 12) >> shifts_6th[0];
  ind[1] = in[11] >> shifts_6th[1];
  ind[2] = (in[11] >> 22 | in[12] << 10) >> shifts_6th[2];
  ind[3] = in[12] >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[12] >> 24 | in[13] << 8) >> shifts_7th[0];
  ind[1] = in[13] >> shifts_7th[1];
  ind[2] = (in[13] >> 26 | in[14] << 6) >> shifts_7th[2];
  ind[3] = in[14] >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[14] >> 28 | in[15] << 4) >> shifts_8th[0];
  ind[1] = in[15] >> shifts_8th[1];
  ind[2] = (in[15] >> 30 | in[16] << 2) >> shifts_8th[2];
  ind[3] = in[16] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 17;

  return in;
}
inline static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3ffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 4, 0};
  uint32_t shifts_2nd[4] = {8, 0, 12, 0};
  uint32_t shifts_3rd[4] = {0, 2, 0, 6};
  uint32_t shifts_4th[4] = {0, 10, 0, 14};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 18 | in[1] << 14) >> shifts_1st[1];
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = (in[1] >> 22 | in[2] << 10) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[2] >> shifts_2nd[0];
  ind[1] = (in[2] >> 26 | in[3] << 6) >> shifts_2nd[1];
  ind[2] = in[3] >> shifts_2nd[2];
  ind[3] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[4] >> 16 | in[5] << 16) >> shifts_3rd[0];
  ind[1] = in[5] >> shifts_3rd[1];
  ind[2] = (in[5] >> 20 | in[6] << 12) >> shifts_3rd[2];
  ind[3] = in[6] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[6] >> 24 | in[7] << 8) >> shifts_4th[0];
  ind[1] = in[7] >> shifts_4th[1];
  ind[2] = (in[7] >> 28 | in[8] << 4) >> shifts_4th[2];
  ind[3] = in[8] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[9] >> shifts_1st[0];
  ind[1] = (in[9] >> 18 | in[10] << 14) >> shifts_1st[1];
  ind[2] = in[10] >> shifts_1st[2];
  ind[3] = (in[10] >> 22 | in[11] << 10) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[11] >> shifts_2nd[0];
  ind[1] = (in[11] >> 26 | in[12] << 6) >> shifts_2nd[1];
  ind[2] = in[12] >> shifts_2nd[2];
  ind[3] = (in[12] >> 30 | in[13] << 2) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[13] >> 16 | in[14] << 16) >> shifts_3rd[0];
  ind[1] = in[14] >> shifts_3rd[1];
  ind[2] = (in[14] >> 20 | in[15] << 12) >> shifts_3rd[2];
  ind[3] = in[15] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[15] >> 24 | in[16] << 8) >> shifts_4th[0];
  ind[1] = in[16] >> shifts_4th[1];
  ind[2] = (in[16] >> 28 | in[17] << 4) >> shifts_4th[2];
  ind[3] = in[17] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 18;

  return in;
}

inline static const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xfffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 8, 0};
  uint32_t shifts_2nd[4] = {0, 4, 0, 12};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 20 | in[1] << 12) >> shifts_1st[1];
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = (in[1] >> 28 | in[2] << 4) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[2] >> 16 | in[3] << 16) >> shifts_2nd[0];
  ind[1] = in[3] >> shifts_2nd[1];
  ind[2] = (in[3] >> 24 | in[4] << 8) >> shifts_2nd[2];
  ind[3] = in[4] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[5] >> shifts_1st[0];
  ind[1] = (in[5] >> 20 | in[6] << 12) >> shifts_1st[1];
  ind[2] = in[6] >> shifts_1st[2];
  ind[3] = (in[6] >> 28 | in[7] << 4) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[7] >> 16 | in[8] << 16) >> shifts_2nd[0];
  ind[1] = in[8] >> shifts_2nd[1];
  ind[2] = (in[8] >> 24 | in[9] << 8) >> shifts_2nd[2];
  ind[3] = in[9] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[10] >> shifts_1st[0];
  ind[1] = (in[10] >> 20 | in[11] << 12) >> shifts_1st[1];
  ind[2] = in[11] >> shifts_1st[2];
  ind[3] = (in[11] >> 28 | in[12] << 4) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[12] >> 16 | in[13] << 16) >> shifts_2nd[0];
  ind[1] = in[13] >> shifts_2nd[1];
  ind[2] = (in[13] >> 24 | in[14] << 8) >> shifts_2nd[2];
  ind[3] = in[14] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[15] >> shifts_1st[0];
  ind[1] = (in[15] >> 20 | in[16] << 12) >> shifts_1st[1];
  ind[2] = in[16] >> shifts_1st[2];
  ind[3] = (in[16] >> 28 | in[17] << 4) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[17] >> 16 | in[18] << 16) >> shifts_2nd[0];
  ind[1] = in[18] >> shifts_2nd[1];
  ind[2] = (in[18] >> 24 | in[19] << 8) >> shifts_2nd[2];
  ind[3] = in[19] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 20;

  return in;
}
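
// Note: 8 values x 20 bits = 160 bits = exactly 5 words, so the bit layout
// in unpack20_32_neon repeats after every 8 outputs. That is why it gets by
// with two shift tables reused across all eight groups, while odd widths
// such as 19 and 21 need a distinct table per group.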

inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1fffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 10, 0};
  uint32_t shifts_2nd[4] = {0, 9, 0, 0};
  uint32_t shifts_3rd[4] = {8, 0, 0, 7};
  uint32_t shifts_4th[4] = {0, 0, 6, 0};
  uint32_t shifts_5th[4] = {0, 5, 0, 0};
  uint32_t shifts_6th[4] = {4, 0, 0, 3};
  uint32_t shifts_7th[4] = {0, 0, 2, 0};
  uint32_t shifts_8th[4] = {0, 1, 0, 11};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 21 | in[1] << 11) >> shifts_1st[1];
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = (in[1] >> 31 | in[2] << 1) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[2] >> 20 | in[3] << 12) >> shifts_2nd[0];
  ind[1] = in[3] >> shifts_2nd[1];
  ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[2];
  ind[3] = (in[4] >> 19 | in[5] << 13) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[5] >> shifts_3rd[0];
  ind[1] = (in[5] >> 29 | in[6] << 3) >> shifts_3rd[1];
  ind[2] = (in[6] >> 18 | in[7] << 14) >> shifts_3rd[2];
  ind[3] = in[7] >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[7] >> 28 | in[8] << 4) >> shifts_4th[0];
  ind[1] = (in[8] >> 17 | in[9] << 15) >> shifts_4th[1];
  ind[2] = in[9] >> shifts_4th[2];
  ind[3] = (in[9] >> 27 | in[10] << 5) >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[10] >> 16 | in[11] << 16) >> shifts_5th[0];
  ind[1] = in[11] >> shifts_5th[1];
  ind[2] = (in[11] >> 26 | in[12] << 6) >> shifts_5th[2];
  ind[3] = (in[12] >> 15 | in[13] << 17) >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[13] >> shifts_6th[0];
  ind[1] = (in[13] >> 25 | in[14] << 7) >> shifts_6th[1];
  ind[2] = (in[14] >> 14 | in[15] << 18) >> shifts_6th[2];
  ind[3] = in[15] >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[15] >> 24 | in[16] << 8) >> shifts_7th[0];
  ind[1] = (in[16] >> 13 | in[17] << 19) >> shifts_7th[1];
  ind[2] = in[17] >> shifts_7th[2];
  ind[3] = (in[17] >> 23 | in[18] << 9) >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[18] >> 12 | in[19] << 20) >> shifts_8th[0];
  ind[1] = in[19] >> shifts_8th[1];
  ind[2] = (in[19] >> 22 | in[20] << 10) >> shifts_8th[2];
  ind[3] = in[20] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 21;

  return in;
}

inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3fffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 0, 2};
  uint32_t shifts_2nd[4] = {0, 0, 4, 0};
  uint32_t shifts_3rd[4] = {0, 6, 0, 0};
  uint32_t shifts_4th[4] = {8, 0, 0, 10};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 22 | in[1] << 10) >> shifts_1st[1];
  ind[2] = (in[1] >> 12 | in[2] << 20) >> shifts_1st[2];
  ind[3] = in[2] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[2] >> 24 | in[3] << 8) >> shifts_2nd[0];
  ind[1] = (in[3] >> 14 | in[4] << 18) >> shifts_2nd[1];
  ind[2] = in[4] >> shifts_2nd[2];
  ind[3] = (in[4] >> 26 | in[5] << 6) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[5] >> 16 | in[6] << 16) >> shifts_3rd[0];
  ind[1] = in[6] >> shifts_3rd[1];
  ind[2] = (in[6] >> 28 | in[7] << 4) >> shifts_3rd[2];
  ind[3] = (in[7] >> 18 | in[8] << 14) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[8] >> shifts_4th[0];
  ind[1] = (in[8] >> 30 | in[9] << 2) >> shifts_4th[1];
  ind[2] = (in[9] >> 20 | in[10] << 12) >> shifts_4th[2];
  ind[3] = in[10] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[11] >> shifts_1st[0];
  ind[1] = (in[11] >> 22 | in[12] << 10) >> shifts_1st[1];
  ind[2] = (in[12] >> 12 | in[13] << 20) >> shifts_1st[2];
  ind[3] = in[13] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[13] >> 24 | in[14] << 8) >> shifts_2nd[0];
  ind[1] = (in[14] >> 14 | in[15] << 18) >> shifts_2nd[1];
  ind[2] = in[15] >> shifts_2nd[2];
  ind[3] = (in[15] >> 26 | in[16] << 6) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[16] >> 16 | in[17] << 16) >> shifts_3rd[0];
  ind[1] = in[17] >> shifts_3rd[1];
  ind[2] = (in[17] >> 28 | in[18] << 4) >> shifts_3rd[2];
  ind[3] = (in[18] >> 18 | in[19] << 14) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[19] >> shifts_4th[0];
  ind[1] = (in[19] >> 30 | in[20] << 2) >> shifts_4th[1];
  ind[2] = (in[20] >> 20 | in[21] << 12) >> shifts_4th[2];
  ind[3] = in[21] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 22;

  return in;
}

inline static const uint32_t* unpack23_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7fffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 0, 5};
  uint32_t shifts_2nd[4] = {0, 0, 0, 1};
  uint32_t shifts_3rd[4] = {0, 0, 6, 0};
  uint32_t shifts_4th[4] = {0, 0, 2, 0};
  uint32_t shifts_5th[4] = {0, 7, 0, 0};
  uint32_t shifts_6th[4] = {0, 3, 0, 0};
  uint32_t shifts_7th[4] = {8, 0, 0, 0};
  uint32_t shifts_8th[4] = {4, 0, 0, 9};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 23 | in[1] << 9) >> shifts_1st[1];
  ind[2] = (in[1] >> 14 | in[2] << 18) >> shifts_1st[2];
  ind[3] = in[2] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[2] >> 28 | in[3] << 4) >> shifts_2nd[0];
  ind[1] = (in[3] >> 19 | in[4] << 13) >> shifts_2nd[1];
  ind[2] = (in[4] >> 10 | in[5] << 22) >> shifts_2nd[2];
  ind[3] = in[5] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[5] >> 24 | in[6] << 8) >> shifts_3rd[0];
  ind[1] = (in[6] >> 15 | in[7] << 17) >> shifts_3rd[1];
  ind[2] = in[7] >> shifts_3rd[2];
  ind[3] = (in[7] >> 29 | in[8] << 3) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[8] >> 20 | in[9] << 12) >> shifts_4th[0];
  ind[1] = (in[9] >> 11 | in[10] << 21) >> shifts_4th[1];
  ind[2] = in[10] >> shifts_4th[2];
  ind[3] = (in[10] >> 25 | in[11] << 7) >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[11] >> 16 | in[12] << 16) >> shifts_5th[0];
  ind[1] = in[12] >> shifts_5th[1];
  ind[2] = (in[12] >> 30 | in[13] << 2) >> shifts_5th[2];
  ind[3] = (in[13] >> 21 | in[14] << 11) >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[14] >> 12 | in[15] << 20) >> shifts_6th[0];
  ind[1] = in[15] >> shifts_6th[1];
  ind[2] = (in[15] >> 26 | in[16] << 6) >> shifts_6th[2];
  ind[3] = (in[16] >> 17 | in[17] << 15) >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[17] >> shifts_7th[0];
  ind[1] = (in[17] >> 31 | in[18] << 1) >> shifts_7th[1];
  ind[2] = (in[18] >> 22 | in[19] << 10) >> shifts_7th[2];
  ind[3] = (in[19] >> 13 | in[20] << 19) >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[20] >> shifts_8th[0];
  ind[1] = (in[20] >> 27 | in[21] << 5) >> shifts_8th[1];
  ind[2] = (in[21] >> 18 | in[22] << 14) >> shifts_8th[2];
  ind[3] = in[22] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 23;

  return in;
}

inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xffffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 0, 8};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 24 | in[1] << 8) >> shifts_1st[1];
  ind[2] = (in[1] >> 16 | in[2] << 16) >> shifts_1st[2];
  ind[3] = in[2] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[3] >> shifts_1st[0];
  ind[1] = (in[3] >> 24 | in[4] << 8) >> shifts_1st[1];
  ind[2] = (in[4] >> 16 | in[5] << 16) >> shifts_1st[2];
  ind[3] = in[5] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[6] >> shifts_1st[0];
  ind[1] = (in[6] >> 24 | in[7] << 8) >> shifts_1st[1];
  ind[2] = (in[7] >> 16 | in[8] << 16) >> shifts_1st[2];
  ind[3] = in[8] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[9] >> shifts_1st[0];
  ind[1] = (in[9] >> 24 | in[10] << 8) >> shifts_1st[1];
  ind[2] = (in[10] >> 16 | in[11] << 16) >> shifts_1st[2];
  ind[3] = in[11] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[12] >> shifts_1st[0];
  ind[1] = (in[12] >> 24 | in[13] << 8) >> shifts_1st[1];
  ind[2] = (in[13] >> 16 | in[14] << 16) >> shifts_1st[2];
  ind[3] = in[14] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[15] >> shifts_1st[0];
  ind[1] = (in[15] >> 24 | in[16] << 8) >> shifts_1st[1];
  ind[2] = (in[16] >> 16 | in[17] << 16) >> shifts_1st[2];
  ind[3] = in[17] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[18] >> shifts_1st[0];
  ind[1] = (in[18] >> 24 | in[19] << 8) >> shifts_1st[1];
  ind[2] = (in[19] >> 16 | in[20] << 16) >> shifts_1st[2];
  ind[3] = in[20] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[21] >> shifts_1st[0];
  ind[1] = (in[21] >> 24 | in[22] << 8) >> shifts_1st[1];
  ind[2] = (in[22] >> 16 | in[23] << 16) >> shifts_1st[2];
  ind[3] = in[23] >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 24;

  return in;
}
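
// Note: 4 values x 24 bits = 96 bits = exactly 3 words, so unpack24_32_neon
// reuses a single shift table for all eight groups, advancing the word base
// by 3 per group.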

inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1ffffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 0, 0};
  uint32_t shifts_2nd[4] = {4, 0, 0, 0};
  uint32_t shifts_3rd[4] = {0, 1, 0, 0};
  uint32_t shifts_4th[4] = {0, 5, 0, 0};
  uint32_t shifts_5th[4] = {0, 0, 2, 0};
  uint32_t shifts_6th[4] = {0, 0, 6, 0};
  uint32_t shifts_7th[4] = {0, 0, 0, 3};
  uint32_t shifts_8th[4] = {0, 0, 0, 7};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 25 | in[1] << 7) >> shifts_1st[1];
  ind[2] = (in[1] >> 18 | in[2] << 14) >> shifts_1st[2];
  ind[3] = (in[2] >> 11 | in[3] << 21) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[3] >> shifts_2nd[0];
  ind[1] = (in[3] >> 29 | in[4] << 3) >> shifts_2nd[1];
  ind[2] = (in[4] >> 22 | in[5] << 10) >> shifts_2nd[2];
  ind[3] = (in[5] >> 15 | in[6] << 17) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[6] >> 8 | in[7] << 24) >> shifts_3rd[0];
  ind[1] = in[7] >> shifts_3rd[1];
  ind[2] = (in[7] >> 26 | in[8] << 6) >> shifts_3rd[2];
  ind[3] = (in[8] >> 19 | in[9] << 13) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[9] >> 12 | in[10] << 20) >> shifts_4th[0];
  ind[1] = in[10] >> shifts_4th[1];
  ind[2] = (in[10] >> 30 | in[11] << 2) >> shifts_4th[2];
  ind[3] = (in[11] >> 23 | in[12] << 9) >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[12] >> 16 | in[13] << 16) >> shifts_5th[0];
  ind[1] = (in[13] >> 9 | in[14] << 23) >> shifts_5th[1];
  ind[2] = in[14] >> shifts_5th[2];
  ind[3] = (in[14] >> 27 | in[15] << 5) >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[15] >> 20 | in[16] << 12) >> shifts_6th[0];
  ind[1] = (in[16] >> 13 | in[17] << 19) >> shifts_6th[1];
  ind[2] = in[17] >> shifts_6th[2];
  ind[3] = (in[17] >> 31 | in[18] << 1) >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[18] >> 24 | in[19] << 8) >> shifts_7th[0];
  ind[1] = (in[19] >> 17 | in[20] << 15) >> shifts_7th[1];
  ind[2] = (in[20] >> 10 | in[21] << 22) >> shifts_7th[2];
  ind[3] = in[21] >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[21] >> 28 | in[22] << 4) >> shifts_8th[0];
  ind[1] = (in[22] >> 21 | in[23] << 11) >> shifts_8th[1];
  ind[2] = (in[23] >> 14 | in[24] << 18) >> shifts_8th[2];
  ind[3] = in[24] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 25;

  return in;
}
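
// Index arithmetic behind the shift tables (a note, not part of the original
// source): output i of width w starts at absolute bit i * w, i.e. in word
// (i * w) / 32 at shift (i * w) % 32. For w = 25, output 9 starts at bit
// 225 = 7 * 32 + 1, which is why the 3rd group above reads in[7] >> 1
// (via shifts_3rd[1] == 1).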

inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3ffffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 0, 0};
  uint32_t shifts_2nd[4] = {0, 2, 0, 0};
  uint32_t shifts_3rd[4] = {0, 0, 4, 0};
  uint32_t shifts_4th[4] = {0, 0, 0, 6};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 26 | in[1] << 6) >> shifts_1st[1];
  ind[2] = (in[1] >> 20 | in[2] << 12) >> shifts_1st[2];
  ind[3] = (in[2] >> 14 | in[3] << 18) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[3] >> 8 | in[4] << 24) >> shifts_2nd[0];
  ind[1] = in[4] >> shifts_2nd[1];
  ind[2] = (in[4] >> 28 | in[5] << 4) >> shifts_2nd[2];
  ind[3] = (in[5] >> 22 | in[6] << 10) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[6] >> 16 | in[7] << 16) >> shifts_3rd[0];
  ind[1] = (in[7] >> 10 | in[8] << 22) >> shifts_3rd[1];
  ind[2] = in[8] >> shifts_3rd[2];
  ind[3] = (in[8] >> 30 | in[9] << 2) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[9] >> 24 | in[10] << 8) >> shifts_4th[0];
  ind[1] = (in[10] >> 18 | in[11] << 14) >> shifts_4th[1];
  ind[2] = (in[11] >> 12 | in[12] << 20) >> shifts_4th[2];
  ind[3] = in[12] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[13] >> shifts_1st[0];
  ind[1] = (in[13] >> 26 | in[14] << 6) >> shifts_1st[1];
  ind[2] = (in[14] >> 20 | in[15] << 12) >> shifts_1st[2];
  ind[3] = (in[15] >> 14 | in[16] << 18) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[16] >> 8 | in[17] << 24) >> shifts_2nd[0];
  ind[1] = in[17] >> shifts_2nd[1];
  ind[2] = (in[17] >> 28 | in[18] << 4) >> shifts_2nd[2];
  ind[3] = (in[18] >> 22 | in[19] << 10) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[19] >> 16 | in[20] << 16) >> shifts_3rd[0];
  ind[1] = (in[20] >> 10 | in[21] << 22) >> shifts_3rd[1];
  ind[2] = in[21] >> shifts_3rd[2];
  ind[3] = (in[21] >> 30 | in[22] << 2) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[22] >> 24 | in[23] << 8) >> shifts_4th[0];
  ind[1] = (in[23] >> 18 | in[24] << 14) >> shifts_4th[1];
  ind[2] = (in[24] >> 12 | in[25] << 20) >> shifts_4th[2];
  ind[3] = in[25] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 26;

  return in;
}

inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7ffffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 0, 0};
  uint32_t shifts_2nd[4] = {0, 0, 2, 0};
  uint32_t shifts_3rd[4] = {0, 0, 0, 0};
  uint32_t shifts_4th[4] = {4, 0, 0, 0};
  uint32_t shifts_5th[4] = {0, 0, 0, 1};
  uint32_t shifts_6th[4] = {0, 0, 0, 0};
  uint32_t shifts_7th[4] = {0, 3, 0, 0};
  uint32_t shifts_8th[4] = {0, 0, 0, 5};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 27 | in[1] << 5) >> shifts_1st[1];
  ind[2] = (in[1] >> 22 | in[2] << 10) >> shifts_1st[2];
  ind[3] = (in[2] >> 17 | in[3] << 15) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[3] >> 12 | in[4] << 20) >> shifts_2nd[0];
  ind[1] = (in[4] >> 7 | in[5] << 25) >> shifts_2nd[1];
  ind[2] = in[5] >> shifts_2nd[2];
  ind[3] = (in[5] >> 29 | in[6] << 3) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[6] >> 24 | in[7] << 8) >> shifts_3rd[0];
  ind[1] = (in[7] >> 19 | in[8] << 13) >> shifts_3rd[1];
  ind[2] = (in[8] >> 14 | in[9] << 18) >> shifts_3rd[2];
  ind[3] = (in[9] >> 9 | in[10] << 23) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[10] >> shifts_4th[0];
  ind[1] = (in[10] >> 31 | in[11] << 1) >> shifts_4th[1];
  ind[2] = (in[11] >> 26 | in[12] << 6) >> shifts_4th[2];
  ind[3] = (in[12] >> 21 | in[13] << 11) >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[13] >> 16 | in[14] << 16) >> shifts_5th[0];
  ind[1] = (in[14] >> 11 | in[15] << 21) >> shifts_5th[1];
  ind[2] = (in[15] >> 6 | in[16] << 26) >> shifts_5th[2];
  ind[3] = in[16] >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[16] >> 28 | in[17] << 4) >> shifts_6th[0];
  ind[1] = (in[17] >> 23 | in[18] << 9) >> shifts_6th[1];
  ind[2] = (in[18] >> 18 | in[19] << 14) >> shifts_6th[2];
  ind[3] = (in[19] >> 13 | in[20] << 19) >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[20] >> 8 | in[21] << 24) >> shifts_7th[0];
  ind[1] = in[21] >> shifts_7th[1];
  ind[2] = (in[21] >> 30 | in[22] << 2) >> shifts_7th[2];
  ind[3] = (in[22] >> 25 | in[23] << 7) >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[23] >> 20 | in[24] << 12) >> shifts_8th[0];
  ind[1] = (in[24] >> 15 | in[25] << 17) >> shifts_8th[1];
  ind[2] = (in[25] >> 10 | in[26] << 22) >> shifts_8th[2];
  ind[3] = in[26] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 27;

  return in;
}

inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xfffffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 0, 0};
  uint32_t shifts_2nd[4] = {0, 0, 0, 4};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 28 | in[1] << 4) >> shifts_1st[1];
  ind[2] = (in[1] >> 24 | in[2] << 8) >> shifts_1st[2];
  ind[3] = (in[2] >> 20 | in[3] << 12) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[3] >> 16 | in[4] << 16) >> shifts_2nd[0];
  ind[1] = (in[4] >> 12 | in[5] << 20) >> shifts_2nd[1];
  ind[2] = (in[5] >> 8 | in[6] << 24) >> shifts_2nd[2];
  ind[3] = in[6] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[7] >> shifts_1st[0];
  ind[1] = (in[7] >> 28 | in[8] << 4) >> shifts_1st[1];
  ind[2] = (in[8] >> 24 | in[9] << 8) >> shifts_1st[2];
  ind[3] = (in[9] >> 20 | in[10] << 12) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[10] >> 16 | in[11] << 16) >> shifts_2nd[0];
  ind[1] = (in[11] >> 12 | in[12] << 20) >> shifts_2nd[1];
  ind[2] = (in[12] >> 8 | in[13] << 24) >> shifts_2nd[2];
  ind[3] = in[13] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[14] >> shifts_1st[0];
  ind[1] = (in[14] >> 28 | in[15] << 4) >> shifts_1st[1];
  ind[2] = (in[15] >> 24 | in[16] << 8) >> shifts_1st[2];
  ind[3] = (in[16] >> 20 | in[17] << 12) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[17] >> 16 | in[18] << 16) >> shifts_2nd[0];
  ind[1] = (in[18] >> 12 | in[19] << 20) >> shifts_2nd[1];
  ind[2] = (in[19] >> 8 | in[20] << 24) >> shifts_2nd[2];
  ind[3] = in[20] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[21] >> shifts_1st[0];
  ind[1] = (in[21] >> 28 | in[22] << 4) >> shifts_1st[1];
  ind[2] = (in[22] >> 24 | in[23] << 8) >> shifts_1st[2];
  ind[3] = (in[23] >> 20 | in[24] << 12) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[24] >> 16 | in[25] << 16) >> shifts_2nd[0];
  ind[1] = (in[25] >> 12 | in[26] << 20) >> shifts_2nd[1];
  ind[2] = (in[26] >> 8 | in[27] << 24) >> shifts_2nd[2];
  ind[3] = in[27] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 28;

  return in;
}
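
// Note: 8 values x 28 bits = 224 bits = exactly 7 words, so
// unpack28_32_neon alternates two shift tables and advances the word base
// by 7 for every 8 outputs.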

inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1fffffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 0, 0};
  uint32_t shifts_2nd[4] = {0, 0, 0, 0};
  uint32_t shifts_3rd[4] = {0, 0, 2, 0};
  uint32_t shifts_4th[4] = {0, 0, 0, 0};
  uint32_t shifts_5th[4] = {0, 0, 0, 0};
  uint32_t shifts_6th[4] = {0, 1, 0, 0};
  uint32_t shifts_7th[4] = {0, 0, 0, 0};
  uint32_t shifts_8th[4] = {0, 0, 0, 3};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 29 | in[1] << 3) >> shifts_1st[1];
  ind[2] = (in[1] >> 26 | in[2] << 6) >> shifts_1st[2];
  ind[3] = (in[2] >> 23 | in[3] << 9) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[3] >> 20 | in[4] << 12) >> shifts_2nd[0];
  ind[1] = (in[4] >> 17 | in[5] << 15) >> shifts_2nd[1];
  ind[2] = (in[5] >> 14 | in[6] << 18) >> shifts_2nd[2];
  ind[3] = (in[6] >> 11 | in[7] << 21) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[7] >> 8 | in[8] << 24) >> shifts_3rd[0];
  ind[1] = (in[8] >> 5 | in[9] << 27) >> shifts_3rd[1];
  ind[2] = in[9] >> shifts_3rd[2];
  ind[3] = (in[9] >> 31 | in[10] << 1) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[10] >> 28 | in[11] << 4) >> shifts_4th[0];
  ind[1] = (in[11] >> 25 | in[12] << 7) >> shifts_4th[1];
  ind[2] = (in[12] >> 22 | in[13] << 10) >> shifts_4th[2];
  ind[3] = (in[13] >> 19 | in[14] << 13) >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[14] >> 16 | in[15] << 16) >> shifts_5th[0];
  ind[1] = (in[15] >> 13 | in[16] << 19) >> shifts_5th[1];
  ind[2] = (in[16] >> 10 | in[17] << 22) >> shifts_5th[2];
  ind[3] = (in[17] >> 7 | in[18] << 25) >> shifts_5th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[18] >> 4 | in[19] << 28) >> shifts_6th[0];
  ind[1] = in[19] >> shifts_6th[1];
  ind[2] = (in[19] >> 30 | in[20] << 2) >> shifts_6th[2];
  ind[3] = (in[20] >> 27 | in[21] << 5) >> shifts_6th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[21] >> 24 | in[22] << 8) >> shifts_7th[0];
  ind[1] = (in[22] >> 21 | in[23] << 11) >> shifts_7th[1];
  ind[2] = (in[23] >> 18 | in[24] << 14) >> shifts_7th[2];
  ind[3] = (in[24] >> 15 | in[25] << 17) >> shifts_7th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[25] >> 12 | in[26] << 20) >> shifts_8th[0];
  ind[1] = (in[26] >> 9 | in[27] << 23) >> shifts_8th[1];
  ind[2] = (in[27] >> 6 | in[28] << 26) >> shifts_8th[2];
  ind[3] = in[28] >> shifts_8th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 29;

  return in;
}

inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3fffffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 0, 0};
  uint32_t shifts_2nd[4] = {0, 0, 0, 0};
  uint32_t shifts_3rd[4] = {0, 0, 0, 0};
  uint32_t shifts_4th[4] = {0, 0, 0, 2};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[1];
  ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_1st[2];
  ind[3] = (in[2] >> 26 | in[3] << 6) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[3] >> 24 | in[4] << 8) >> shifts_2nd[0];
  ind[1] = (in[4] >> 22 | in[5] << 10) >> shifts_2nd[1];
  ind[2] = (in[5] >> 20 | in[6] << 12) >> shifts_2nd[2];
  ind[3] = (in[6] >> 18 | in[7] << 14) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[7] >> 16 | in[8] << 16) >> shifts_3rd[0];
  ind[1] = (in[8] >> 14 | in[9] << 18) >> shifts_3rd[1];
  ind[2] = (in[9] >> 12 | in[10] << 20) >> shifts_3rd[2];
  ind[3] = (in[10] >> 10 | in[11] << 22) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[11] >> 8 | in[12] << 24) >> shifts_4th[0];
  ind[1] = (in[12] >> 6 | in[13] << 26) >> shifts_4th[1];
  ind[2] = (in[13] >> 4 | in[14] << 28) >> shifts_4th[2];
  ind[3] = in[14] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[15] >> shifts_1st[0];
  ind[1] = (in[15] >> 30 | in[16] << 2) >> shifts_1st[1];
  ind[2] = (in[16] >> 28 | in[17] << 4) >> shifts_1st[2];
  ind[3] = (in[17] >> 26 | in[18] << 6) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[18] >> 24 | in[19] << 8) >> shifts_2nd[0];
  ind[1] = (in[19] >> 22 | in[20] << 10) >> shifts_2nd[1];
  ind[2] = (in[20] >> 20 | in[21] << 12) >> shifts_2nd[2];
  ind[3] = (in[21] >> 18 | in[22] << 14) >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[22] >> 16 | in[23] << 16) >> shifts_3rd[0];
  ind[1] = (in[23] >> 14 | in[24] << 18) >> shifts_3rd[1];
  ind[2] = (in[24] >> 12 | in[25] << 20) >> shifts_3rd[2];
  ind[3] = (in[25] >> 10 | in[26] << 22) >> shifts_3rd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[26] >> 8 | in[27] << 24) >> shifts_4th[0];
  ind[1] = (in[27] >> 6 | in[28] << 26) >> shifts_4th[1];
  ind[2] = (in[28] >> 4 | in[29] << 28) >> shifts_4th[2];
  ind[3] = in[29] >> shifts_4th[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 30;

  return in;
}

inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7fffffff;
  uint32_t ind[4];
  uint32_t shifts_1st[4] = {0, 0, 0, 0};
  uint32_t shifts_2nd[4] = {0, 0, 0, 1};
  uint32x4_t reg_shft, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 31 | in[1] << 1) >> shifts_1st[1];
  ind[2] = (in[1] >> 30 | in[2] << 2) >> shifts_1st[2];
  ind[3] = (in[2] >> 29 | in[3] << 3) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[3] >> 28 | in[4] << 4) >> shifts_1st[0];
  ind[1] = (in[4] >> 27 | in[5] << 5) >> shifts_1st[1];
  ind[2] = (in[5] >> 26 | in[6] << 6) >> shifts_1st[2];
  ind[3] = (in[6] >> 25 | in[7] << 7) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[7] >> 24 | in[8] << 8) >> shifts_1st[0];
  ind[1] = (in[8] >> 23 | in[9] << 9) >> shifts_1st[1];
  ind[2] = (in[9] >> 22 | in[10] << 10) >> shifts_1st[2];
  ind[3] = (in[10] >> 21 | in[11] << 11) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[11] >> 20 | in[12] << 12) >> shifts_1st[0];
  ind[1] = (in[12] >> 19 | in[13] << 13) >> shifts_1st[1];
  ind[2] = (in[13] >> 18 | in[14] << 14) >> shifts_1st[2];
  ind[3] = (in[14] >> 17 | in[15] << 15) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[15] >> 16 | in[16] << 16) >> shifts_1st[0];
  ind[1] = (in[16] >> 15 | in[17] << 17) >> shifts_1st[1];
  ind[2] = (in[17] >> 14 | in[18] << 18) >> shifts_1st[2];
  ind[3] = (in[18] >> 13 | in[19] << 19) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[19] >> 12 | in[20] << 20) >> shifts_1st[0];
  ind[1] = (in[20] >> 11 | in[21] << 21) >> shifts_1st[1];
  ind[2] = (in[21] >> 10 | in[22] << 22) >> shifts_1st[2];
  ind[3] = (in[22] >> 9 | in[23] << 23) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[23] >> 8 | in[24] << 24) >> shifts_1st[0];
  ind[1] = (in[24] >> 7 | in[25] << 25) >> shifts_1st[1];
  ind[2] = (in[25] >> 6 | in[26] << 26) >> shifts_1st[2];
  ind[3] = (in[26] >> 5 | in[27] << 27) >> shifts_1st[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[27] >> 4 | in[28] << 28) >> shifts_2nd[0];
  ind[1] = (in[28] >> 3 | in[29] << 29) >> shifts_2nd[1];
  ind[2] = (in[29] >> 2 | in[30] << 30) >> shifts_2nd[2];
  ind[3] = in[30] >> shifts_2nd[3];
  reg_shft = vld1q_u32(ind);
  results = vandq_u32(reg_shft, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 31;

  return in;
}

inline const uint32_t* unpack32_32_neon(const uint32_t* in, uint32_t* out) {
  memcpy(out, in, 32 * sizeof(*out));
  in += 32;
  out += 32;

  return in;
}
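
// unpack32_neon below dispatches on bit width: it unpacks batch_size values
// (rounded down to a multiple of 32) of num_bits bits each from in to out,
// returning the number of values actually unpacked. Each 32-value iteration
// consumes num_bits input words. A num_bits outside 0..32 matches no case,
// so out is left untouched while the rounded batch_size is still returned.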

int unpack32_neon(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
  batch_size = batch_size / 32 * 32;
  int num_loops = batch_size / 32;

  switch (num_bits) {
    case 0:
      for (int i = 0; i < num_loops; ++i) in = unpack0_32_neon(in, out + i * 32);
      break;
    case 1:
      for (int i = 0; i < num_loops; ++i) in = unpack1_32_neon(in, out + i * 32);
      break;
    case 2:
      for (int i = 0; i < num_loops; ++i) in = unpack2_32_neon(in, out + i * 32);
      break;
    case 3:
      for (int i = 0; i < num_loops; ++i) in = unpack3_32_neon(in, out + i * 32);
      break;
    case 4:
      for (int i = 0; i < num_loops; ++i) in = unpack4_32_neon(in, out + i * 32);
      break;
    case 5:
      for (int i = 0; i < num_loops; ++i) in = unpack5_32_neon(in, out + i * 32);
      break;
    case 6:
      for (int i = 0; i < num_loops; ++i) in = unpack6_32_neon(in, out + i * 32);
      break;
    case 7:
      for (int i = 0; i < num_loops; ++i) in = unpack7_32_neon(in, out + i * 32);
      break;
    case 8:
      for (int i = 0; i < num_loops; ++i) in = unpack8_32_neon(in, out + i * 32);
      break;
    case 9:
      for (int i = 0; i < num_loops; ++i) in = unpack9_32_neon(in, out + i * 32);
      break;
    case 10:
      for (int i = 0; i < num_loops; ++i) in = unpack10_32_neon(in, out + i * 32);
      break;
    case 11:
      for (int i = 0; i < num_loops; ++i) in = unpack11_32_neon(in, out + i * 32);
      break;
    case 12:
      for (int i = 0; i < num_loops; ++i) in = unpack12_32_neon(in, out + i * 32);
      break;
    case 13:
      for (int i = 0; i < num_loops; ++i) in = unpack13_32_neon(in, out + i * 32);
      break;
    case 14:
      for (int i = 0; i < num_loops; ++i) in = unpack14_32_neon(in, out + i * 32);
      break;
    case 15:
      for (int i = 0; i < num_loops; ++i) in = unpack15_32_neon(in, out + i * 32);
      break;
    case 16:
      for (int i = 0; i < num_loops; ++i) in = unpack16_32_neon(in, out + i * 32);
      break;
    case 17:
      for (int i = 0; i < num_loops; ++i) in = unpack17_32_neon(in, out + i * 32);
      break;
    case 18:
      for (int i = 0; i < num_loops; ++i) in = unpack18_32_neon(in, out + i * 32);
      break;
    case 19:
      for (int i = 0; i < num_loops; ++i) in = unpack19_32_neon(in, out + i * 32);
      break;
    case 20:
      for (int i = 0; i < num_loops; ++i) in = unpack20_32_neon(in, out + i * 32);
      break;
    case 21:
      for (int i = 0; i < num_loops; ++i) in = unpack21_32_neon(in, out + i * 32);
      break;
    case 22:
      for (int i = 0; i < num_loops; ++i) in = unpack22_32_neon(in, out + i * 32);
      break;
    case 23:
      for (int i = 0; i < num_loops; ++i) in = unpack23_32_neon(in, out + i * 32);
      break;
    case 24:
      for (int i = 0; i < num_loops; ++i) in = unpack24_32_neon(in, out + i * 32);
      break;
    case 25:
      for (int i = 0; i < num_loops; ++i) in = unpack25_32_neon(in, out + i * 32);
      break;
    case 26:
      for (int i = 0; i < num_loops; ++i) in = unpack26_32_neon(in, out + i * 32);
      break;
    case 27:
      for (int i = 0; i < num_loops; ++i) in = unpack27_32_neon(in, out + i * 32);
      break;
    case 28:
      for (int i = 0; i < num_loops; ++i) in = unpack28_32_neon(in, out + i * 32);
      break;
    case 29:
      for (int i = 0; i < num_loops; ++i) in = unpack29_32_neon(in, out + i * 32);
      break;
    case 30:
      for (int i = 0; i < num_loops; ++i) in = unpack30_32_neon(in, out + i * 32);
      break;
    case 31:
      for (int i = 0; i < num_loops; ++i) in = unpack31_32_neon(in, out + i * 32);
      break;
    case 32:
      for (int i = 0; i < num_loops; ++i) in = unpack32_32_neon(in, out + i * 32);
      break;
  }

  return batch_size;
}
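
/*
 * Usage sketch (illustrative only; guarded with #if 0 so it is not compiled
 * into the library). It packs 32 five-bit values in the little-endian bit
 * layout these kernels expect, then round-trips them through unpack32_neon.
 */
#if 0
#include <assert.h>
#include <stdio.h>

int main(void) {
  uint32_t packed[5] = {0};  /* 32 values x 5 bits = 160 bits = 5 words */
  uint32_t expect[32], got[32];
  for (int i = 0; i < 32; ++i) {
    expect[i] = (uint32_t)i & 0x1f;                /* any 5-bit payload */
    int bit = i * 5;                               /* absolute bit offset */
    packed[bit / 32] |= expect[i] << (bit % 32);   /* low part */
    if (bit % 32 + 5 > 32)                         /* straddles a word? */
      packed[bit / 32 + 1] |= expect[i] >> (32 - bit % 32);
  }
  int n = unpack32_neon(packed, got, 32, 5);
  assert(n == 32);
  for (int i = 0; i < 32; ++i) assert(got[i] == expect[i]);
  printf("unpacked %d values\n", n);
  return 0;
}
#endif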