github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/utils/_lib/bit_packing_avx2.s (about) 1 .text 2 .intel_syntax noprefix 3 .file "bit_packing_avx2.c" 4 .section .rodata.cst8,"aM",@progbits,8 5 .p2align 3 # -- Begin function unpack32_avx2 6 .LCPI0_0: 7 .quad 9223372034707292159 # 0x7fffffff7fffffff 8 .LCPI0_8: 9 .quad 4611686015206162431 # 0x3fffffff3fffffff 10 .LCPI0_12: 11 .quad 2305843005455597567 # 0x1fffffff1fffffff 12 .LCPI0_23: 13 .quad 1152921500580315135 # 0xfffffff0fffffff 14 .LCPI0_25: 15 .quad 576460748142673919 # 0x7ffffff07ffffff 16 .LCPI0_34: 17 .quad 288230371923853311 # 0x3ffffff03ffffff 18 .LCPI0_35: 19 .quad 42949672976 # 0xa00000010 20 .LCPI0_36: 21 .quad 94489280528 # 0x1600000010 22 .LCPI0_38: 23 .quad 144115183814443007 # 0x1ffffff01ffffff 24 .LCPI0_49: 25 .quad 36028792732385279 # 0x7fffff007fffff 26 .LCPI0_56: 27 .quad 18014394218708991 # 0x3fffff003fffff 28 .LCPI0_59: 29 .quad 9007194961870847 # 0x1fffff001fffff 30 .LCPI0_66: 31 .quad 4503595333451775 # 0xfffff000fffff 32 .LCPI0_68: 33 .quad 2251795519242239 # 0x7ffff0007ffff 34 .LCPI0_73: 35 .quad 1125895612137471 # 0x3ffff0003ffff 36 .LCPI0_76: 37 .quad 562945658585087 # 0x1ffff0001ffff 38 .LCPI0_80: 39 .quad 68719476736 # 0x1000000000 40 .LCPI0_82: 41 .quad 140733193420799 # 0x7fff00007fff 42 .LCPI0_87: 43 .quad 70364449226751 # 0x3fff00003fff 44 .LCPI0_90: 45 .quad 35180077129727 # 0x1fff00001fff 46 .LCPI0_95: 47 .quad 17587891081215 # 0xfff00000fff 48 .LCPI0_97: 49 .quad 8791798056959 # 0x7ff000007ff 50 .LCPI0_102: 51 .quad 4393751544831 # 0x3ff000003ff 52 .LCPI0_105: 53 .quad 2194728288767 # 0x1ff000001ff 54 .LCPI0_112: 55 .quad 545460846719 # 0x7f0000007f 56 .LCPI0_117: 57 .quad 270582939711 # 0x3f0000003f 58 .LCPI0_120: 59 .quad 133143986207 # 0x1f0000001f 60 .LCPI0_125: 61 .quad 64424509455 # 0xf0000000f 62 .LCPI0_127: 63 .quad 30064771079 # 0x700000007 64 .LCPI0_132: 65 .quad 12884901891 # 0x300000003 66 .LCPI0_135: 67 .quad 4294967297 # 0x100000001 68 .section .rodata.cst32,"aM",@progbits,32 69 .p2align 5 70 .LCPI0_1: 71 .long 24 # 0x18 72 .long 23 # 0x17 73 .long 22 # 0x16 74 .long 21 # 0x15 75 .long 20 # 0x14 76 .long 19 # 0x13 77 .long 18 # 0x12 78 .long 17 # 0x11 79 .LCPI0_2: 80 .long 8 # 0x8 81 .long 9 # 0x9 82 .long 10 # 0xa 83 .long 11 # 0xb 84 .long 12 # 0xc 85 .long 13 # 0xd 86 .long 14 # 0xe 87 .long 15 # 0xf 88 .LCPI0_3: 89 .long 16 # 0x10 90 .long 15 # 0xf 91 .long 14 # 0xe 92 .long 13 # 0xd 93 .long 12 # 0xc 94 .long 11 # 0xb 95 .long 10 # 0xa 96 .long 9 # 0x9 97 .LCPI0_4: 98 .long 16 # 0x10 99 .long 17 # 0x11 100 .long 18 # 0x12 101 .long 19 # 0x13 102 .long 20 # 0x14 103 .long 21 # 0x15 104 .long 22 # 0x16 105 .long 23 # 0x17 106 .LCPI0_7: 107 .long 0 # 0x0 108 .long 0 # 0x0 109 .long 0 # 0x0 110 .long 0 # 0x0 111 .long 0 # 0x0 112 .long 0 # 0x0 113 .long 0 # 0x0 114 .long 1 # 0x1 115 .LCPI0_11: 116 .long 0 # 0x0 117 .long 0 # 0x0 118 .long 0 # 0x0 119 .long 0 # 0x0 120 .long 0 # 0x0 121 .long 0 # 0x0 122 .long 0 # 0x0 123 .long 2 # 0x2 124 .LCPI0_15: 125 .long 0 # 0x0 126 .long 0 # 0x0 127 .long 2 # 0x2 128 .long 0 # 0x0 129 .long 0 # 0x0 130 .long 0 # 0x0 131 .long 0 # 0x0 132 .long 0 # 0x0 133 .LCPI0_18: 134 .long 0 # 0x0 135 .long 0 # 0x0 136 .long 0 # 0x0 137 .long 0 # 0x0 138 .long 0 # 0x0 139 .long 1 # 0x1 140 .long 0 # 0x0 141 .long 0 # 0x0 142 .LCPI0_21: 143 .long 0 # 0x0 144 .long 0 # 0x0 145 .long 0 # 0x0 146 .long 0 # 0x0 147 .long 0 # 0x0 148 .long 0 # 0x0 149 .long 0 # 0x0 150 .long 3 # 0x3 151 .LCPI0_22: 152 .long 0 # 0x0 153 .long 0 # 0x0 154 .long 0 # 0x0 155 .long 0 # 0x0 156 .long 0 # 0x0 157 .long 0 # 0x0 158 .long 0 # 0x0 159 .long 4 # 0x4 160 .LCPI0_24: 161 .long 0 # 0x0 162 .long 0 # 0x0 163 .long 0 # 0x0 164 .long 0 # 0x0 165 .long 0 # 0x0 166 .long 0 # 0x0 167 .long 2 # 0x2 168 .long 0 # 0x0 169 .LCPI0_28: 170 .long 0 # 0x0 171 .long 0 # 0x0 172 .long 0 # 0x0 173 .long 0 # 0x0 174 .long 4 # 0x4 175 .long 0 # 0x0 176 .long 0 # 0x0 177 .long 0 # 0x0 178 .LCPI0_31: 179 .long 0 # 0x0 180 .long 0 # 0x0 181 .long 0 # 0x0 182 .long 1 # 0x1 183 .long 0 # 0x0 184 .long 0 # 0x0 185 .long 0 # 0x0 186 .long 0 # 0x0 187 .LCPI0_32: 188 .long 0 # 0x0 189 .long 3 # 0x3 190 .long 0 # 0x0 191 .long 0 # 0x0 192 .long 0 # 0x0 193 .long 0 # 0x0 194 .long 0 # 0x0 195 .long 5 # 0x5 196 .LCPI0_33: 197 .long 0 # 0x0 198 .long 0 # 0x0 199 .long 0 # 0x0 200 .long 0 # 0x0 201 .long 0 # 0x0 202 .long 2 # 0x2 203 .long 0 # 0x0 204 .long 0 # 0x0 205 .LCPI0_37: 206 .long 0 # 0x0 207 .long 0 # 0x0 208 .long 4 # 0x4 209 .long 0 # 0x0 210 .long 0 # 0x0 211 .long 0 # 0x0 212 .long 0 # 0x0 213 .long 6 # 0x6 214 .LCPI0_39: 215 .long 0 # 0x0 216 .long 1 # 0x1 217 .long 0 # 0x0 218 .long 0 # 0x0 219 .long 0 # 0x0 220 .long 5 # 0x5 221 .long 0 # 0x0 222 .long 0 # 0x0 223 .LCPI0_42: 224 .long 0 # 0x0 225 .long 0 # 0x0 226 .long 2 # 0x2 227 .long 0 # 0x0 228 .long 0 # 0x0 229 .long 0 # 0x0 230 .long 6 # 0x6 231 .long 0 # 0x0 232 .LCPI0_45: 233 .long 0 # 0x0 234 .long 0 # 0x0 235 .long 0 # 0x0 236 .long 3 # 0x3 237 .long 0 # 0x0 238 .long 0 # 0x0 239 .long 0 # 0x0 240 .long 7 # 0x7 241 .LCPI0_48: 242 .long 0 # 0x0 243 .long 0 # 0x0 244 .long 0 # 0x0 245 .long 5 # 0x5 246 .long 0 # 0x0 247 .long 0 # 0x0 248 .long 0 # 0x0 249 .long 1 # 0x1 250 .LCPI0_52: 251 .long 0 # 0x0 252 .long 0 # 0x0 253 .long 6 # 0x6 254 .long 0 # 0x0 255 .long 0 # 0x0 256 .long 0 # 0x0 257 .long 2 # 0x2 258 .long 0 # 0x0 259 .LCPI0_53: 260 .long 0 # 0x0 261 .long 7 # 0x7 262 .long 0 # 0x0 263 .long 0 # 0x0 264 .long 0 # 0x0 265 .long 3 # 0x3 266 .long 0 # 0x0 267 .long 0 # 0x0 268 .LCPI0_54: 269 .long 8 # 0x8 270 .long 0 # 0x0 271 .long 0 # 0x0 272 .long 0 # 0x0 273 .long 4 # 0x4 274 .long 0 # 0x0 275 .long 0 # 0x0 276 .long 9 # 0x9 277 .LCPI0_55: 278 .long 0 # 0x0 279 .long 0 # 0x0 280 .long 0 # 0x0 281 .long 2 # 0x2 282 .long 0 # 0x0 283 .long 0 # 0x0 284 .long 4 # 0x4 285 .long 0 # 0x0 286 .LCPI0_57: 287 .long 0 # 0x0 288 .long 6 # 0x6 289 .long 0 # 0x0 290 .long 0 # 0x0 291 .long 8 # 0x8 292 .long 0 # 0x0 293 .long 0 # 0x0 294 .long 10 # 0xa 295 .LCPI0_58: 296 .long 0 # 0x0 297 .long 0 # 0x0 298 .long 10 # 0xa 299 .long 0 # 0x0 300 .long 0 # 0x0 301 .long 9 # 0x9 302 .long 0 # 0x0 303 .long 0 # 0x0 304 .LCPI0_60: 305 .long 8 # 0x8 306 .long 0 # 0x0 307 .long 0 # 0x0 308 .long 7 # 0x7 309 .long 0 # 0x0 310 .long 0 # 0x0 311 .long 6 # 0x6 312 .long 0 # 0x0 313 .LCPI0_61: 314 .long 0 # 0x0 315 .long 5 # 0x5 316 .long 0 # 0x0 317 .long 0 # 0x0 318 .long 4 # 0x4 319 .long 0 # 0x0 320 .long 0 # 0x0 321 .long 3 # 0x3 322 .LCPI0_64: 323 .long 0 # 0x0 324 .long 0 # 0x0 325 .long 2 # 0x2 326 .long 0 # 0x0 327 .long 0 # 0x0 328 .long 1 # 0x1 329 .long 0 # 0x0 330 .long 11 # 0xb 331 .LCPI0_65: 332 .long 0 # 0x0 333 .long 0 # 0x0 334 .long 8 # 0x8 335 .long 0 # 0x0 336 .long 0 # 0x0 337 .long 4 # 0x4 338 .long 0 # 0x0 339 .long 12 # 0xc 340 .LCPI0_67: 341 .long 0 # 0x0 342 .long 0 # 0x0 343 .long 6 # 0x6 344 .long 0 # 0x0 345 .long 12 # 0xc 346 .long 0 # 0x0 347 .long 0 # 0x0 348 .long 5 # 0x5 349 .LCPI0_69: 350 .long 0 # 0x0 351 .long 11 # 0xb 352 .long 0 # 0x0 353 .long 0 # 0x0 354 .long 4 # 0x4 355 .long 0 # 0x0 356 .long 10 # 0xa 357 .long 0 # 0x0 358 .LCPI0_70: 359 .long 0 # 0x0 360 .long 3 # 0x3 361 .long 0 # 0x0 362 .long 9 # 0x9 363 .long 0 # 0x0 364 .long 0 # 0x0 365 .long 2 # 0x2 366 .long 0 # 0x0 367 .LCPI0_71: 368 .long 8 # 0x8 369 .long 0 # 0x0 370 .long 0 # 0x0 371 .long 1 # 0x1 372 .long 0 # 0x0 373 .long 7 # 0x7 374 .long 0 # 0x0 375 .long 13 # 0xd 376 .LCPI0_72: 377 .long 0 # 0x0 378 .long 0 # 0x0 379 .long 4 # 0x4 380 .long 0 # 0x0 381 .long 8 # 0x8 382 .long 0 # 0x0 383 .long 12 # 0xc 384 .long 0 # 0x0 385 .LCPI0_74: 386 .long 0 # 0x0 387 .long 2 # 0x2 388 .long 0 # 0x0 389 .long 6 # 0x6 390 .long 0 # 0x0 391 .long 10 # 0xa 392 .long 0 # 0x0 393 .long 14 # 0xe 394 .LCPI0_75: 395 .long 0 # 0x0 396 .long 0 # 0x0 397 .long 2 # 0x2 398 .long 0 # 0x0 399 .long 4 # 0x4 400 .long 0 # 0x0 401 .long 6 # 0x6 402 .long 0 # 0x0 403 .LCPI0_77: 404 .long 8 # 0x8 405 .long 0 # 0x0 406 .long 10 # 0xa 407 .long 0 # 0x0 408 .long 12 # 0xc 409 .long 0 # 0x0 410 .long 14 # 0xe 411 .long 0 # 0x0 412 .LCPI0_78: 413 .long 0 # 0x0 414 .long 1 # 0x1 415 .long 0 # 0x0 416 .long 3 # 0x3 417 .long 0 # 0x0 418 .long 5 # 0x5 419 .long 0 # 0x0 420 .long 7 # 0x7 421 .LCPI0_79: 422 .long 0 # 0x0 423 .long 9 # 0x9 424 .long 0 # 0x0 425 .long 11 # 0xb 426 .long 0 # 0x0 427 .long 13 # 0xd 428 .long 0 # 0x0 429 .long 15 # 0xf 430 .LCPI0_81: 431 .long 0 # 0x0 432 .long 15 # 0xf 433 .long 0 # 0x0 434 .long 13 # 0xd 435 .long 0 # 0x0 436 .long 11 # 0xb 437 .long 0 # 0x0 438 .long 9 # 0x9 439 .LCPI0_83: 440 .long 0 # 0x0 441 .long 7 # 0x7 442 .long 0 # 0x0 443 .long 5 # 0x5 444 .long 0 # 0x0 445 .long 3 # 0x3 446 .long 0 # 0x0 447 .long 1 # 0x1 448 .LCPI0_84: 449 .long 16 # 0x10 450 .long 0 # 0x0 451 .long 14 # 0xe 452 .long 0 # 0x0 453 .long 12 # 0xc 454 .long 0 # 0x0 455 .long 10 # 0xa 456 .long 0 # 0x0 457 .LCPI0_85: 458 .long 8 # 0x8 459 .long 0 # 0x0 460 .long 6 # 0x6 461 .long 0 # 0x0 462 .long 4 # 0x4 463 .long 0 # 0x0 464 .long 2 # 0x2 465 .long 17 # 0x11 466 .LCPI0_86: 467 .long 0 # 0x0 468 .long 14 # 0xe 469 .long 0 # 0x0 470 .long 10 # 0xa 471 .long 0 # 0x0 472 .long 6 # 0x6 473 .long 0 # 0x0 474 .long 2 # 0x2 475 .LCPI0_88: 476 .long 16 # 0x10 477 .long 0 # 0x0 478 .long 12 # 0xc 479 .long 0 # 0x0 480 .long 8 # 0x8 481 .long 0 # 0x0 482 .long 4 # 0x4 483 .long 18 # 0x12 484 .LCPI0_89: 485 .long 0 # 0x0 486 .long 13 # 0xd 487 .long 0 # 0x0 488 .long 7 # 0x7 489 .long 0 # 0x0 490 .long 1 # 0x1 491 .long 14 # 0xe 492 .long 0 # 0x0 493 .LCPI0_91: 494 .long 8 # 0x8 495 .long 0 # 0x0 496 .long 2 # 0x2 497 .long 15 # 0xf 498 .long 0 # 0x0 499 .long 9 # 0x9 500 .long 0 # 0x0 501 .long 3 # 0x3 502 .LCPI0_92: 503 .long 16 # 0x10 504 .long 0 # 0x0 505 .long 10 # 0xa 506 .long 0 # 0x0 507 .long 4 # 0x4 508 .long 17 # 0x11 509 .long 0 # 0x0 510 .long 11 # 0xb 511 .LCPI0_93: 512 .long 0 # 0x0 513 .long 5 # 0x5 514 .long 18 # 0x12 515 .long 0 # 0x0 516 .long 12 # 0xc 517 .long 0 # 0x0 518 .long 6 # 0x6 519 .long 19 # 0x13 520 .LCPI0_94: 521 .long 0 # 0x0 522 .long 12 # 0xc 523 .long 0 # 0x0 524 .long 4 # 0x4 525 .long 16 # 0x10 526 .long 0 # 0x0 527 .long 8 # 0x8 528 .long 20 # 0x14 529 .LCPI0_96: 530 .long 0 # 0x0 531 .long 11 # 0xb 532 .long 0 # 0x0 533 .long 1 # 0x1 534 .long 12 # 0xc 535 .long 0 # 0x0 536 .long 2 # 0x2 537 .long 13 # 0xd 538 .LCPI0_98: 539 .long 0 # 0x0 540 .long 3 # 0x3 541 .long 14 # 0xe 542 .long 0 # 0x0 543 .long 4 # 0x4 544 .long 15 # 0xf 545 .long 0 # 0x0 546 .long 5 # 0x5 547 .LCPI0_99: 548 .long 16 # 0x10 549 .long 0 # 0x0 550 .long 6 # 0x6 551 .long 17 # 0x11 552 .long 0 # 0x0 553 .long 7 # 0x7 554 .long 18 # 0x12 555 .long 0 # 0x0 556 .LCPI0_100: 557 .long 8 # 0x8 558 .long 19 # 0x13 559 .long 0 # 0x0 560 .long 9 # 0x9 561 .long 20 # 0x14 562 .long 0 # 0x0 563 .long 10 # 0xa 564 .long 21 # 0x15 565 .LCPI0_101: 566 .long 0 # 0x0 567 .long 10 # 0xa 568 .long 20 # 0x14 569 .long 0 # 0x0 570 .long 8 # 0x8 571 .long 18 # 0x12 572 .long 0 # 0x0 573 .long 6 # 0x6 574 .LCPI0_103: 575 .long 16 # 0x10 576 .long 0 # 0x0 577 .long 4 # 0x4 578 .long 14 # 0xe 579 .long 0 # 0x0 580 .long 2 # 0x2 581 .long 12 # 0xc 582 .long 22 # 0x16 583 .LCPI0_104: 584 .long 0 # 0x0 585 .long 9 # 0x9 586 .long 18 # 0x12 587 .long 0 # 0x0 588 .long 4 # 0x4 589 .long 13 # 0xd 590 .long 22 # 0x16 591 .long 0 # 0x0 592 .LCPI0_106: 593 .long 8 # 0x8 594 .long 17 # 0x11 595 .long 0 # 0x0 596 .long 3 # 0x3 597 .long 12 # 0xc 598 .long 21 # 0x15 599 .long 0 # 0x0 600 .long 7 # 0x7 601 .LCPI0_107: 602 .long 16 # 0x10 603 .long 0 # 0x0 604 .long 2 # 0x2 605 .long 11 # 0xb 606 .long 20 # 0x14 607 .long 0 # 0x0 608 .long 6 # 0x6 609 .long 15 # 0xf 610 .LCPI0_108: 611 .long 0 # 0x0 612 .long 1 # 0x1 613 .long 10 # 0xa 614 .long 19 # 0x13 615 .long 0 # 0x0 616 .long 5 # 0x5 617 .long 14 # 0xe 618 .long 23 # 0x17 619 .LCPI0_111: 620 .long 0 # 0x0 621 .long 7 # 0x7 622 .long 14 # 0xe 623 .long 21 # 0x15 624 .long 0 # 0x0 625 .long 3 # 0x3 626 .long 10 # 0xa 627 .long 17 # 0x11 628 .LCPI0_113: 629 .long 24 # 0x18 630 .long 0 # 0x0 631 .long 6 # 0x6 632 .long 13 # 0xd 633 .long 20 # 0x14 634 .long 0 # 0x0 635 .long 2 # 0x2 636 .long 9 # 0x9 637 .LCPI0_114: 638 .long 16 # 0x10 639 .long 23 # 0x17 640 .long 0 # 0x0 641 .long 5 # 0x5 642 .long 12 # 0xc 643 .long 19 # 0x13 644 .long 0 # 0x0 645 .long 1 # 0x1 646 .LCPI0_115: 647 .long 8 # 0x8 648 .long 15 # 0xf 649 .long 22 # 0x16 650 .long 0 # 0x0 651 .long 4 # 0x4 652 .long 11 # 0xb 653 .long 18 # 0x12 654 .long 25 # 0x19 655 .LCPI0_116: 656 .long 0 # 0x0 657 .long 6 # 0x6 658 .long 12 # 0xc 659 .long 18 # 0x12 660 .long 24 # 0x18 661 .long 0 # 0x0 662 .long 4 # 0x4 663 .long 10 # 0xa 664 .LCPI0_118: 665 .long 16 # 0x10 666 .long 22 # 0x16 667 .long 0 # 0x0 668 .long 2 # 0x2 669 .long 8 # 0x8 670 .long 14 # 0xe 671 .long 20 # 0x14 672 .long 26 # 0x1a 673 .LCPI0_119: 674 .long 0 # 0x0 675 .long 5 # 0x5 676 .long 10 # 0xa 677 .long 15 # 0xf 678 .long 20 # 0x14 679 .long 25 # 0x19 680 .long 0 # 0x0 681 .long 3 # 0x3 682 .LCPI0_121: 683 .long 8 # 0x8 684 .long 13 # 0xd 685 .long 18 # 0x12 686 .long 23 # 0x17 687 .long 0 # 0x0 688 .long 1 # 0x1 689 .long 6 # 0x6 690 .long 11 # 0xb 691 .LCPI0_122: 692 .long 16 # 0x10 693 .long 21 # 0x15 694 .long 26 # 0x1a 695 .long 0 # 0x0 696 .long 4 # 0x4 697 .long 9 # 0x9 698 .long 14 # 0xe 699 .long 19 # 0x13 700 .LCPI0_123: 701 .long 24 # 0x18 702 .long 0 # 0x0 703 .long 2 # 0x2 704 .long 7 # 0x7 705 .long 12 # 0xc 706 .long 17 # 0x11 707 .long 22 # 0x16 708 .long 27 # 0x1b 709 .LCPI0_124: 710 .long 0 # 0x0 711 .long 4 # 0x4 712 .long 8 # 0x8 713 .long 12 # 0xc 714 .long 16 # 0x10 715 .long 20 # 0x14 716 .long 24 # 0x18 717 .long 28 # 0x1c 718 .LCPI0_126: 719 .long 0 # 0x0 720 .long 3 # 0x3 721 .long 6 # 0x6 722 .long 9 # 0x9 723 .long 12 # 0xc 724 .long 15 # 0xf 725 .long 18 # 0x12 726 .long 21 # 0x15 727 .LCPI0_128: 728 .long 24 # 0x18 729 .long 27 # 0x1b 730 .long 0 # 0x0 731 .long 1 # 0x1 732 .long 4 # 0x4 733 .long 7 # 0x7 734 .long 10 # 0xa 735 .long 13 # 0xd 736 .LCPI0_129: 737 .long 16 # 0x10 738 .long 19 # 0x13 739 .long 22 # 0x16 740 .long 25 # 0x19 741 .long 28 # 0x1c 742 .long 0 # 0x0 743 .long 2 # 0x2 744 .long 5 # 0x5 745 .LCPI0_130: 746 .long 8 # 0x8 747 .long 11 # 0xb 748 .long 14 # 0xe 749 .long 17 # 0x11 750 .long 20 # 0x14 751 .long 23 # 0x17 752 .long 26 # 0x1a 753 .long 29 # 0x1d 754 .LCPI0_131: 755 .long 0 # 0x0 756 .long 2 # 0x2 757 .long 4 # 0x4 758 .long 6 # 0x6 759 .long 8 # 0x8 760 .long 10 # 0xa 761 .long 12 # 0xc 762 .long 14 # 0xe 763 .LCPI0_133: 764 .long 16 # 0x10 765 .long 18 # 0x12 766 .long 20 # 0x14 767 .long 22 # 0x16 768 .long 24 # 0x18 769 .long 26 # 0x1a 770 .long 28 # 0x1c 771 .long 30 # 0x1e 772 .LCPI0_134: 773 .long 0 # 0x0 774 .long 1 # 0x1 775 .long 2 # 0x2 776 .long 3 # 0x3 777 .long 4 # 0x4 778 .long 5 # 0x5 779 .long 6 # 0x6 780 .long 7 # 0x7 781 .LCPI0_136: 782 .long 24 # 0x18 783 .long 25 # 0x19 784 .long 26 # 0x1a 785 .long 27 # 0x1b 786 .long 28 # 0x1c 787 .long 29 # 0x1d 788 .long 30 # 0x1e 789 .long 31 # 0x1f 790 .section .rodata.cst16,"aM",@progbits,16 791 .p2align 4 792 .LCPI0_5: 793 .long 8 # 0x8 794 .long 7 # 0x7 795 .long 6 # 0x6 796 .long 5 # 0x5 797 .LCPI0_6: 798 .long 24 # 0x18 799 .long 25 # 0x19 800 .long 26 # 0x1a 801 .long 27 # 0x1b 802 .LCPI0_9: 803 .long 16 # 0x10 804 .long 14 # 0xe 805 .long 12 # 0xc 806 .long 10 # 0xa 807 .LCPI0_10: 808 .long 16 # 0x10 809 .long 18 # 0x12 810 .long 20 # 0x14 811 .long 22 # 0x16 812 .LCPI0_13: 813 .long 8 # 0x8 814 .long 5 # 0x5 815 .zero 4 816 .zero 4 817 .LCPI0_14: 818 .long 24 # 0x18 819 .long 27 # 0x1b 820 .zero 4 821 .zero 4 822 .LCPI0_16: 823 .long 16 # 0x10 824 .long 13 # 0xd 825 .long 10 # 0xa 826 .long 7 # 0x7 827 .LCPI0_17: 828 .long 16 # 0x10 829 .long 19 # 0x13 830 .long 22 # 0x16 831 .long 25 # 0x19 832 .LCPI0_19: 833 .long 24 # 0x18 834 .long 21 # 0x15 835 .long 18 # 0x12 836 .long 15 # 0xf 837 .LCPI0_20: 838 .long 8 # 0x8 839 .long 11 # 0xb 840 .long 14 # 0xe 841 .long 17 # 0x11 842 .LCPI0_26: 843 .long 24 # 0x18 844 .long 19 # 0x13 845 .long 14 # 0xe 846 .long 9 # 0x9 847 .LCPI0_27: 848 .long 8 # 0x8 849 .long 13 # 0xd 850 .long 18 # 0x12 851 .long 23 # 0x17 852 .LCPI0_29: 853 .long 16 # 0x10 854 .long 11 # 0xb 855 .zero 4 856 .zero 4 857 .LCPI0_30: 858 .long 16 # 0x10 859 .long 21 # 0x15 860 .zero 4 861 .zero 4 862 .LCPI0_40: 863 .long 16 # 0x10 864 .long 9 # 0x9 865 .zero 4 866 .zero 4 867 .LCPI0_41: 868 .long 16 # 0x10 869 .long 23 # 0x17 870 .zero 4 871 .zero 4 872 .LCPI0_43: 873 .long 24 # 0x18 874 .long 17 # 0x11 875 .zero 4 876 .zero 4 877 .LCPI0_44: 878 .long 8 # 0x8 879 .long 15 # 0xf 880 .zero 4 881 .zero 4 882 .LCPI0_46: 883 .long 0 # 0x0 884 .long 0 # 0x0 885 .long 0 # 0x0 886 .long 8 # 0x8 887 .LCPI0_50: 888 .long 24 # 0x18 889 .long 15 # 0xf 890 .zero 4 891 .zero 4 892 .LCPI0_51: 893 .long 8 # 0x8 894 .long 17 # 0x11 895 .zero 4 896 .zero 4 897 .LCPI0_62: 898 .long 24 # 0x18 899 .long 13 # 0xd 900 .zero 4 901 .zero 4 902 .LCPI0_63: 903 .long 8 # 0x8 904 .long 19 # 0x13 905 .zero 4 906 .zero 4 907 .LCPI0_109: 908 .long 0 # 0x0 909 .long 8 # 0x8 910 .long 16 # 0x10 911 .long 24 # 0x18 912 .section .rodata.cst4,"aM",@progbits,4 913 .p2align 2 914 .LCPI0_47: 915 .long 16777215 # 0xffffff 916 .LCPI0_110: 917 .long 255 # 0xff 918 .text 919 .globl unpack32_avx2 920 .p2align 4, 0x90 921 .type unpack32_avx2,@function 922 unpack32_avx2: # @unpack32_avx2 923 # %bb.0: 924 push rbp 925 mov rbp, rsp 926 push r15 927 push r14 928 push r12 929 push rbx 930 and rsp, -16 931 # kill: def $edx killed $edx def $rdx 932 mov r15, rsi 933 mov rbx, rdi 934 lea r14d, [rdx + 31] 935 test edx, edx 936 cmovns r14d, edx 937 sar r14d, 5 938 cmp ecx, 15 939 jle .LBB0_1 940 # %bb.48: 941 cmp ecx, 23 942 jle .LBB0_49 943 # %bb.72: 944 cmp ecx, 27 945 jle .LBB0_73 946 # %bb.84: 947 cmp ecx, 29 948 jle .LBB0_85 949 # %bb.90: 950 cmp ecx, 30 951 je .LBB0_99 952 # %bb.91: 953 cmp ecx, 31 954 je .LBB0_96 955 # %bb.92: 956 cmp ecx, 32 957 jne .LBB0_147 958 # %bb.93: 959 cmp edx, 32 960 jl .LBB0_147 961 # %bb.94: 962 mov r12d, r14d 963 .p2align 4, 0x90 964 .LBB0_95: # =>This Inner Loop Header: Depth=1 965 mov edx, 128 966 mov rdi, r15 967 mov rsi, rbx 968 call clib·_memcpy(SB) 969 sub rbx, -128 970 sub r15, -128 971 add r12, -1 972 jne .LBB0_95 973 jmp .LBB0_147 974 .LBB0_1: 975 cmp ecx, 7 976 jg .LBB0_25 977 # %bb.2: 978 cmp ecx, 3 979 jg .LBB0_14 980 # %bb.3: 981 cmp ecx, 1 982 jg .LBB0_9 983 # %bb.4: 984 test ecx, ecx 985 je .LBB0_144 986 # %bb.5: 987 cmp ecx, 1 988 jne .LBB0_147 989 # %bb.6: 990 cmp edx, 32 991 jl .LBB0_147 992 # %bb.7: 993 mov eax, r14d 994 add r15, 96 995 xor ecx, ecx 996 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_135] # ymm0 = [4294967297,4294967297,4294967297,4294967297] 997 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_134] # ymm1 = [0,1,2,3,4,5,6,7] 998 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_2] # ymm2 = [8,9,10,11,12,13,14,15] 999 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_4] # ymm3 = [16,17,18,19,20,21,22,23] 1000 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_136] # ymm4 = [24,25,26,27,28,29,30,31] 1001 .p2align 4, 0x90 1002 .LBB0_8: # =>This Inner Loop Header: Depth=1 1003 vpbroadcastd ymm5, dword ptr [rbx + 4*rcx] 1004 vpsrlvd ymm5, ymm5, ymm1 1005 vpand ymm5, ymm5, ymm0 1006 vmovdqu ymmword ptr [r15 - 96], ymm5 1007 vpbroadcastd ymm5, dword ptr [rbx + 4*rcx] 1008 vpsrlvd ymm5, ymm5, ymm2 1009 vpand ymm5, ymm5, ymm0 1010 vmovdqu ymmword ptr [r15 - 64], ymm5 1011 vpbroadcastd ymm5, dword ptr [rbx + 4*rcx] 1012 vpsrlvd ymm5, ymm5, ymm3 1013 vpand ymm5, ymm5, ymm0 1014 vmovdqu ymmword ptr [r15 - 32], ymm5 1015 vpbroadcastd ymm5, dword ptr [rbx + 4*rcx] 1016 vpsrlvd ymm5, ymm5, ymm4 1017 vpand ymm5, ymm5, ymm0 1018 vmovdqu ymmword ptr [r15], ymm5 1019 add rcx, 1 1020 sub r15, -128 1021 cmp rax, rcx 1022 jne .LBB0_8 1023 jmp .LBB0_147 1024 .LBB0_49: 1025 cmp ecx, 19 1026 jg .LBB0_61 1027 # %bb.50: 1028 cmp ecx, 17 1029 jg .LBB0_56 1030 # %bb.51: 1031 cmp ecx, 16 1032 je .LBB0_120 1033 # %bb.52: 1034 cmp ecx, 17 1035 jne .LBB0_147 1036 # %bb.53: 1037 cmp edx, 32 1038 jl .LBB0_147 1039 # %bb.54: 1040 mov r8d, r14d 1041 add r15, 96 1042 add rbx, 64 1043 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_76] # ymm0 = [562945658585087,562945658585087,562945658585087,562945658585087] 1044 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_75] # ymm1 = [0,0,2,0,4,0,6,0] 1045 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_77] # ymm2 = [8,0,10,0,12,0,14,0] 1046 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_78] # ymm3 = [0,1,0,3,0,5,0,7] 1047 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_79] # ymm4 = [0,9,0,11,0,13,0,15] 1048 .p2align 4, 0x90 1049 .LBB0_55: # =>This Inner Loop Header: Depth=1 1050 mov ecx, dword ptr [rbx - 52] 1051 mov r10d, dword ptr [rbx - 48] 1052 shld r10d, ecx, 9 1053 mov esi, dword ptr [rbx - 56] 1054 mov edi, ecx 1055 shld edi, esi, 11 1056 mov r9d, dword ptr [rbx - 64] 1057 mov edx, dword ptr [rbx - 60] 1058 mov eax, edx 1059 shld eax, r9d, 15 1060 vmovd xmm5, esi 1061 shld esi, edx, 13 1062 vpinsrd xmm5, xmm5, edi, 1 1063 vpinsrd xmm5, xmm5, ecx, 2 1064 vpinsrd xmm5, xmm5, r10d, 3 1065 vmovd xmm6, r9d 1066 vpinsrd xmm6, xmm6, eax, 1 1067 vpinsrd xmm6, xmm6, edx, 2 1068 vpinsrd xmm6, xmm6, esi, 3 1069 vinserti128 ymm5, ymm6, xmm5, 1 1070 vpsrlvd ymm5, ymm5, ymm1 1071 vpand ymm5, ymm5, ymm0 1072 vmovdqu ymmword ptr [r15 - 96], ymm5 1073 mov eax, dword ptr [rbx - 36] 1074 mov r10d, dword ptr [rbx - 32] 1075 shld r10d, eax, 1 1076 mov edx, dword ptr [rbx - 40] 1077 mov esi, eax 1078 shld esi, edx, 3 1079 mov r9d, dword ptr [rbx - 48] 1080 mov ecx, dword ptr [rbx - 44] 1081 mov edi, ecx 1082 shld edi, r9d, 7 1083 vmovd xmm5, edx 1084 shld edx, ecx, 5 1085 vpinsrd xmm5, xmm5, esi, 1 1086 vpinsrd xmm5, xmm5, eax, 2 1087 vpinsrd xmm5, xmm5, r10d, 3 1088 vmovd xmm6, r9d 1089 vpinsrd xmm6, xmm6, edi, 1 1090 vpinsrd xmm6, xmm6, ecx, 2 1091 vpinsrd xmm6, xmm6, edx, 3 1092 vinserti128 ymm5, ymm6, xmm5, 1 1093 vpsrlvd ymm5, ymm5, ymm2 1094 vpand ymm5, ymm5, ymm0 1095 vmovdqu ymmword ptr [r15 - 64], ymm5 1096 mov r9d, dword ptr [rbx - 16] 1097 mov r11d, dword ptr [rbx - 20] 1098 mov edx, r9d 1099 shld edx, r11d, 10 1100 mov r10d, dword ptr [rbx - 24] 1101 mov edi, r11d 1102 shld edi, r10d, 12 1103 mov eax, dword ptr [rbx - 28] 1104 mov esi, r10d 1105 shld esi, eax, 14 1106 mov ecx, dword ptr [rbx - 32] 1107 shrd ecx, eax, 16 1108 vmovd xmm5, edi 1109 vpinsrd xmm5, xmm5, r11d, 1 1110 vpinsrd xmm5, xmm5, edx, 2 1111 vpinsrd xmm5, xmm5, r9d, 3 1112 vmovd xmm6, ecx 1113 vpinsrd xmm6, xmm6, eax, 1 1114 vpinsrd xmm6, xmm6, esi, 2 1115 vpinsrd xmm6, xmm6, r10d, 3 1116 vinserti128 ymm5, ymm6, xmm5, 1 1117 vpsrlvd ymm5, ymm5, ymm3 1118 vpand ymm5, ymm5, ymm0 1119 vmovdqu ymmword ptr [r15 - 32], ymm5 1120 mov r9d, dword ptr [rbx] 1121 mov r11d, dword ptr [rbx - 4] 1122 mov edx, r9d 1123 shld edx, r11d, 2 1124 mov r10d, dword ptr [rbx - 8] 1125 mov edi, r11d 1126 shld edi, r10d, 4 1127 mov eax, dword ptr [rbx - 16] 1128 mov esi, dword ptr [rbx - 12] 1129 mov ecx, r10d 1130 shld ecx, esi, 6 1131 shrd eax, esi, 24 1132 vmovd xmm5, edi 1133 vpinsrd xmm5, xmm5, r11d, 1 1134 vpinsrd xmm5, xmm5, edx, 2 1135 vpinsrd xmm5, xmm5, r9d, 3 1136 vmovd xmm6, eax 1137 vpinsrd xmm6, xmm6, esi, 1 1138 vpinsrd xmm6, xmm6, ecx, 2 1139 vpinsrd xmm6, xmm6, r10d, 3 1140 vinserti128 ymm5, ymm6, xmm5, 1 1141 vpsrlvd ymm5, ymm5, ymm4 1142 vpand ymm5, ymm5, ymm0 1143 vmovdqu ymmword ptr [r15], ymm5 1144 sub r15, -128 1145 add rbx, 68 1146 add r8, -1 1147 jne .LBB0_55 1148 jmp .LBB0_147 1149 .LBB0_25: 1150 cmp ecx, 11 1151 jg .LBB0_37 1152 # %bb.26: 1153 cmp ecx, 9 1154 jg .LBB0_32 1155 # %bb.27: 1156 cmp ecx, 8 1157 je .LBB0_132 1158 # %bb.28: 1159 cmp ecx, 9 1160 jne .LBB0_147 1161 # %bb.29: 1162 cmp edx, 32 1163 jl .LBB0_147 1164 # %bb.30: 1165 mov r8d, r14d 1166 add r15, 96 1167 add rbx, 32 1168 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_105] # ymm0 = [2194728288767,2194728288767,2194728288767,2194728288767] 1169 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_104] # ymm1 = [0,9,18,0,4,13,22,0] 1170 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_106] # ymm2 = [8,17,0,3,12,21,0,7] 1171 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_107] # ymm3 = [16,0,2,11,20,0,6,15] 1172 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_108] # ymm4 = [0,1,10,19,0,5,14,23] 1173 .p2align 4, 0x90 1174 .LBB0_31: # =>This Inner Loop Header: Depth=1 1175 mov ecx, dword ptr [rbx - 32] 1176 mov edx, dword ptr [rbx - 28] 1177 mov esi, dword ptr [rbx - 24] 1178 shld esi, edx, 1 1179 vmovd xmm5, edx 1180 vpinsrd xmm5, xmm5, edx, 1 1181 vpinsrd xmm5, xmm5, edx, 2 1182 shld edx, ecx, 5 1183 vpinsrd xmm5, xmm5, esi, 3 1184 vmovd xmm6, ecx 1185 vpinsrd xmm6, xmm6, ecx, 1 1186 vpinsrd xmm6, xmm6, ecx, 2 1187 vpinsrd xmm6, xmm6, edx, 3 1188 vinserti128 ymm5, ymm6, xmm5, 1 1189 vpsrlvd ymm5, ymm5, ymm1 1190 vpand ymm5, ymm5, ymm0 1191 vmovdqu ymmword ptr [r15 - 96], ymm5 1192 mov ecx, dword ptr [rbx - 16] 1193 mov edx, dword ptr [rbx - 24] 1194 mov esi, dword ptr [rbx - 20] 1195 mov edi, ecx 1196 shld edi, esi, 2 1197 mov eax, esi 1198 shld eax, edx, 6 1199 vmovd xmm5, esi 1200 vpinsrd xmm5, xmm5, esi, 1 1201 vpinsrd xmm5, xmm5, edi, 2 1202 vpinsrd xmm5, xmm5, ecx, 3 1203 vmovd xmm6, edx 1204 vpinsrd xmm6, xmm6, edx, 1 1205 vpinsrd xmm6, xmm6, eax, 2 1206 vpinsrd xmm6, xmm6, esi, 3 1207 vinserti128 ymm5, ymm6, xmm5, 1 1208 vpsrlvd ymm5, ymm5, ymm2 1209 vpand ymm5, ymm5, ymm0 1210 vmovdqu ymmword ptr [r15 - 64], ymm5 1211 mov eax, dword ptr [rbx - 8] 1212 mov ecx, dword ptr [rbx - 16] 1213 mov edx, dword ptr [rbx - 12] 1214 mov esi, eax 1215 shld esi, edx, 3 1216 mov edi, edx 1217 shld edi, ecx, 7 1218 vmovd xmm5, edx 1219 vpinsrd xmm5, xmm5, esi, 1 1220 vpinsrd xmm5, xmm5, eax, 2 1221 vpinsrd xmm5, xmm5, eax, 3 1222 vmovd xmm6, ecx 1223 vpinsrd xmm6, xmm6, edi, 1 1224 vpinsrd xmm6, xmm6, edx, 2 1225 vpinsrd xmm6, xmm6, edx, 3 1226 vinserti128 ymm5, ymm6, xmm5, 1 1227 vpsrlvd ymm5, ymm5, ymm3 1228 vpand ymm5, ymm5, ymm0 1229 vmovdqu ymmword ptr [r15 - 32], ymm5 1230 mov eax, dword ptr [rbx] 1231 mov ecx, dword ptr [rbx - 8] 1232 mov edx, dword ptr [rbx - 4] 1233 mov esi, eax 1234 shld esi, edx, 4 1235 shrd ecx, edx, 24 1236 vmovd xmm5, esi 1237 vpinsrd xmm5, xmm5, eax, 1 1238 vpinsrd xmm5, xmm5, eax, 2 1239 vpinsrd xmm5, xmm5, eax, 3 1240 vmovd xmm6, ecx 1241 vpinsrd xmm6, xmm6, edx, 1 1242 vpinsrd xmm6, xmm6, edx, 2 1243 vpinsrd xmm6, xmm6, edx, 3 1244 vinserti128 ymm5, ymm6, xmm5, 1 1245 vpsrlvd ymm5, ymm5, ymm4 1246 vpand ymm5, ymm5, ymm0 1247 vmovdqu ymmword ptr [r15], ymm5 1248 sub r15, -128 1249 add rbx, 36 1250 add r8, -1 1251 jne .LBB0_31 1252 jmp .LBB0_147 1253 .LBB0_73: 1254 cmp ecx, 25 1255 jg .LBB0_79 1256 # %bb.74: 1257 cmp ecx, 24 1258 je .LBB0_108 1259 # %bb.75: 1260 cmp ecx, 25 1261 jne .LBB0_147 1262 # %bb.76: 1263 cmp edx, 32 1264 jl .LBB0_147 1265 # %bb.77: 1266 mov r8d, r14d 1267 add r15, 96 1268 add rbx, 96 1269 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_38] # ymm0 = [144115183814443007,144115183814443007,144115183814443007,144115183814443007] 1270 vmovdqa ymm9, ymmword ptr [rip + .LCPI0_28] # ymm9 = [0,0,0,0,4,0,0,0] 1271 vmovdqa ymm10, ymmword ptr [rip + .LCPI0_39] # ymm10 = [0,1,0,0,0,5,0,0] 1272 vmovdqa xmm11, xmmword ptr [rip + .LCPI0_40] # xmm11 = <16,9,u,u> 1273 vmovdqa xmm4, xmmword ptr [rip + .LCPI0_41] # xmm4 = <16,23,u,u> 1274 vmovdqa ymm5, ymmword ptr [rip + .LCPI0_42] # ymm5 = [0,0,2,0,0,0,6,0] 1275 vmovdqa xmm6, xmmword ptr [rip + .LCPI0_43] # xmm6 = <24,17,u,u> 1276 vmovdqa xmm7, xmmword ptr [rip + .LCPI0_44] # xmm7 = <8,15,u,u> 1277 vmovdqa ymm8, ymmword ptr [rip + .LCPI0_45] # ymm8 = [0,0,0,3,0,0,0,7] 1278 .p2align 4, 0x90 1279 .LBB0_78: # =>This Inner Loop Header: Depth=1 1280 mov ecx, dword ptr [rbx - 76] 1281 mov r9d, dword ptr [rbx - 72] 1282 shld r9d, ecx, 17 1283 mov esi, dword ptr [rbx - 80] 1284 shld ecx, esi, 10 1285 mov edi, dword ptr [rbx - 84] 1286 shld esi, edi, 3 1287 mov eax, dword ptr [rbx - 88] 1288 vmovd xmm1, edi 1289 shld edi, eax, 21 1290 mov r10d, dword ptr [rbx - 96] 1291 mov edx, dword ptr [rbx - 92] 1292 shld eax, edx, 14 1293 shld edx, r10d, 7 1294 vpinsrd xmm1, xmm1, esi, 1 1295 vmovd xmm2, r10d 1296 vpinsrd xmm1, xmm1, ecx, 2 1297 vpinsrd xmm2, xmm2, edx, 1 1298 vpinsrd xmm1, xmm1, r9d, 3 1299 vpinsrd xmm2, xmm2, eax, 2 1300 vpinsrd xmm2, xmm2, edi, 3 1301 vinserti128 ymm1, ymm2, xmm1, 1 1302 vpsrlvd ymm1, ymm1, ymm9 1303 vpand ymm1, ymm1, ymm0 1304 vmovdqu ymmword ptr [r15 - 96], ymm1 1305 mov r11d, dword ptr [rbx - 52] 1306 mov r9d, dword ptr [rbx - 48] 1307 shld r9d, r11d, 9 1308 mov r10d, dword ptr [rbx - 56] 1309 shld r11d, r10d, 2 1310 mov esi, dword ptr [rbx - 60] 1311 mov edi, r10d 1312 mov ecx, dword ptr [rbx - 64] 1313 shld edi, esi, 20 1314 mov edx, dword ptr [rbx - 72] 1315 mov eax, dword ptr [rbx - 68] 1316 shld esi, ecx, 13 1317 shrd edx, eax, 8 1318 shld ecx, eax, 6 1319 vmovd xmm1, edi 1320 vpinsrd xmm1, xmm1, r10d, 1 1321 vmovd xmm2, edx 1322 vpinsrd xmm1, xmm1, r11d, 2 1323 vpinsrd xmm2, xmm2, eax, 1 1324 vpinsrd xmm1, xmm1, r9d, 3 1325 vpinsrd xmm2, xmm2, ecx, 2 1326 vpinsrd xmm2, xmm2, esi, 3 1327 vinserti128 ymm1, ymm2, xmm1, 1 1328 vpsrlvd ymm1, ymm1, ymm10 1329 vpand ymm1, ymm1, ymm0 1330 vmovdqu ymmword ptr [r15 - 64], ymm1 1331 mov eax, dword ptr [rbx - 28] 1332 mov r9d, dword ptr [rbx - 24] 1333 shld r9d, eax, 1 1334 mov edx, dword ptr [rbx - 32] 1335 mov esi, eax 1336 shld esi, edx, 19 1337 mov edi, dword ptr [rbx - 40] 1338 mov ecx, dword ptr [rbx - 36] 1339 shld edx, ecx, 12 1340 shld ecx, edi, 5 1341 vmovq xmm1, qword ptr [rbx - 48] # xmm1 = mem[0],zero 1342 vpsrlvd xmm2, xmm1, xmm11 1343 vpshufd xmm1, xmm1, 229 # xmm1 = xmm1[1,1,2,3] 1344 vpinsrd xmm1, xmm1, edi, 1 1345 vpsllvd xmm1, xmm1, xmm4 1346 vpor xmm1, xmm2, xmm1 1347 vmovd xmm2, edx 1348 vpinsrd xmm2, xmm2, esi, 1 1349 vpinsrd xmm2, xmm2, eax, 2 1350 vpinsrd xmm2, xmm2, r9d, 3 1351 vpinsrd xmm1, xmm1, edi, 2 1352 vpinsrd xmm1, xmm1, ecx, 3 1353 vinserti128 ymm1, ymm1, xmm2, 1 1354 vpsrlvd ymm1, ymm1, ymm5 1355 vpand ymm1, ymm1, ymm0 1356 vmovdqu ymmword ptr [r15 - 32], ymm1 1357 mov r9d, dword ptr [rbx] 1358 mov ecx, dword ptr [rbx - 4] 1359 mov edx, r9d 1360 shld edx, ecx, 18 1361 mov esi, dword ptr [rbx - 8] 1362 shld ecx, esi, 11 1363 mov r10d, dword ptr [rbx - 16] 1364 mov edi, dword ptr [rbx - 12] 1365 shld esi, edi, 4 1366 mov eax, edi 1367 shld eax, r10d, 22 1368 vmovq xmm1, qword ptr [rbx - 24] # xmm1 = mem[0],zero 1369 vpsrlvd xmm2, xmm1, xmm6 1370 vpshufd xmm1, xmm1, 229 # xmm1 = xmm1[1,1,2,3] 1371 vpinsrd xmm1, xmm1, r10d, 1 1372 vpsllvd xmm1, xmm1, xmm7 1373 vmovd xmm3, esi 1374 vpinsrd xmm3, xmm3, ecx, 1 1375 vpor xmm1, xmm2, xmm1 1376 vpinsrd xmm2, xmm3, edx, 2 1377 vpinsrd xmm2, xmm2, r9d, 3 1378 vpinsrd xmm1, xmm1, eax, 2 1379 vpinsrd xmm1, xmm1, edi, 3 1380 vinserti128 ymm1, ymm1, xmm2, 1 1381 vpsrlvd ymm1, ymm1, ymm8 1382 vpand ymm1, ymm1, ymm0 1383 vmovdqu ymmword ptr [r15], ymm1 1384 sub r15, -128 1385 add rbx, 100 1386 add r8, -1 1387 jne .LBB0_78 1388 jmp .LBB0_147 1389 .LBB0_14: 1390 cmp ecx, 5 1391 jg .LBB0_20 1392 # %bb.15: 1393 cmp ecx, 4 1394 je .LBB0_138 1395 # %bb.16: 1396 cmp ecx, 5 1397 jne .LBB0_147 1398 # %bb.17: 1399 cmp edx, 32 1400 jl .LBB0_147 1401 # %bb.18: 1402 mov eax, r14d 1403 add r15, 96 1404 add rbx, 16 1405 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_120] # ymm0 = [133143986207,133143986207,133143986207,133143986207] 1406 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_119] # ymm1 = [0,5,10,15,20,25,0,3] 1407 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_121] # ymm2 = [8,13,18,23,0,1,6,11] 1408 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_122] # ymm3 = [16,21,26,0,4,9,14,19] 1409 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_123] # ymm4 = [24,0,2,7,12,17,22,27] 1410 .p2align 4, 0x90 1411 .LBB0_19: # =>This Inner Loop Header: Depth=1 1412 mov ecx, dword ptr [rbx - 16] 1413 mov edx, dword ptr [rbx - 12] 1414 mov esi, edx 1415 shld esi, ecx, 2 1416 vmovd xmm5, ecx 1417 vpbroadcastd xmm6, xmm5 1418 vpinsrd xmm5, xmm5, ecx, 1 1419 vpinsrd xmm5, xmm5, esi, 2 1420 vpinsrd xmm5, xmm5, edx, 3 1421 vinserti128 ymm5, ymm6, xmm5, 1 1422 vpsrlvd ymm5, ymm5, ymm1 1423 vpand ymm5, ymm5, ymm0 1424 vmovdqu ymmword ptr [r15 - 96], ymm5 1425 mov ecx, dword ptr [rbx - 12] 1426 mov edx, dword ptr [rbx - 8] 1427 mov esi, edx 1428 shld esi, ecx, 4 1429 vmovd xmm5, ecx 1430 vpbroadcastd xmm5, xmm5 1431 vmovd xmm6, esi 1432 vpinsrd xmm6, xmm6, edx, 1 1433 vpinsrd xmm6, xmm6, edx, 2 1434 vpinsrd xmm6, xmm6, edx, 3 1435 vinserti128 ymm5, ymm5, xmm6, 1 1436 vpsrlvd ymm5, ymm5, ymm2 1437 vpand ymm5, ymm5, ymm0 1438 vmovdqu ymmword ptr [r15 - 64], ymm5 1439 mov ecx, dword ptr [rbx - 8] 1440 mov edx, dword ptr [rbx - 4] 1441 vmovd xmm5, edx 1442 shld edx, ecx, 1 1443 vmovd xmm6, ecx 1444 vpinsrd xmm6, xmm6, ecx, 1 1445 vpinsrd xmm6, xmm6, ecx, 2 1446 vpinsrd xmm6, xmm6, edx, 3 1447 vpbroadcastd xmm5, xmm5 1448 vinserti128 ymm5, ymm6, xmm5, 1 1449 vpsrlvd ymm5, ymm5, ymm3 1450 vpand ymm5, ymm5, ymm0 1451 vmovdqu ymmword ptr [r15 - 32], ymm5 1452 mov ecx, dword ptr [rbx - 4] 1453 mov edx, dword ptr [rbx] 1454 mov esi, edx 1455 shld esi, ecx, 3 1456 vmovd xmm5, ecx 1457 vpinsrd xmm5, xmm5, esi, 1 1458 vpinsrd xmm5, xmm5, edx, 2 1459 vpinsrd xmm5, xmm5, edx, 3 1460 vmovd xmm6, edx 1461 vpbroadcastd xmm6, xmm6 1462 vinserti128 ymm5, ymm5, xmm6, 1 1463 vpsrlvd ymm5, ymm5, ymm4 1464 vpand ymm5, ymm5, ymm0 1465 vmovdqu ymmword ptr [r15], ymm5 1466 sub r15, -128 1467 add rbx, 20 1468 add rax, -1 1469 jne .LBB0_19 1470 jmp .LBB0_147 1471 .LBB0_61: 1472 cmp ecx, 21 1473 jg .LBB0_67 1474 # %bb.62: 1475 cmp ecx, 20 1476 je .LBB0_114 1477 # %bb.63: 1478 cmp ecx, 21 1479 jne .LBB0_147 1480 # %bb.64: 1481 cmp edx, 32 1482 jl .LBB0_147 1483 # %bb.65: 1484 mov r8d, r14d 1485 add r15, 96 1486 add rbx, 80 1487 vmovdqa ymm8, ymmword ptr [rip + .LCPI0_58] # ymm8 = [0,0,10,0,0,9,0,0] 1488 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_59] # ymm1 = [9007194961870847,9007194961870847,9007194961870847,9007194961870847] 1489 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_60] # ymm2 = [8,0,0,7,0,0,6,0] 1490 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_61] # ymm3 = [0,5,0,0,4,0,0,3] 1491 vmovdqa xmm4, xmmword ptr [rip + .LCPI0_62] # xmm4 = <24,13,u,u> 1492 vmovdqa xmm5, xmmword ptr [rip + .LCPI0_63] # xmm5 = <8,19,u,u> 1493 vmovdqa ymm6, ymmword ptr [rip + .LCPI0_64] # ymm6 = [0,0,2,0,0,1,0,11] 1494 .p2align 4, 0x90 1495 .LBB0_66: # =>This Inner Loop Header: Depth=1 1496 mov ecx, dword ptr [rbx - 64] 1497 mov r9d, dword ptr [rbx - 60] 1498 shld r9d, ecx, 13 1499 mov r11d, dword ptr [rbx - 68] 1500 shld ecx, r11d, 2 1501 mov edi, dword ptr [rbx - 72] 1502 mov esi, r11d 1503 shld esi, edi, 12 1504 mov r10d, dword ptr [rbx - 80] 1505 mov eax, dword ptr [rbx - 76] 1506 shld edi, eax, 1 1507 mov edx, eax 1508 shld edx, r10d, 11 1509 vmovd xmm7, r10d 1510 vmovd xmm0, esi 1511 vpinsrd xmm7, xmm7, edx, 1 1512 vpinsrd xmm0, xmm0, r11d, 1 1513 vpinsrd xmm7, xmm7, eax, 2 1514 vpinsrd xmm0, xmm0, ecx, 2 1515 vpinsrd xmm7, xmm7, edi, 3 1516 vpinsrd xmm0, xmm0, r9d, 3 1517 vinserti128 ymm0, ymm7, xmm0, 1 1518 vpsrlvd ymm0, ymm0, ymm8 1519 vpand ymm0, ymm0, ymm1 1520 vmovdqu ymmword ptr [r15 - 96], ymm0 1521 mov r10d, dword ptr [rbx - 44] 1522 mov r9d, dword ptr [rbx - 40] 1523 shld r9d, r10d, 5 1524 mov edx, dword ptr [rbx - 48] 1525 mov esi, r10d 1526 shld esi, edx, 15 1527 mov ecx, dword ptr [rbx - 52] 1528 shld edx, ecx, 4 1529 mov r11d, dword ptr [rbx - 60] 1530 mov eax, dword ptr [rbx - 56] 1531 mov edi, ecx 1532 shld edi, eax, 14 1533 shld eax, r11d, 3 1534 vmovd xmm0, r11d 1535 vmovd xmm7, edx 1536 vpinsrd xmm0, xmm0, eax, 1 1537 vpinsrd xmm7, xmm7, esi, 1 1538 vpinsrd xmm0, xmm0, edi, 2 1539 vpinsrd xmm7, xmm7, r10d, 2 1540 vpinsrd xmm0, xmm0, ecx, 3 1541 vpinsrd xmm7, xmm7, r9d, 3 1542 vinserti128 ymm0, ymm0, xmm7, 1 1543 vpsrlvd ymm0, ymm0, ymm2 1544 vpand ymm0, ymm0, ymm1 1545 vmovdqu ymmword ptr [r15 - 64], ymm0 1546 mov r9d, dword ptr [rbx - 20] 1547 mov ecx, dword ptr [rbx - 24] 1548 mov r10d, r9d 1549 shld r10d, ecx, 18 1550 mov esi, dword ptr [rbx - 28] 1551 shld ecx, esi, 7 1552 mov edi, dword ptr [rbx - 32] 1553 vmovd xmm0, esi 1554 shld esi, edi, 17 1555 mov eax, dword ptr [rbx - 40] 1556 mov edx, dword ptr [rbx - 36] 1557 shld edi, edx, 6 1558 shrd eax, edx, 16 1559 vpinsrd xmm0, xmm0, ecx, 1 1560 vmovd xmm7, eax 1561 vpinsrd xmm0, xmm0, r10d, 2 1562 vpinsrd xmm7, xmm7, edx, 1 1563 vpinsrd xmm0, xmm0, r9d, 3 1564 vpinsrd xmm7, xmm7, edi, 2 1565 vpinsrd xmm7, xmm7, esi, 3 1566 vinserti128 ymm0, ymm7, xmm0, 1 1567 vpsrlvd ymm0, ymm0, ymm3 1568 vpand ymm0, ymm0, ymm1 1569 vmovdqu ymmword ptr [r15 - 32], ymm0 1570 mov r9d, dword ptr [rbx] 1571 mov eax, dword ptr [rbx - 4] 1572 mov edx, r9d 1573 shld edx, eax, 10 1574 mov esi, dword ptr [rbx - 12] 1575 mov edi, dword ptr [rbx - 8] 1576 mov ecx, eax 1577 shld ecx, edi, 20 1578 shld edi, esi, 9 1579 vmovq xmm0, qword ptr [rbx - 20] # xmm0 = mem[0],zero 1580 vpsrlvd xmm7, xmm0, xmm4 1581 vpshufd xmm0, xmm0, 229 # xmm0 = xmm0[1,1,2,3] 1582 vpinsrd xmm0, xmm0, esi, 1 1583 vpsllvd xmm0, xmm0, xmm5 1584 vpor xmm0, xmm7, xmm0 1585 vmovd xmm7, ecx 1586 vpinsrd xmm7, xmm7, eax, 1 1587 vpinsrd xmm7, xmm7, edx, 2 1588 vpinsrd xmm7, xmm7, r9d, 3 1589 vpinsrd xmm0, xmm0, esi, 2 1590 vpinsrd xmm0, xmm0, edi, 3 1591 vinserti128 ymm0, ymm0, xmm7, 1 1592 vpsrlvd ymm0, ymm0, ymm6 1593 vpand ymm0, ymm0, ymm1 1594 vmovdqu ymmword ptr [r15], ymm0 1595 sub r15, -128 1596 add rbx, 84 1597 add r8, -1 1598 jne .LBB0_66 1599 jmp .LBB0_147 1600 .LBB0_37: 1601 cmp ecx, 13 1602 jg .LBB0_43 1603 # %bb.38: 1604 cmp ecx, 12 1605 je .LBB0_126 1606 # %bb.39: 1607 cmp ecx, 13 1608 jne .LBB0_147 1609 # %bb.40: 1610 cmp edx, 32 1611 jl .LBB0_147 1612 # %bb.41: 1613 mov r8d, r14d 1614 add r15, 96 1615 add rbx, 48 1616 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_90] # ymm0 = [35180077129727,35180077129727,35180077129727,35180077129727] 1617 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_89] # ymm1 = [0,13,0,7,0,1,14,0] 1618 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_91] # ymm2 = [8,0,2,15,0,9,0,3] 1619 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_92] # ymm3 = [16,0,10,0,4,17,0,11] 1620 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_93] # ymm4 = [0,5,18,0,12,0,6,19] 1621 .p2align 4, 0x90 1622 .LBB0_42: # =>This Inner Loop Header: Depth=1 1623 mov eax, dword ptr [rbx - 40] 1624 mov r9d, dword ptr [rbx - 36] 1625 shld r9d, eax, 5 1626 mov esi, dword ptr [rbx - 48] 1627 mov edx, dword ptr [rbx - 44] 1628 mov ecx, eax 1629 shld ecx, edx, 12 1630 mov edi, edx 1631 shld edi, esi, 6 1632 vmovd xmm5, ecx 1633 vpinsrd xmm5, xmm5, eax, 1 1634 vpinsrd xmm5, xmm5, eax, 2 1635 vpinsrd xmm5, xmm5, r9d, 3 1636 vmovd xmm6, esi 1637 vpinsrd xmm6, xmm6, esi, 1 1638 vpinsrd xmm6, xmm6, edi, 2 1639 vpinsrd xmm6, xmm6, edx, 3 1640 vinserti128 ymm5, ymm6, xmm5, 1 1641 vpsrlvd ymm5, ymm5, ymm1 1642 vpand ymm5, ymm5, ymm0 1643 vmovdqu ymmword ptr [r15 - 96], ymm5 1644 mov r9d, dword ptr [rbx - 24] 1645 mov ecx, dword ptr [rbx - 28] 1646 mov edx, r9d 1647 shld edx, ecx, 10 1648 mov esi, dword ptr [rbx - 32] 1649 mov edi, ecx 1650 shld edi, esi, 4 1651 mov r10d, dword ptr [rbx - 36] 1652 mov eax, esi 1653 shld eax, r10d, 11 1654 vmovd xmm5, edi 1655 vpinsrd xmm5, xmm5, ecx, 1 1656 vpinsrd xmm5, xmm5, edx, 2 1657 vpinsrd xmm5, xmm5, r9d, 3 1658 vmovd xmm6, r10d 1659 vpinsrd xmm6, xmm6, eax, 1 1660 vpinsrd xmm6, xmm6, esi, 2 1661 vpinsrd xmm6, xmm6, esi, 3 1662 vinserti128 ymm5, ymm6, xmm5, 1 1663 vpsrlvd ymm5, ymm5, ymm2 1664 vpand ymm5, ymm5, ymm0 1665 vmovdqu ymmword ptr [r15 - 64], ymm5 1666 mov r9d, dword ptr [rbx - 12] 1667 mov ecx, dword ptr [rbx - 16] 1668 mov edx, r9d 1669 shld edx, ecx, 2 1670 mov esi, dword ptr [rbx - 24] 1671 mov eax, dword ptr [rbx - 20] 1672 vmovd xmm5, ecx 1673 vpinsrd xmm5, xmm5, ecx, 1 1674 shld ecx, eax, 9 1675 mov edi, eax 1676 shld edi, esi, 3 1677 vpinsrd xmm5, xmm5, edx, 2 1678 vpinsrd xmm5, xmm5, r9d, 3 1679 vmovd xmm6, esi 1680 vpinsrd xmm6, xmm6, edi, 1 1681 vpinsrd xmm6, xmm6, eax, 2 1682 vpinsrd xmm6, xmm6, ecx, 3 1683 vinserti128 ymm5, ymm6, xmm5, 1 1684 vpsrlvd ymm5, ymm5, ymm3 1685 vpand ymm5, ymm5, ymm0 1686 vmovdqu ymmword ptr [r15 - 32], ymm5 1687 mov eax, dword ptr [rbx] 1688 mov ecx, dword ptr [rbx - 4] 1689 mov edx, eax 1690 shld edx, ecx, 7 1691 mov esi, dword ptr [rbx - 8] 1692 vmovd xmm5, ecx 1693 shld ecx, esi, 1 1694 mov edi, dword ptr [rbx - 12] 1695 shrd edi, esi, 24 1696 vmovd xmm6, edi 1697 vpinsrd xmm6, xmm6, esi, 1 1698 vpinsrd xmm6, xmm6, esi, 2 1699 vpinsrd xmm6, xmm6, ecx, 3 1700 vpinsrd xmm5, xmm5, edx, 1 1701 vpinsrd xmm5, xmm5, eax, 2 1702 vpinsrd xmm5, xmm5, eax, 3 1703 vinserti128 ymm5, ymm6, xmm5, 1 1704 vpsrlvd ymm5, ymm5, ymm4 1705 vpand ymm5, ymm5, ymm0 1706 vmovdqu ymmword ptr [r15], ymm5 1707 sub r15, -128 1708 add rbx, 52 1709 add r8, -1 1710 jne .LBB0_42 1711 jmp .LBB0_147 1712 .LBB0_85: 1713 cmp ecx, 28 1714 je .LBB0_102 1715 # %bb.86: 1716 cmp ecx, 29 1717 jne .LBB0_147 1718 # %bb.87: 1719 cmp edx, 32 1720 jl .LBB0_147 1721 # %bb.88: 1722 mov r8d, r14d 1723 add r15, 96 1724 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_12] # ymm0 = [2305843005455597567,2305843005455597567,2305843005455597567,2305843005455597567] 1725 vmovdqa xmm8, xmmword ptr [rip + .LCPI0_13] # xmm8 = <8,5,u,u> 1726 vmovdqa xmm10, xmmword ptr [rip + .LCPI0_14] # xmm10 = <24,27,u,u> 1727 vmovdqa ymm11, ymmword ptr [rip + .LCPI0_15] # ymm11 = [0,0,2,0,0,0,0,0] 1728 vmovdqa xmm12, xmmword ptr [rip + .LCPI0_16] # xmm12 = [16,13,10,7] 1729 vmovdqa xmm5, xmmword ptr [rip + .LCPI0_17] # xmm5 = [16,19,22,25] 1730 vmovdqa ymm6, ymmword ptr [rip + .LCPI0_18] # ymm6 = [0,0,0,0,0,1,0,0] 1731 vmovdqa xmm7, xmmword ptr [rip + .LCPI0_19] # xmm7 = [24,21,18,15] 1732 vmovdqa xmm1, xmmword ptr [rip + .LCPI0_20] # xmm1 = [8,11,14,17] 1733 vmovdqa ymm9, ymmword ptr [rip + .LCPI0_21] # ymm9 = [0,0,0,0,0,0,0,3] 1734 .p2align 4, 0x90 1735 .LBB0_89: # =>This Inner Loop Header: Depth=1 1736 mov r11d, dword ptr [rbx + 24] 1737 mov r9d, dword ptr [rbx + 28] 1738 shld r9d, r11d, 21 1739 mov esi, dword ptr [rbx + 20] 1740 shld r11d, esi, 18 1741 mov edi, dword ptr [rbx + 16] 1742 shld esi, edi, 15 1743 mov eax, dword ptr [rbx + 12] 1744 shld edi, eax, 12 1745 mov edx, dword ptr [rbx + 8] 1746 shld eax, edx, 9 1747 mov r10d, dword ptr [rbx] 1748 mov ecx, dword ptr [rbx + 4] 1749 shld edx, ecx, 6 1750 shld ecx, r10d, 3 1751 vmovd xmm2, r10d 1752 vmovd xmm3, edi 1753 vpinsrd xmm2, xmm2, ecx, 1 1754 vpinsrd xmm3, xmm3, esi, 1 1755 vpinsrd xmm2, xmm2, edx, 2 1756 vpinsrd xmm3, xmm3, r11d, 2 1757 vpinsrd xmm2, xmm2, eax, 3 1758 vpinsrd xmm3, xmm3, r9d, 3 1759 vinserti128 ymm2, ymm2, xmm3, 1 1760 vpand ymm2, ymm2, ymm0 1761 vmovdqu ymmword ptr [r15 - 96], ymm2 1762 mov eax, dword ptr [rbx + 52] 1763 mov r9d, dword ptr [rbx + 56] 1764 shld r9d, eax, 13 1765 mov edx, dword ptr [rbx + 48] 1766 shld eax, edx, 10 1767 mov esi, dword ptr [rbx + 44] 1768 shld edx, esi, 7 1769 mov edi, dword ptr [rbx + 36] 1770 mov ecx, dword ptr [rbx + 40] 1771 shld esi, ecx, 4 1772 shld ecx, edi, 1 1773 vmovq xmm2, qword ptr [rbx + 28] # xmm2 = mem[0],zero 1774 vpsrlvd xmm3, xmm2, xmm8 1775 vpshufd xmm2, xmm2, 229 # xmm2 = xmm2[1,1,2,3] 1776 vpinsrd xmm2, xmm2, edi, 1 1777 vpsllvd xmm2, xmm2, xmm10 1778 vpor xmm2, xmm3, xmm2 1779 vmovd xmm3, esi 1780 vpinsrd xmm3, xmm3, edx, 1 1781 vpinsrd xmm3, xmm3, eax, 2 1782 vpinsrd xmm3, xmm3, r9d, 3 1783 vpinsrd xmm2, xmm2, edi, 2 1784 vpinsrd xmm2, xmm2, ecx, 3 1785 vinserti128 ymm2, ymm2, xmm3, 1 1786 vpsrlvd ymm2, ymm2, ymm11 1787 vpand ymm2, ymm2, ymm0 1788 vmovdqu ymmword ptr [r15 - 64], ymm2 1789 mov eax, dword ptr [rbx + 80] 1790 mov ecx, dword ptr [rbx + 84] 1791 shld ecx, eax, 5 1792 mov edx, dword ptr [rbx + 76] 1793 mov esi, dword ptr [rbx + 72] 1794 shld eax, edx, 2 1795 mov edi, edx 1796 shld edi, esi, 28 1797 vmovdqu xmm2, xmmword ptr [rbx + 56] 1798 vpsrlvd xmm3, xmm2, xmm12 1799 vpshufd xmm2, xmm2, 249 # xmm2 = xmm2[1,2,3,3] 1800 vpinsrd xmm2, xmm2, esi, 3 1801 vmovd xmm4, edi 1802 vpinsrd xmm4, xmm4, edx, 1 1803 vpinsrd xmm4, xmm4, eax, 2 1804 vpsllvd xmm2, xmm2, xmm5 1805 vpinsrd xmm4, xmm4, ecx, 3 1806 vpor xmm2, xmm3, xmm2 1807 vinserti128 ymm2, ymm2, xmm4, 1 1808 vpsrlvd ymm2, ymm2, ymm6 1809 vpand ymm2, ymm2, ymm0 1810 vmovdqu ymmword ptr [r15 - 32], ymm2 1811 mov eax, dword ptr [rbx + 112] 1812 mov ecx, dword ptr [rbx + 108] 1813 mov edx, eax 1814 shld edx, ecx, 26 1815 mov esi, dword ptr [rbx + 104] 1816 shld ecx, esi, 23 1817 mov edi, dword ptr [rbx + 100] 1818 vmovdqu xmm2, xmmword ptr [rbx + 84] 1819 shld esi, edi, 20 1820 vpsrlvd xmm3, xmm2, xmm7 1821 vpshufd xmm2, xmm2, 249 # xmm2 = xmm2[1,2,3,3] 1822 vpinsrd xmm2, xmm2, edi, 3 1823 vmovd xmm4, esi 1824 vpinsrd xmm4, xmm4, ecx, 1 1825 vpsllvd xmm2, xmm2, xmm1 1826 vpinsrd xmm4, xmm4, edx, 2 1827 vpinsrd xmm4, xmm4, eax, 3 1828 vpor xmm2, xmm3, xmm2 1829 vinserti128 ymm2, ymm2, xmm4, 1 1830 vpsrlvd ymm2, ymm2, ymm9 1831 vpand ymm2, ymm2, ymm0 1832 vmovdqu ymmword ptr [r15], ymm2 1833 add rbx, 116 1834 sub r15, -128 1835 add r8, -1 1836 jne .LBB0_89 1837 jmp .LBB0_147 1838 .LBB0_9: 1839 cmp ecx, 2 1840 je .LBB0_141 1841 # %bb.10: 1842 cmp ecx, 3 1843 jne .LBB0_147 1844 # %bb.11: 1845 cmp edx, 32 1846 jl .LBB0_147 1847 # %bb.12: 1848 mov eax, r14d 1849 add r15, 96 1850 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_127] # ymm0 = [30064771079,30064771079,30064771079,30064771079] 1851 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_126] # ymm1 = [0,3,6,9,12,15,18,21] 1852 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_128] # ymm2 = [24,27,0,1,4,7,10,13] 1853 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_129] # ymm3 = [16,19,22,25,28,0,2,5] 1854 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_130] # ymm4 = [8,11,14,17,20,23,26,29] 1855 .p2align 4, 0x90 1856 .LBB0_13: # =>This Inner Loop Header: Depth=1 1857 vpbroadcastd ymm5, dword ptr [rbx] 1858 vpsrlvd ymm5, ymm5, ymm1 1859 vpand ymm5, ymm5, ymm0 1860 vmovdqu ymmword ptr [r15 - 96], ymm5 1861 mov ecx, dword ptr [rbx] 1862 mov edx, dword ptr [rbx + 4] 1863 mov esi, edx 1864 shld esi, ecx, 2 1865 vmovd xmm5, ecx 1866 vpinsrd xmm5, xmm5, ecx, 1 1867 vpinsrd xmm5, xmm5, esi, 2 1868 vpinsrd xmm5, xmm5, edx, 3 1869 vmovd xmm6, edx 1870 vpbroadcastd xmm6, xmm6 1871 vinserti128 ymm5, ymm5, xmm6, 1 1872 vpsrlvd ymm5, ymm5, ymm2 1873 vpand ymm5, ymm5, ymm0 1874 vmovdqu ymmword ptr [r15 - 64], ymm5 1875 mov ecx, dword ptr [rbx + 4] 1876 mov edx, dword ptr [rbx + 8] 1877 mov esi, edx 1878 shld esi, ecx, 1 1879 vmovd xmm5, ecx 1880 vpbroadcastd xmm6, xmm5 1881 vpinsrd xmm5, xmm5, esi, 1 1882 vpinsrd xmm5, xmm5, edx, 2 1883 vpinsrd xmm5, xmm5, edx, 3 1884 vinserti128 ymm5, ymm6, xmm5, 1 1885 vpsrlvd ymm5, ymm5, ymm3 1886 vpand ymm5, ymm5, ymm0 1887 vmovdqu ymmword ptr [r15 - 32], ymm5 1888 vpbroadcastd ymm5, dword ptr [rbx + 8] 1889 vpsrlvd ymm5, ymm5, ymm4 1890 vpand ymm5, ymm5, ymm0 1891 vmovdqu ymmword ptr [r15], ymm5 1892 sub r15, -128 1893 add rbx, 12 1894 add rax, -1 1895 jne .LBB0_13 1896 jmp .LBB0_147 1897 .LBB0_56: 1898 cmp ecx, 18 1899 je .LBB0_117 1900 # %bb.57: 1901 cmp ecx, 19 1902 jne .LBB0_147 1903 # %bb.58: 1904 cmp edx, 32 1905 jl .LBB0_147 1906 # %bb.59: 1907 mov r8d, r14d 1908 add r15, 96 1909 add rbx, 72 1910 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_68] # ymm0 = [2251795519242239,2251795519242239,2251795519242239,2251795519242239] 1911 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_67] # ymm1 = [0,0,6,0,12,0,0,5] 1912 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_69] # ymm2 = [0,11,0,0,4,0,10,0] 1913 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_70] # ymm3 = [0,3,0,9,0,0,2,0] 1914 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_71] # ymm4 = [8,0,0,1,0,7,0,13] 1915 .p2align 4, 0x90 1916 .LBB0_60: # =>This Inner Loop Header: Depth=1 1917 mov r9d, dword ptr [rbx - 56] 1918 mov edx, dword ptr [rbx - 60] 1919 mov esi, r9d 1920 shld esi, edx, 14 1921 mov edi, dword ptr [rbx - 64] 1922 mov r10d, dword ptr [rbx - 72] 1923 shld edx, edi, 1 1924 mov eax, dword ptr [rbx - 68] 1925 mov ecx, eax 1926 shld ecx, r10d, 13 1927 vmovd xmm5, edi 1928 shld edi, eax, 7 1929 vpinsrd xmm5, xmm5, edx, 1 1930 vmovd xmm6, r10d 1931 vpinsrd xmm5, xmm5, esi, 2 1932 vpinsrd xmm6, xmm6, ecx, 1 1933 vpinsrd xmm5, xmm5, r9d, 3 1934 vpinsrd xmm6, xmm6, eax, 2 1935 vpinsrd xmm6, xmm6, edi, 3 1936 vinserti128 ymm5, ymm6, xmm5, 1 1937 vpsrlvd ymm5, ymm5, ymm1 1938 vpand ymm5, ymm5, ymm0 1939 vmovdqu ymmword ptr [r15 - 96], ymm5 1940 mov r10d, dword ptr [rbx - 40] 1941 mov r9d, dword ptr [rbx - 36] 1942 shld r9d, r10d, 3 1943 mov edx, dword ptr [rbx - 44] 1944 mov esi, r10d 1945 shld esi, edx, 9 1946 mov edi, dword ptr [rbx - 48] 1947 vmovd xmm5, edx 1948 shld edx, edi, 15 1949 mov ecx, dword ptr [rbx - 56] 1950 mov eax, dword ptr [rbx - 52] 1951 shld edi, eax, 2 1952 shrd ecx, eax, 24 1953 vpinsrd xmm5, xmm5, esi, 1 1954 vmovd xmm6, ecx 1955 vpinsrd xmm5, xmm5, r10d, 2 1956 vpinsrd xmm6, xmm6, eax, 1 1957 vpinsrd xmm5, xmm5, r9d, 3 1958 vpinsrd xmm6, xmm6, edi, 2 1959 vpinsrd xmm6, xmm6, edx, 3 1960 vinserti128 ymm5, ymm6, xmm5, 1 1961 vpsrlvd ymm5, ymm5, ymm2 1962 vpand ymm5, ymm5, ymm0 1963 vmovdqu ymmword ptr [r15 - 64], ymm5 1964 mov r10d, dword ptr [rbx - 20] 1965 mov r9d, dword ptr [rbx - 16] 1966 shld r9d, r10d, 11 1967 mov edx, dword ptr [rbx - 24] 1968 mov esi, r10d 1969 mov r11d, dword ptr [rbx - 28] 1970 shld esi, edx, 17 1971 mov ecx, dword ptr [rbx - 36] 1972 mov eax, dword ptr [rbx - 32] 1973 shld edx, r11d, 4 1974 mov edi, r11d 1975 shld edi, eax, 10 1976 shrd ecx, eax, 16 1977 vmovd xmm5, edx 1978 vpinsrd xmm5, xmm5, esi, 1 1979 vmovd xmm6, ecx 1980 vpinsrd xmm5, xmm5, r10d, 2 1981 vpinsrd xmm6, xmm6, eax, 1 1982 vpinsrd xmm5, xmm5, r9d, 3 1983 vpinsrd xmm6, xmm6, edi, 2 1984 vpinsrd xmm6, xmm6, r11d, 3 1985 vinserti128 ymm5, ymm6, xmm5, 1 1986 vpsrlvd ymm5, ymm5, ymm3 1987 vpand ymm5, ymm5, ymm0 1988 vmovdqu ymmword ptr [r15 - 32], ymm5 1989 mov r9d, dword ptr [rbx] 1990 mov r11d, dword ptr [rbx - 4] 1991 mov edx, r9d 1992 shld edx, r11d, 6 1993 mov ecx, dword ptr [rbx - 8] 1994 mov edi, r11d 1995 shld edi, ecx, 12 1996 mov r10d, dword ptr [rbx - 16] 1997 mov eax, dword ptr [rbx - 12] 1998 mov esi, ecx 1999 shld esi, eax, 18 2000 shld eax, r10d, 5 2001 vmovd xmm5, r10d 2002 vmovd xmm6, edi 2003 vpinsrd xmm5, xmm5, eax, 1 2004 vpinsrd xmm6, xmm6, r11d, 1 2005 vpinsrd xmm5, xmm5, esi, 2 2006 vpinsrd xmm6, xmm6, edx, 2 2007 vpinsrd xmm5, xmm5, ecx, 3 2008 vpinsrd xmm6, xmm6, r9d, 3 2009 vinserti128 ymm5, ymm5, xmm6, 1 2010 vpsrlvd ymm5, ymm5, ymm4 2011 vpand ymm5, ymm5, ymm0 2012 vmovdqu ymmword ptr [r15], ymm5 2013 sub r15, -128 2014 add rbx, 76 2015 add r8, -1 2016 jne .LBB0_60 2017 jmp .LBB0_147 2018 .LBB0_32: 2019 cmp ecx, 10 2020 je .LBB0_129 2021 # %bb.33: 2022 cmp ecx, 11 2023 jne .LBB0_147 2024 # %bb.34: 2025 cmp edx, 32 2026 jl .LBB0_147 2027 # %bb.35: 2028 mov r8d, r14d 2029 add r15, 96 2030 add rbx, 40 2031 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_97] # ymm0 = [8791798056959,8791798056959,8791798056959,8791798056959] 2032 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_96] # ymm1 = [0,11,0,1,12,0,2,13] 2033 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_98] # ymm2 = [0,3,14,0,4,15,0,5] 2034 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_99] # ymm3 = [16,0,6,17,0,7,18,0] 2035 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_100] # ymm4 = [8,19,0,9,20,0,10,21] 2036 .p2align 4, 0x90 2037 .LBB0_36: # =>This Inner Loop Header: Depth=1 2038 mov ecx, dword ptr [rbx - 32] 2039 mov edx, dword ptr [rbx - 40] 2040 mov esi, dword ptr [rbx - 36] 2041 mov edi, ecx 2042 shld edi, esi, 9 2043 mov eax, esi 2044 shld eax, edx, 10 2045 vmovd xmm5, esi 2046 vpinsrd xmm5, xmm5, edi, 1 2047 vpinsrd xmm5, xmm5, ecx, 2 2048 vpinsrd xmm5, xmm5, ecx, 3 2049 vmovd xmm6, edx 2050 vpinsrd xmm6, xmm6, edx, 1 2051 vpinsrd xmm6, xmm6, eax, 2 2052 vpinsrd xmm6, xmm6, esi, 3 2053 vinserti128 ymm5, ymm6, xmm5, 1 2054 vpsrlvd ymm5, ymm5, ymm1 2055 vpand ymm5, ymm5, ymm0 2056 vmovdqu ymmword ptr [r15 - 96], ymm5 2057 mov eax, dword ptr [rbx - 20] 2058 mov ecx, dword ptr [rbx - 24] 2059 mov edx, eax 2060 shld edx, ecx, 6 2061 mov esi, dword ptr [rbx - 32] 2062 mov edi, dword ptr [rbx - 28] 2063 vmovd xmm5, ecx 2064 vpinsrd xmm5, xmm5, ecx, 1 2065 shld ecx, edi, 7 2066 shrd esi, edi, 24 2067 vpinsrd xmm5, xmm5, edx, 2 2068 vpinsrd xmm5, xmm5, eax, 3 2069 vmovd xmm6, esi 2070 vpinsrd xmm6, xmm6, edi, 1 2071 vpinsrd xmm6, xmm6, edi, 2 2072 vpinsrd xmm6, xmm6, ecx, 3 2073 vinserti128 ymm5, ymm6, xmm5, 1 2074 vpsrlvd ymm5, ymm5, ymm2 2075 vpand ymm5, ymm5, ymm0 2076 vmovdqu ymmword ptr [r15 - 64], ymm5 2077 mov eax, dword ptr [rbx - 12] 2078 mov ecx, dword ptr [rbx - 8] 2079 shld ecx, eax, 3 2080 mov r9d, dword ptr [rbx - 20] 2081 mov esi, dword ptr [rbx - 16] 2082 mov edi, eax 2083 shld edi, esi, 4 2084 mov edx, esi 2085 shld edx, r9d, 5 2086 vmovd xmm5, edi 2087 vpinsrd xmm5, xmm5, eax, 1 2088 vpinsrd xmm5, xmm5, eax, 2 2089 vpinsrd xmm5, xmm5, ecx, 3 2090 vmovd xmm6, r9d 2091 vpinsrd xmm6, xmm6, edx, 1 2092 vpinsrd xmm6, xmm6, esi, 2 2093 vpinsrd xmm6, xmm6, esi, 3 2094 vinserti128 ymm5, ymm6, xmm5, 1 2095 vpsrlvd ymm5, ymm5, ymm3 2096 vpand ymm5, ymm5, ymm0 2097 vmovdqu ymmword ptr [r15 - 32], ymm5 2098 mov eax, dword ptr [rbx] 2099 mov ecx, dword ptr [rbx - 8] 2100 mov edx, dword ptr [rbx - 4] 2101 mov esi, eax 2102 shld esi, edx, 1 2103 mov edi, edx 2104 shld edi, ecx, 2 2105 vmovd xmm5, edx 2106 vpinsrd xmm5, xmm5, esi, 1 2107 vpinsrd xmm5, xmm5, eax, 2 2108 vpinsrd xmm5, xmm5, eax, 3 2109 vmovd xmm6, ecx 2110 vpinsrd xmm6, xmm6, ecx, 1 2111 vpinsrd xmm6, xmm6, edi, 2 2112 vpinsrd xmm6, xmm6, edx, 3 2113 vinserti128 ymm5, ymm6, xmm5, 1 2114 vpsrlvd ymm5, ymm5, ymm4 2115 vpand ymm5, ymm5, ymm0 2116 vmovdqu ymmword ptr [r15], ymm5 2117 sub r15, -128 2118 add rbx, 44 2119 add r8, -1 2120 jne .LBB0_36 2121 jmp .LBB0_147 2122 .LBB0_79: 2123 cmp ecx, 26 2124 je .LBB0_105 2125 # %bb.80: 2126 cmp ecx, 27 2127 jne .LBB0_147 2128 # %bb.81: 2129 cmp edx, 32 2130 jl .LBB0_147 2131 # %bb.82: 2132 mov r8d, r14d 2133 add r15, 96 2134 add rbx, 104 2135 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_25] # ymm0 = [576460748142673919,576460748142673919,576460748142673919,576460748142673919] 2136 vmovdqa ymm9, ymmword ptr [rip + .LCPI0_24] # ymm9 = [0,0,0,0,0,0,2,0] 2137 vmovdqa xmm10, xmmword ptr [rip + .LCPI0_26] # xmm10 = [24,19,14,9] 2138 vmovdqa xmm11, xmmword ptr [rip + .LCPI0_27] # xmm11 = [8,13,18,23] 2139 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_28] # ymm4 = [0,0,0,0,4,0,0,0] 2140 vmovdqa xmm5, xmmword ptr [rip + .LCPI0_29] # xmm5 = <16,11,u,u> 2141 vmovdqa xmm6, xmmword ptr [rip + .LCPI0_30] # xmm6 = <16,21,u,u> 2142 vmovdqa ymm7, ymmword ptr [rip + .LCPI0_31] # ymm7 = [0,0,0,1,0,0,0,0] 2143 vmovdqa ymm8, ymmword ptr [rip + .LCPI0_32] # ymm8 = [0,3,0,0,0,0,0,5] 2144 .p2align 4, 0x90 2145 .LBB0_83: # =>This Inner Loop Header: Depth=1 2146 mov r10d, dword ptr [rbx - 84] 2147 mov r9d, dword ptr [rbx - 80] 2148 shld r9d, r10d, 3 2149 mov esi, dword ptr [rbx - 88] 2150 mov edi, r10d 2151 shld edi, esi, 25 2152 mov eax, dword ptr [rbx - 92] 2153 shld esi, eax, 20 2154 mov edx, dword ptr [rbx - 96] 2155 shld eax, edx, 15 2156 mov r11d, dword ptr [rbx - 104] 2157 mov ecx, dword ptr [rbx - 100] 2158 shld edx, ecx, 10 2159 shld ecx, r11d, 5 2160 vmovd xmm1, r11d 2161 vmovd xmm2, esi 2162 vpinsrd xmm1, xmm1, ecx, 1 2163 vpinsrd xmm2, xmm2, edi, 1 2164 vpinsrd xmm1, xmm1, edx, 2 2165 vpinsrd xmm2, xmm2, r10d, 2 2166 vpinsrd xmm1, xmm1, eax, 3 2167 vpinsrd xmm2, xmm2, r9d, 3 2168 vinserti128 ymm1, ymm1, xmm2, 1 2169 vpsrlvd ymm1, ymm1, ymm9 2170 vpand ymm1, ymm1, ymm0 2171 vmovdqu ymmword ptr [r15 - 96], ymm1 2172 mov eax, dword ptr [rbx - 56] 2173 mov ecx, dword ptr [rbx - 52] 2174 shld ecx, eax, 11 2175 mov edx, dword ptr [rbx - 60] 2176 mov esi, dword ptr [rbx - 64] 2177 shld eax, edx, 6 2178 shld edx, esi, 1 2179 vmovdqu xmm1, xmmword ptr [rbx - 80] 2180 vpsrlvd xmm2, xmm1, xmm10 2181 vpshufd xmm1, xmm1, 249 # xmm1 = xmm1[1,2,3,3] 2182 vmovd xmm3, esi 2183 vpinsrd xmm1, xmm1, esi, 3 2184 vpinsrd xmm3, xmm3, edx, 1 2185 vpinsrd xmm3, xmm3, eax, 2 2186 vpsllvd xmm1, xmm1, xmm11 2187 vpinsrd xmm3, xmm3, ecx, 3 2188 vpor xmm1, xmm2, xmm1 2189 vinserti128 ymm1, ymm1, xmm3, 1 2190 vpsrlvd ymm1, ymm1, ymm4 2191 vpand ymm1, ymm1, ymm0 2192 vmovdqu ymmword ptr [r15 - 64], ymm1 2193 mov eax, dword ptr [rbx - 28] 2194 mov r9d, dword ptr [rbx - 24] 2195 shld r9d, eax, 19 2196 mov edx, dword ptr [rbx - 32] 2197 shld eax, edx, 14 2198 mov esi, dword ptr [rbx - 36] 2199 shld edx, esi, 9 2200 mov r10d, dword ptr [rbx - 44] 2201 mov edi, dword ptr [rbx - 40] 2202 shld esi, edi, 4 2203 mov ecx, edi 2204 shld ecx, r10d, 26 2205 vmovq xmm1, qword ptr [rbx - 52] # xmm1 = mem[0],zero 2206 vpsrlvd xmm2, xmm1, xmm5 2207 vpshufd xmm1, xmm1, 229 # xmm1 = xmm1[1,1,2,3] 2208 vpinsrd xmm1, xmm1, r10d, 1 2209 vpsllvd xmm1, xmm1, xmm6 2210 vmovd xmm3, esi 2211 vpinsrd xmm3, xmm3, edx, 1 2212 vpor xmm1, xmm2, xmm1 2213 vpinsrd xmm2, xmm3, eax, 2 2214 vpinsrd xmm2, xmm2, r9d, 3 2215 vpinsrd xmm1, xmm1, ecx, 2 2216 vpinsrd xmm1, xmm1, edi, 3 2217 vinserti128 ymm1, ymm1, xmm2, 1 2218 vpsrlvd ymm1, ymm1, ymm7 2219 vpand ymm1, ymm1, ymm0 2220 vmovdqu ymmword ptr [r15 - 32], ymm1 2221 mov r9d, dword ptr [rbx] 2222 mov r11d, dword ptr [rbx - 4] 2223 mov r10d, r9d 2224 shld r10d, r11d, 22 2225 mov esi, dword ptr [rbx - 8] 2226 shld r11d, esi, 17 2227 mov edi, dword ptr [rbx - 12] 2228 mov eax, dword ptr [rbx - 16] 2229 shld esi, edi, 12 2230 mov edx, dword ptr [rbx - 24] 2231 mov ecx, dword ptr [rbx - 20] 2232 shld edi, eax, 7 2233 shrd edx, ecx, 8 2234 shld eax, ecx, 2 2235 vmovd xmm1, esi 2236 vpinsrd xmm1, xmm1, r11d, 1 2237 vmovd xmm2, edx 2238 vpinsrd xmm1, xmm1, r10d, 2 2239 vpinsrd xmm2, xmm2, ecx, 1 2240 vpinsrd xmm1, xmm1, r9d, 3 2241 vpinsrd xmm2, xmm2, eax, 2 2242 vpinsrd xmm2, xmm2, edi, 3 2243 vinserti128 ymm1, ymm2, xmm1, 1 2244 vpsrlvd ymm1, ymm1, ymm8 2245 vpand ymm1, ymm1, ymm0 2246 vmovdqu ymmword ptr [r15], ymm1 2247 sub r15, -128 2248 add rbx, 108 2249 add r8, -1 2250 jne .LBB0_83 2251 jmp .LBB0_147 2252 .LBB0_20: 2253 cmp ecx, 6 2254 je .LBB0_135 2255 # %bb.21: 2256 cmp ecx, 7 2257 jne .LBB0_147 2258 # %bb.22: 2259 cmp edx, 32 2260 jl .LBB0_147 2261 # %bb.23: 2262 mov r8d, r14d 2263 add r15, 96 2264 add rbx, 24 2265 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_112] # ymm0 = [545460846719,545460846719,545460846719,545460846719] 2266 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_111] # ymm1 = [0,7,14,21,0,3,10,17] 2267 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_113] # ymm2 = [24,0,6,13,20,0,2,9] 2268 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_114] # ymm3 = [16,23,0,5,12,19,0,1] 2269 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_115] # ymm4 = [8,15,22,0,4,11,18,25] 2270 .p2align 4, 0x90 2271 .LBB0_24: # =>This Inner Loop Header: Depth=1 2272 mov ecx, dword ptr [rbx - 24] 2273 mov edx, dword ptr [rbx - 20] 2274 mov esi, edx 2275 shld esi, ecx, 4 2276 vmovd xmm5, ecx 2277 vmovd xmm6, esi 2278 vpinsrd xmm6, xmm6, edx, 1 2279 vpinsrd xmm6, xmm6, edx, 2 2280 vpinsrd xmm6, xmm6, edx, 3 2281 vpbroadcastd xmm5, xmm5 2282 vinserti128 ymm5, ymm5, xmm6, 1 2283 vpsrlvd ymm5, ymm5, ymm1 2284 vpand ymm5, ymm5, ymm0 2285 vmovdqu ymmword ptr [r15 - 96], ymm5 2286 mov ecx, dword ptr [rbx - 12] 2287 mov edx, dword ptr [rbx - 20] 2288 mov esi, dword ptr [rbx - 16] 2289 mov edi, ecx 2290 shld edi, esi, 5 2291 mov eax, esi 2292 shld eax, edx, 1 2293 vmovd xmm5, esi 2294 vpinsrd xmm5, xmm5, edi, 1 2295 vpinsrd xmm5, xmm5, ecx, 2 2296 vpinsrd xmm5, xmm5, ecx, 3 2297 vmovd xmm6, edx 2298 vpinsrd xmm6, xmm6, eax, 1 2299 vpinsrd xmm6, xmm6, esi, 2 2300 vpinsrd xmm6, xmm6, esi, 3 2301 vinserti128 ymm5, ymm6, xmm5, 1 2302 vpsrlvd ymm5, ymm5, ymm2 2303 vpand ymm5, ymm5, ymm0 2304 vmovdqu ymmword ptr [r15 - 64], ymm5 2305 mov eax, dword ptr [rbx - 4] 2306 mov ecx, dword ptr [rbx - 12] 2307 mov edx, dword ptr [rbx - 8] 2308 mov esi, eax 2309 shld esi, edx, 6 2310 mov edi, edx 2311 shld edi, ecx, 2 2312 vmovd xmm5, edx 2313 vpinsrd xmm5, xmm5, edx, 1 2314 vpinsrd xmm5, xmm5, esi, 2 2315 vpinsrd xmm5, xmm5, eax, 3 2316 vmovd xmm6, ecx 2317 vpinsrd xmm6, xmm6, ecx, 1 2318 vpinsrd xmm6, xmm6, edi, 2 2319 vpinsrd xmm6, xmm6, edx, 3 2320 vinserti128 ymm5, ymm6, xmm5, 1 2321 vpsrlvd ymm5, ymm5, ymm3 2322 vpand ymm5, ymm5, ymm0 2323 vmovdqu ymmword ptr [r15 - 32], ymm5 2324 mov eax, dword ptr [rbx - 4] 2325 mov ecx, dword ptr [rbx] 2326 mov edx, ecx 2327 shld edx, eax, 3 2328 vmovd xmm5, ecx 2329 vmovd xmm6, eax 2330 vpinsrd xmm6, xmm6, eax, 1 2331 vpinsrd xmm6, xmm6, eax, 2 2332 vpinsrd xmm6, xmm6, edx, 3 2333 vpbroadcastd xmm5, xmm5 2334 vinserti128 ymm5, ymm6, xmm5, 1 2335 vpsrlvd ymm5, ymm5, ymm4 2336 vpand ymm5, ymm5, ymm0 2337 vmovdqu ymmword ptr [r15], ymm5 2338 sub r15, -128 2339 add rbx, 28 2340 add r8, -1 2341 jne .LBB0_24 2342 jmp .LBB0_147 2343 .LBB0_67: 2344 cmp ecx, 22 2345 je .LBB0_111 2346 # %bb.68: 2347 cmp ecx, 23 2348 jne .LBB0_147 2349 # %bb.69: 2350 cmp edx, 32 2351 jl .LBB0_147 2352 # %bb.70: 2353 mov r8d, r14d 2354 add r15, 96 2355 add rbx, 88 2356 vmovdqa ymm8, ymmword ptr [rip + .LCPI0_48] # ymm8 = [0,0,0,5,0,0,0,1] 2357 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_49] # ymm1 = [36028792732385279,36028792732385279,36028792732385279,36028792732385279] 2358 vmovdqa xmm2, xmmword ptr [rip + .LCPI0_50] # xmm2 = <24,15,u,u> 2359 vmovdqa xmm3, xmmword ptr [rip + .LCPI0_51] # xmm3 = <8,17,u,u> 2360 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_52] # ymm4 = [0,0,6,0,0,0,2,0] 2361 vmovdqa ymm5, ymmword ptr [rip + .LCPI0_53] # ymm5 = [0,7,0,0,0,3,0,0] 2362 vmovdqa ymm6, ymmword ptr [rip + .LCPI0_54] # ymm6 = [8,0,0,0,4,0,0,9] 2363 .p2align 4, 0x90 2364 .LBB0_71: # =>This Inner Loop Header: Depth=1 2365 mov r9d, dword ptr [rbx - 68] 2366 mov edx, dword ptr [rbx - 72] 2367 mov r11d, r9d 2368 shld r11d, edx, 22 2369 mov edi, dword ptr [rbx - 76] 2370 shld edx, edi, 13 2371 mov esi, dword ptr [rbx - 80] 2372 shld edi, esi, 4 2373 mov r10d, dword ptr [rbx - 88] 2374 mov ecx, dword ptr [rbx - 84] 2375 mov eax, esi 2376 shld eax, ecx, 18 2377 shld ecx, r10d, 9 2378 vmovd xmm7, r10d 2379 vmovd xmm0, edi 2380 vpinsrd xmm7, xmm7, ecx, 1 2381 vpinsrd xmm0, xmm0, edx, 1 2382 vpinsrd xmm7, xmm7, eax, 2 2383 vpinsrd xmm0, xmm0, r11d, 2 2384 vpinsrd xmm7, xmm7, esi, 3 2385 vpinsrd xmm0, xmm0, r9d, 3 2386 vinserti128 ymm0, ymm7, xmm0, 1 2387 vpsrlvd ymm0, ymm0, ymm8 2388 vpand ymm0, ymm0, ymm1 2389 vmovdqu ymmword ptr [r15 - 96], ymm0 2390 mov eax, dword ptr [rbx - 48] 2391 mov r9d, dword ptr [rbx - 44] 2392 shld r9d, eax, 7 2393 mov edx, dword ptr [rbx - 52] 2394 mov esi, eax 2395 shld esi, edx, 21 2396 mov edi, dword ptr [rbx - 60] 2397 mov ecx, dword ptr [rbx - 56] 2398 shld edx, ecx, 12 2399 shld ecx, edi, 3 2400 vmovq xmm0, qword ptr [rbx - 68] # xmm0 = mem[0],zero 2401 vpsrlvd xmm7, xmm0, xmm2 2402 vpshufd xmm0, xmm0, 229 # xmm0 = xmm0[1,1,2,3] 2403 vpinsrd xmm0, xmm0, edi, 1 2404 vpsllvd xmm0, xmm0, xmm3 2405 vpor xmm0, xmm7, xmm0 2406 vmovd xmm7, edx 2407 vpinsrd xmm7, xmm7, esi, 1 2408 vpinsrd xmm7, xmm7, eax, 2 2409 vpinsrd xmm7, xmm7, r9d, 3 2410 vpinsrd xmm0, xmm0, edi, 2 2411 vpinsrd xmm0, xmm0, ecx, 3 2412 vinserti128 ymm0, ymm0, xmm7, 1 2413 vpsrlvd ymm0, ymm0, ymm4 2414 vpand ymm0, ymm0, ymm1 2415 vmovdqu ymmword ptr [r15 - 64], ymm0 2416 mov r11d, dword ptr [rbx - 24] 2417 mov r9d, dword ptr [rbx - 20] 2418 shld r9d, r11d, 15 2419 mov r10d, dword ptr [rbx - 28] 2420 shld r11d, r10d, 6 2421 mov esi, dword ptr [rbx - 32] 2422 mov edi, r10d 2423 mov ecx, dword ptr [rbx - 36] 2424 shld edi, esi, 20 2425 mov edx, dword ptr [rbx - 44] 2426 mov eax, dword ptr [rbx - 40] 2427 shld esi, ecx, 11 2428 shrd edx, eax, 16 2429 shld ecx, eax, 2 2430 vmovd xmm0, edi 2431 vpinsrd xmm0, xmm0, r10d, 1 2432 vmovd xmm7, edx 2433 vpinsrd xmm0, xmm0, r11d, 2 2434 vpinsrd xmm7, xmm7, eax, 1 2435 vpinsrd xmm0, xmm0, r9d, 3 2436 vpinsrd xmm7, xmm7, ecx, 2 2437 vpinsrd xmm7, xmm7, esi, 3 2438 vinserti128 ymm0, ymm7, xmm0, 1 2439 vpsrlvd ymm0, ymm0, ymm5 2440 vpand ymm0, ymm0, ymm1 2441 vmovdqu ymmword ptr [r15 - 32], ymm0 2442 mov r9d, dword ptr [rbx] 2443 mov ecx, dword ptr [rbx - 4] 2444 mov edx, r9d 2445 shld edx, ecx, 14 2446 mov esi, dword ptr [rbx - 8] 2447 shld ecx, esi, 5 2448 mov edi, dword ptr [rbx - 12] 2449 vmovd xmm0, esi 2450 shld esi, edi, 19 2451 mov r10d, dword ptr [rbx - 20] 2452 mov eax, dword ptr [rbx - 16] 2453 shld edi, eax, 10 2454 shld eax, r10d, 1 2455 vpinsrd xmm0, xmm0, ecx, 1 2456 vmovd xmm7, r10d 2457 vpinsrd xmm0, xmm0, edx, 2 2458 vpinsrd xmm7, xmm7, eax, 1 2459 vpinsrd xmm0, xmm0, r9d, 3 2460 vpinsrd xmm7, xmm7, edi, 2 2461 vpinsrd xmm7, xmm7, esi, 3 2462 vinserti128 ymm0, ymm7, xmm0, 1 2463 vpsrlvd ymm0, ymm0, ymm6 2464 vpand ymm0, ymm0, ymm1 2465 vmovdqu ymmword ptr [r15], ymm0 2466 sub r15, -128 2467 add rbx, 92 2468 add r8, -1 2469 jne .LBB0_71 2470 jmp .LBB0_147 2471 .LBB0_43: 2472 cmp ecx, 14 2473 je .LBB0_123 2474 # %bb.44: 2475 cmp ecx, 15 2476 jne .LBB0_147 2477 # %bb.45: 2478 cmp edx, 32 2479 jl .LBB0_147 2480 # %bb.46: 2481 mov r8d, r14d 2482 add r15, 96 2483 add rbx, 56 2484 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_82] # ymm0 = [140733193420799,140733193420799,140733193420799,140733193420799] 2485 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_81] # ymm1 = [0,15,0,13,0,11,0,9] 2486 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_83] # ymm2 = [0,7,0,5,0,3,0,1] 2487 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_84] # ymm3 = [16,0,14,0,12,0,10,0] 2488 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_85] # ymm4 = [8,0,6,0,4,0,2,17] 2489 .p2align 4, 0x90 2490 .LBB0_47: # =>This Inner Loop Header: Depth=1 2491 mov r9d, dword ptr [rbx - 44] 2492 mov eax, dword ptr [rbx - 48] 2493 mov esi, r9d 2494 shld esi, eax, 6 2495 mov r10d, dword ptr [rbx - 52] 2496 mov edx, eax 2497 shld edx, r10d, 4 2498 mov ecx, dword ptr [rbx - 56] 2499 mov edi, r10d 2500 shld edi, ecx, 2 2501 vmovd xmm5, edx 2502 vpinsrd xmm5, xmm5, eax, 1 2503 vpinsrd xmm5, xmm5, esi, 2 2504 vpinsrd xmm5, xmm5, r9d, 3 2505 vmovd xmm6, ecx 2506 vpinsrd xmm6, xmm6, ecx, 1 2507 vpinsrd xmm6, xmm6, edi, 2 2508 vpinsrd xmm6, xmm6, r10d, 3 2509 vinserti128 ymm5, ymm6, xmm5, 1 2510 vpsrlvd ymm5, ymm5, ymm1 2511 vpand ymm5, ymm5, ymm0 2512 vmovdqu ymmword ptr [r15 - 96], ymm5 2513 mov r9d, dword ptr [rbx - 28] 2514 mov r11d, dword ptr [rbx - 32] 2515 mov edx, r9d 2516 shld edx, r11d, 14 2517 mov r10d, dword ptr [rbx - 36] 2518 mov edi, r11d 2519 shld edi, r10d, 12 2520 mov eax, dword ptr [rbx - 44] 2521 mov esi, dword ptr [rbx - 40] 2522 mov ecx, r10d 2523 shld ecx, esi, 10 2524 shrd eax, esi, 24 2525 vmovd xmm5, edi 2526 vpinsrd xmm5, xmm5, r11d, 1 2527 vpinsrd xmm5, xmm5, edx, 2 2528 vpinsrd xmm5, xmm5, r9d, 3 2529 vmovd xmm6, eax 2530 vpinsrd xmm6, xmm6, esi, 1 2531 vpinsrd xmm6, xmm6, ecx, 2 2532 vpinsrd xmm6, xmm6, r10d, 3 2533 vinserti128 ymm5, ymm6, xmm5, 1 2534 vpsrlvd ymm5, ymm5, ymm2 2535 vpand ymm5, ymm5, ymm0 2536 vmovdqu ymmword ptr [r15 - 64], ymm5 2537 mov eax, dword ptr [rbx - 16] 2538 mov r10d, dword ptr [rbx - 12] 2539 shld r10d, eax, 7 2540 mov edx, dword ptr [rbx - 20] 2541 mov esi, eax 2542 shld esi, edx, 5 2543 mov r9d, dword ptr [rbx - 28] 2544 mov ecx, dword ptr [rbx - 24] 2545 mov edi, ecx 2546 shld edi, r9d, 1 2547 vmovd xmm5, edx 2548 shld edx, ecx, 3 2549 vpinsrd xmm5, xmm5, esi, 1 2550 vpinsrd xmm5, xmm5, eax, 2 2551 vpinsrd xmm5, xmm5, r10d, 3 2552 vmovd xmm6, r9d 2553 vpinsrd xmm6, xmm6, edi, 1 2554 vpinsrd xmm6, xmm6, ecx, 2 2555 vpinsrd xmm6, xmm6, edx, 3 2556 vinserti128 ymm5, ymm6, xmm5, 1 2557 vpsrlvd ymm5, ymm5, ymm3 2558 vpand ymm5, ymm5, ymm0 2559 vmovdqu ymmword ptr [r15 - 32], ymm5 2560 mov r9d, dword ptr [rbx] 2561 mov ecx, dword ptr [rbx - 4] 2562 mov edx, r9d 2563 shld edx, ecx, 13 2564 mov eax, dword ptr [rbx - 8] 2565 vmovd xmm5, ecx 2566 shld ecx, eax, 11 2567 mov edi, dword ptr [rbx - 12] 2568 mov esi, eax 2569 shld esi, edi, 9 2570 vmovd xmm6, edi 2571 vpinsrd xmm6, xmm6, esi, 1 2572 vpinsrd xmm6, xmm6, eax, 2 2573 vpinsrd xmm6, xmm6, ecx, 3 2574 vpinsrd xmm5, xmm5, edx, 1 2575 vpinsrd xmm5, xmm5, r9d, 2 2576 vpinsrd xmm5, xmm5, r9d, 3 2577 vinserti128 ymm5, ymm6, xmm5, 1 2578 vpsrlvd ymm5, ymm5, ymm4 2579 vpand ymm5, ymm5, ymm0 2580 vmovdqu ymmword ptr [r15], ymm5 2581 sub r15, -128 2582 add rbx, 60 2583 add r8, -1 2584 jne .LBB0_47 2585 jmp .LBB0_147 2586 .LBB0_96: 2587 cmp edx, 32 2588 jl .LBB0_147 2589 # %bb.97: 2590 mov r8d, r14d 2591 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_0] # ymm0 = [9223372034707292159,9223372034707292159,9223372034707292159,9223372034707292159] 2592 add r15, 96 2593 vmovdqa ymm8, ymmword ptr [rip + .LCPI0_1] # ymm8 = [24,23,22,21,20,19,18,17] 2594 vmovdqa ymm9, ymmword ptr [rip + .LCPI0_2] # ymm9 = [8,9,10,11,12,13,14,15] 2595 vmovdqa ymm10, ymmword ptr [rip + .LCPI0_3] # ymm10 = [16,15,14,13,12,11,10,9] 2596 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_4] # ymm4 = [16,17,18,19,20,21,22,23] 2597 vmovdqa xmm5, xmmword ptr [rip + .LCPI0_5] # xmm5 = [8,7,6,5] 2598 vmovdqa xmm6, xmmword ptr [rip + .LCPI0_6] # xmm6 = [24,25,26,27] 2599 vmovdqa ymm7, ymmword ptr [rip + .LCPI0_7] # ymm7 = [0,0,0,0,0,0,0,1] 2600 .p2align 4, 0x90 2601 .LBB0_98: # =>This Inner Loop Header: Depth=1 2602 mov r10d, dword ptr [rbx + 24] 2603 mov r9d, dword ptr [rbx + 28] 2604 shld r9d, r10d, 7 2605 mov esi, dword ptr [rbx + 20] 2606 shld r10d, esi, 6 2607 mov edi, dword ptr [rbx + 16] 2608 shld esi, edi, 5 2609 mov eax, dword ptr [rbx + 12] 2610 shld edi, eax, 4 2611 mov edx, dword ptr [rbx + 8] 2612 shld eax, edx, 3 2613 mov ecx, dword ptr [rbx + 4] 2614 shld edx, ecx, 2 2615 mov r11d, dword ptr [rbx] 2616 shld ecx, r11d, 1 2617 vmovd xmm1, edi 2618 vpinsrd xmm1, xmm1, esi, 1 2619 vpinsrd xmm1, xmm1, r10d, 2 2620 vpinsrd xmm1, xmm1, r9d, 3 2621 vmovd xmm2, r11d 2622 vpinsrd xmm2, xmm2, ecx, 1 2623 vpinsrd xmm2, xmm2, edx, 2 2624 vpinsrd xmm2, xmm2, eax, 3 2625 vinserti128 ymm1, ymm2, xmm1, 1 2626 vpand ymm1, ymm1, ymm0 2627 vmovdqu ymmword ptr [r15 - 96], ymm1 2628 vmovdqu ymm1, ymmword ptr [rbx + 28] 2629 vpsrlvd ymm1, ymm1, ymm8 2630 vmovdqu xmm2, xmmword ptr [rbx + 44] 2631 vpshufd xmm3, xmm2, 249 # xmm3 = xmm2[1,2,3,3] 2632 vpinsrd xmm3, xmm3, dword ptr [rbx + 60], 3 2633 vpalignr xmm2, xmm2, xmmword ptr [rbx + 28], 4 # xmm2 = mem[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 2634 vinserti128 ymm2, ymm2, xmm3, 1 2635 vpsllvd ymm2, ymm2, ymm9 2636 vpor ymm1, ymm1, ymm2 2637 vpand ymm1, ymm1, ymm0 2638 vmovdqu ymmword ptr [r15 - 64], ymm1 2639 vmovdqu ymm1, ymmword ptr [rbx + 60] 2640 vmovdqu xmm2, xmmword ptr [rbx + 76] 2641 vpshufd xmm3, xmm2, 249 # xmm3 = xmm2[1,2,3,3] 2642 vpinsrd xmm3, xmm3, dword ptr [rbx + 92], 3 2643 vpsrlvd ymm1, ymm1, ymm10 2644 vpalignr xmm2, xmm2, xmmword ptr [rbx + 60], 4 # xmm2 = mem[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 2645 vinserti128 ymm2, ymm2, xmm3, 1 2646 vpsllvd ymm2, ymm2, ymm4 2647 vpor ymm1, ymm1, ymm2 2648 vpand ymm1, ymm1, ymm0 2649 vmovdqu ymmword ptr [r15 - 32], ymm1 2650 mov eax, dword ptr [rbx + 120] 2651 mov ecx, dword ptr [rbx + 116] 2652 mov edx, eax 2653 shld edx, ecx, 30 2654 mov esi, dword ptr [rbx + 112] 2655 shld ecx, esi, 29 2656 mov edi, dword ptr [rbx + 108] 2657 shld esi, edi, 28 2658 vmovdqu xmm1, xmmword ptr [rbx + 92] 2659 vpsrlvd xmm2, xmm1, xmm5 2660 vpshufd xmm1, xmm1, 249 # xmm1 = xmm1[1,2,3,3] 2661 vpinsrd xmm1, xmm1, edi, 3 2662 vpsllvd xmm1, xmm1, xmm6 2663 vmovd xmm3, esi 2664 vpinsrd xmm3, xmm3, ecx, 1 2665 vpinsrd xmm3, xmm3, edx, 2 2666 vpinsrd xmm3, xmm3, eax, 3 2667 vpor xmm1, xmm2, xmm1 2668 vinserti128 ymm1, ymm1, xmm3, 1 2669 vpsrlvd ymm1, ymm1, ymm7 2670 vpand ymm1, ymm1, ymm0 2671 vmovdqu ymmword ptr [r15], ymm1 2672 add rbx, 124 2673 sub r15, -128 2674 add r8, -1 2675 jne .LBB0_98 2676 jmp .LBB0_147 2677 .LBB0_144: 2678 cmp edx, 32 2679 jl .LBB0_147 2680 # %bb.145: 2681 mov ebx, r14d 2682 .p2align 4, 0x90 2683 .LBB0_146: # =>This Inner Loop Header: Depth=1 2684 mov edx, 128 2685 mov rdi, r15 2686 xor esi, esi 2687 call clib·_memset(SB) 2688 sub r15, -128 2689 add rbx, -1 2690 jne .LBB0_146 2691 jmp .LBB0_147 2692 .LBB0_120: 2693 cmp edx, 32 2694 jl .LBB0_147 2695 # %bb.121: 2696 mov eax, r14d 2697 xor ecx, ecx 2698 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_80] # ymm0 = [68719476736,68719476736,68719476736,68719476736] 2699 vpxor xmm1, xmm1, xmm1 2700 .p2align 4, 0x90 2701 .LBB0_122: # =>This Inner Loop Header: Depth=1 2702 vmovdqu xmm2, xmmword ptr [rbx + rcx] 2703 vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3] 2704 vpshufd ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1,4,4,5,5] 2705 vpsrlvd ymm2, ymm2, ymm0 2706 vpblendw ymm2, ymm2, ymm1, 170 # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 2707 vmovdqu ymmword ptr [r15 + 2*rcx], ymm2 2708 vmovdqu xmm2, xmmword ptr [rbx + rcx + 16] 2709 vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3] 2710 vpshufd ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1,4,4,5,5] 2711 vpsrlvd ymm2, ymm2, ymm0 2712 vpblendw ymm2, ymm2, ymm1, 170 # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 2713 vmovdqu ymmword ptr [r15 + 2*rcx + 32], ymm2 2714 vmovdqu xmm2, xmmword ptr [rbx + rcx + 32] 2715 vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3] 2716 vpshufd ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1,4,4,5,5] 2717 vpsrlvd ymm2, ymm2, ymm0 2718 vpblendw ymm2, ymm2, ymm1, 170 # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 2719 vmovdqu ymmword ptr [r15 + 2*rcx + 64], ymm2 2720 vmovdqu xmm2, xmmword ptr [rbx + rcx + 48] 2721 vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3] 2722 vpshufd ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1,4,4,5,5] 2723 vpsrlvd ymm2, ymm2, ymm0 2724 vpblendw ymm2, ymm2, ymm1, 170 # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 2725 vmovdqu ymmword ptr [r15 + 2*rcx + 96], ymm2 2726 add rcx, 64 2727 add rax, -1 2728 jne .LBB0_122 2729 jmp .LBB0_147 2730 .LBB0_132: 2731 cmp edx, 32 2732 jl .LBB0_147 2733 # %bb.133: 2734 mov eax, r14d 2735 xor ecx, ecx 2736 vbroadcasti128 ymm0, xmmword ptr [rip + .LCPI0_109] # ymm0 = [0,8,16,24,0,8,16,24] 2737 # ymm0 = mem[0,1,0,1] 2738 vpbroadcastd ymm1, dword ptr [rip + .LCPI0_110] # ymm1 = [255,255,255,255,255,255,255,255] 2739 .p2align 4, 0x90 2740 .LBB0_134: # =>This Inner Loop Header: Depth=1 2741 vmovq xmm2, qword ptr [rbx + rcx] # xmm2 = mem[0],zero 2742 vpshufd xmm2, xmm2, 80 # xmm2 = xmm2[0,0,1,1] 2743 vpermq ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1] 2744 vpsrlvd ymm2, ymm2, ymm0 2745 vpand ymm2, ymm2, ymm1 2746 vmovdqu ymmword ptr [r15 + 4*rcx], ymm2 2747 vmovq xmm2, qword ptr [rbx + rcx + 8] # xmm2 = mem[0],zero 2748 vpshufd xmm2, xmm2, 80 # xmm2 = xmm2[0,0,1,1] 2749 vpermq ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1] 2750 vpsrlvd ymm2, ymm2, ymm0 2751 vpand ymm2, ymm2, ymm1 2752 vmovdqu ymmword ptr [r15 + 4*rcx + 32], ymm2 2753 vmovq xmm2, qword ptr [rbx + rcx + 16] # xmm2 = mem[0],zero 2754 vpshufd xmm2, xmm2, 80 # xmm2 = xmm2[0,0,1,1] 2755 vpermq ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1] 2756 vpsrlvd ymm2, ymm2, ymm0 2757 vpand ymm2, ymm2, ymm1 2758 vmovdqu ymmword ptr [r15 + 4*rcx + 64], ymm2 2759 vmovq xmm2, qword ptr [rbx + rcx + 24] # xmm2 = mem[0],zero 2760 vpshufd xmm2, xmm2, 80 # xmm2 = xmm2[0,0,1,1] 2761 vpermq ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1] 2762 vpsrlvd ymm2, ymm2, ymm0 2763 vpand ymm2, ymm2, ymm1 2764 vmovdqu ymmword ptr [r15 + 4*rcx + 96], ymm2 2765 add rcx, 32 2766 add rax, -1 2767 jne .LBB0_134 2768 jmp .LBB0_147 2769 .LBB0_108: 2770 cmp edx, 32 2771 jl .LBB0_147 2772 # %bb.109: 2773 mov r8d, r14d 2774 add r15, 96 2775 add rbx, 92 2776 vbroadcasti128 ymm0, xmmword ptr [rip + .LCPI0_46] # ymm0 = [0,0,0,8,0,0,0,8] 2777 # ymm0 = mem[0,1,0,1] 2778 vpbroadcastd ymm1, dword ptr [rip + .LCPI0_47] # ymm1 = [16777215,16777215,16777215,16777215,16777215,16777215,16777215,16777215] 2779 .p2align 4, 0x90 2780 .LBB0_110: # =>This Inner Loop Header: Depth=1 2781 mov r9d, dword ptr [rbx - 72] 2782 mov edx, dword ptr [rbx - 76] 2783 mov esi, r9d 2784 mov edi, dword ptr [rbx - 80] 2785 mov r10d, dword ptr [rbx - 84] 2786 shld esi, edx, 16 2787 mov r11d, dword ptr [rbx - 92] 2788 mov eax, dword ptr [rbx - 88] 2789 shld edx, edi, 8 2790 mov ecx, r10d 2791 shld ecx, eax, 16 2792 shld eax, r11d, 8 2793 vmovd xmm2, edi 2794 vmovd xmm3, r11d 2795 vpinsrd xmm2, xmm2, edx, 1 2796 vpinsrd xmm3, xmm3, eax, 1 2797 vpinsrd xmm2, xmm2, esi, 2 2798 vpinsrd xmm3, xmm3, ecx, 2 2799 vpinsrd xmm2, xmm2, r9d, 3 2800 vpinsrd xmm3, xmm3, r10d, 3 2801 vinserti128 ymm2, ymm3, xmm2, 1 2802 vpsrlvd ymm2, ymm2, ymm0 2803 vpand ymm2, ymm2, ymm1 2804 vmovdqu ymmword ptr [r15 - 96], ymm2 2805 mov r9d, dword ptr [rbx - 48] 2806 mov ecx, dword ptr [rbx - 52] 2807 mov edx, r9d 2808 mov esi, dword ptr [rbx - 56] 2809 mov r10d, dword ptr [rbx - 60] 2810 shld edx, ecx, 16 2811 mov r11d, dword ptr [rbx - 68] 2812 mov edi, dword ptr [rbx - 64] 2813 shld ecx, esi, 8 2814 mov eax, r10d 2815 shld eax, edi, 16 2816 shld edi, r11d, 8 2817 vmovd xmm2, esi 2818 vmovd xmm3, r11d 2819 vpinsrd xmm2, xmm2, ecx, 1 2820 vpinsrd xmm3, xmm3, edi, 1 2821 vpinsrd xmm2, xmm2, edx, 2 2822 vpinsrd xmm3, xmm3, eax, 2 2823 vpinsrd xmm2, xmm2, r9d, 3 2824 vpinsrd xmm3, xmm3, r10d, 3 2825 vinserti128 ymm2, ymm3, xmm2, 1 2826 vpsrlvd ymm2, ymm2, ymm0 2827 vpand ymm2, ymm2, ymm1 2828 vmovdqu ymmword ptr [r15 - 64], ymm2 2829 mov r9d, dword ptr [rbx - 24] 2830 mov ecx, dword ptr [rbx - 28] 2831 mov edx, r9d 2832 mov esi, dword ptr [rbx - 32] 2833 mov r10d, dword ptr [rbx - 36] 2834 shld edx, ecx, 16 2835 mov r11d, dword ptr [rbx - 44] 2836 mov edi, dword ptr [rbx - 40] 2837 shld ecx, esi, 8 2838 mov eax, r10d 2839 shld eax, edi, 16 2840 shld edi, r11d, 8 2841 vmovd xmm2, esi 2842 vmovd xmm3, r11d 2843 vpinsrd xmm2, xmm2, ecx, 1 2844 vpinsrd xmm3, xmm3, edi, 1 2845 vpinsrd xmm2, xmm2, edx, 2 2846 vpinsrd xmm3, xmm3, eax, 2 2847 vpinsrd xmm2, xmm2, r9d, 3 2848 vpinsrd xmm3, xmm3, r10d, 3 2849 vinserti128 ymm2, ymm3, xmm2, 1 2850 vpsrlvd ymm2, ymm2, ymm0 2851 vpand ymm2, ymm2, ymm1 2852 vmovdqu ymmword ptr [r15 - 32], ymm2 2853 mov r9d, dword ptr [rbx] 2854 mov ecx, dword ptr [rbx - 4] 2855 mov edx, r9d 2856 mov esi, dword ptr [rbx - 8] 2857 mov r10d, dword ptr [rbx - 12] 2858 shld edx, ecx, 16 2859 mov r11d, dword ptr [rbx - 20] 2860 mov edi, dword ptr [rbx - 16] 2861 shld ecx, esi, 8 2862 mov eax, r10d 2863 shld eax, edi, 16 2864 shld edi, r11d, 8 2865 vmovd xmm2, esi 2866 vpinsrd xmm2, xmm2, ecx, 1 2867 vmovd xmm3, r11d 2868 vpinsrd xmm2, xmm2, edx, 2 2869 vpinsrd xmm3, xmm3, edi, 1 2870 vpinsrd xmm2, xmm2, r9d, 3 2871 vpinsrd xmm3, xmm3, eax, 2 2872 vpinsrd xmm3, xmm3, r10d, 3 2873 vinserti128 ymm2, ymm3, xmm2, 1 2874 vpsrlvd ymm2, ymm2, ymm0 2875 vpand ymm2, ymm2, ymm1 2876 vmovdqu ymmword ptr [r15], ymm2 2877 sub r15, -128 2878 add rbx, 96 2879 add r8, -1 2880 jne .LBB0_110 2881 jmp .LBB0_147 2882 .LBB0_138: 2883 cmp edx, 32 2884 jl .LBB0_147 2885 # %bb.139: 2886 mov eax, r14d 2887 xor ecx, ecx 2888 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_124] # ymm0 = [0,4,8,12,16,20,24,28] 2889 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_125] # ymm1 = [64424509455,64424509455,64424509455,64424509455] 2890 .p2align 4, 0x90 2891 .LBB0_140: # =>This Inner Loop Header: Depth=1 2892 vpbroadcastd ymm2, dword ptr [rbx + rcx] 2893 vpsrlvd ymm2, ymm2, ymm0 2894 vpand ymm2, ymm2, ymm1 2895 vmovdqu ymmword ptr [r15 + 8*rcx], ymm2 2896 vpbroadcastd ymm2, dword ptr [rbx + rcx + 4] 2897 vpsrlvd ymm2, ymm2, ymm0 2898 vpand ymm2, ymm2, ymm1 2899 vmovdqu ymmword ptr [r15 + 8*rcx + 32], ymm2 2900 vpbroadcastd ymm2, dword ptr [rbx + rcx + 8] 2901 vpsrlvd ymm2, ymm2, ymm0 2902 vpand ymm2, ymm2, ymm1 2903 vmovdqu ymmword ptr [r15 + 8*rcx + 64], ymm2 2904 vpbroadcastd ymm2, dword ptr [rbx + rcx + 12] 2905 vpsrlvd ymm2, ymm2, ymm0 2906 vpand ymm2, ymm2, ymm1 2907 vmovdqu ymmword ptr [r15 + 8*rcx + 96], ymm2 2908 add rcx, 16 2909 add rax, -1 2910 jne .LBB0_140 2911 jmp .LBB0_147 2912 .LBB0_114: 2913 cmp edx, 32 2914 jl .LBB0_147 2915 # %bb.115: 2916 mov r8d, r14d 2917 add r15, 96 2918 add rbx, 76 2919 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_65] # ymm0 = [0,0,8,0,0,4,0,12] 2920 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_66] # ymm1 = [4503595333451775,4503595333451775,4503595333451775,4503595333451775] 2921 .p2align 4, 0x90 2922 .LBB0_116: # =>This Inner Loop Header: Depth=1 2923 mov r9d, dword ptr [rbx - 60] 2924 mov r11d, dword ptr [rbx - 64] 2925 mov esi, r9d 2926 shld esi, r11d, 8 2927 mov edi, dword ptr [rbx - 68] 2928 mov edx, r11d 2929 shld edx, edi, 16 2930 mov eax, dword ptr [rbx - 72] 2931 shld edi, eax, 4 2932 mov r10d, dword ptr [rbx - 76] 2933 mov ecx, eax 2934 shld ecx, r10d, 12 2935 vmovd xmm2, edx 2936 vpinsrd xmm2, xmm2, r11d, 1 2937 vpinsrd xmm2, xmm2, esi, 2 2938 vpinsrd xmm2, xmm2, r9d, 3 2939 vmovd xmm3, r10d 2940 vpinsrd xmm3, xmm3, ecx, 1 2941 vpinsrd xmm3, xmm3, eax, 2 2942 vpinsrd xmm3, xmm3, edi, 3 2943 vinserti128 ymm2, ymm3, xmm2, 1 2944 vpsrlvd ymm2, ymm2, ymm0 2945 vpand ymm2, ymm2, ymm1 2946 vmovdqu ymmword ptr [r15 - 96], ymm2 2947 mov r9d, dword ptr [rbx - 40] 2948 mov r11d, dword ptr [rbx - 44] 2949 mov edx, r9d 2950 shld edx, r11d, 8 2951 mov esi, dword ptr [rbx - 48] 2952 mov edi, r11d 2953 shld edi, esi, 16 2954 mov r10d, dword ptr [rbx - 56] 2955 mov ecx, dword ptr [rbx - 52] 2956 shld esi, ecx, 4 2957 mov eax, ecx 2958 shld eax, r10d, 12 2959 vmovd xmm2, edi 2960 vpinsrd xmm2, xmm2, r11d, 1 2961 vpinsrd xmm2, xmm2, edx, 2 2962 vpinsrd xmm2, xmm2, r9d, 3 2963 vmovd xmm3, r10d 2964 vpinsrd xmm3, xmm3, eax, 1 2965 vpinsrd xmm3, xmm3, ecx, 2 2966 vpinsrd xmm3, xmm3, esi, 3 2967 vinserti128 ymm2, ymm3, xmm2, 1 2968 vpsrlvd ymm2, ymm2, ymm0 2969 vpand ymm2, ymm2, ymm1 2970 vmovdqu ymmword ptr [r15 - 64], ymm2 2971 mov r9d, dword ptr [rbx - 20] 2972 mov r11d, dword ptr [rbx - 24] 2973 mov edx, r9d 2974 shld edx, r11d, 8 2975 mov esi, dword ptr [rbx - 28] 2976 mov edi, r11d 2977 shld edi, esi, 16 2978 mov ecx, dword ptr [rbx - 32] 2979 shld esi, ecx, 4 2980 mov r10d, dword ptr [rbx - 36] 2981 mov eax, ecx 2982 shld eax, r10d, 12 2983 vmovd xmm2, edi 2984 vpinsrd xmm2, xmm2, r11d, 1 2985 vpinsrd xmm2, xmm2, edx, 2 2986 vpinsrd xmm2, xmm2, r9d, 3 2987 vmovd xmm3, r10d 2988 vpinsrd xmm3, xmm3, eax, 1 2989 vpinsrd xmm3, xmm3, ecx, 2 2990 vpinsrd xmm3, xmm3, esi, 3 2991 vinserti128 ymm2, ymm3, xmm2, 1 2992 vpsrlvd ymm2, ymm2, ymm0 2993 vpand ymm2, ymm2, ymm1 2994 vmovdqu ymmword ptr [r15 - 32], ymm2 2995 mov r9d, dword ptr [rbx] 2996 mov r11d, dword ptr [rbx - 4] 2997 mov edx, r9d 2998 shld edx, r11d, 8 2999 mov esi, dword ptr [rbx - 8] 3000 mov edi, r11d 3001 shld edi, esi, 16 3002 mov r10d, dword ptr [rbx - 16] 3003 mov ecx, dword ptr [rbx - 12] 3004 shld esi, ecx, 4 3005 mov eax, ecx 3006 shld eax, r10d, 12 3007 vmovd xmm2, edi 3008 vpinsrd xmm2, xmm2, r11d, 1 3009 vpinsrd xmm2, xmm2, edx, 2 3010 vpinsrd xmm2, xmm2, r9d, 3 3011 vmovd xmm3, r10d 3012 vpinsrd xmm3, xmm3, eax, 1 3013 vpinsrd xmm3, xmm3, ecx, 2 3014 vpinsrd xmm3, xmm3, esi, 3 3015 vinserti128 ymm2, ymm3, xmm2, 1 3016 vpsrlvd ymm2, ymm2, ymm0 3017 vpand ymm2, ymm2, ymm1 3018 vmovdqu ymmword ptr [r15], ymm2 3019 sub r15, -128 3020 add rbx, 80 3021 add r8, -1 3022 jne .LBB0_116 3023 jmp .LBB0_147 3024 .LBB0_126: 3025 cmp edx, 32 3026 jl .LBB0_147 3027 # %bb.127: 3028 mov r8d, r14d 3029 add r15, 96 3030 add rbx, 44 3031 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_94] # ymm0 = [0,12,0,4,16,0,8,20] 3032 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_95] # ymm1 = [17587891081215,17587891081215,17587891081215,17587891081215] 3033 .p2align 4, 0x90 3034 .LBB0_128: # =>This Inner Loop Header: Depth=1 3035 mov ecx, dword ptr [rbx - 36] 3036 mov edx, dword ptr [rbx - 44] 3037 mov esi, dword ptr [rbx - 40] 3038 mov edi, ecx 3039 shld edi, esi, 4 3040 mov eax, esi 3041 shld eax, edx, 8 3042 vmovd xmm2, esi 3043 vpinsrd xmm2, xmm2, edi, 1 3044 vpinsrd xmm2, xmm2, ecx, 2 3045 vpinsrd xmm2, xmm2, ecx, 3 3046 vmovd xmm3, edx 3047 vpinsrd xmm3, xmm3, edx, 1 3048 vpinsrd xmm3, xmm3, eax, 2 3049 vpinsrd xmm3, xmm3, esi, 3 3050 vinserti128 ymm2, ymm3, xmm2, 1 3051 vpsrlvd ymm2, ymm2, ymm0 3052 vpand ymm2, ymm2, ymm1 3053 vmovdqu ymmword ptr [r15 - 96], ymm2 3054 mov eax, dword ptr [rbx - 24] 3055 mov ecx, dword ptr [rbx - 32] 3056 mov edx, dword ptr [rbx - 28] 3057 mov esi, eax 3058 shld esi, edx, 4 3059 mov edi, edx 3060 shld edi, ecx, 8 3061 vmovd xmm2, edx 3062 vpinsrd xmm2, xmm2, esi, 1 3063 vpinsrd xmm2, xmm2, eax, 2 3064 vpinsrd xmm2, xmm2, eax, 3 3065 vmovd xmm3, ecx 3066 vpinsrd xmm3, xmm3, ecx, 1 3067 vpinsrd xmm3, xmm3, edi, 2 3068 vpinsrd xmm3, xmm3, edx, 3 3069 vinserti128 ymm2, ymm3, xmm2, 1 3070 vpsrlvd ymm2, ymm2, ymm0 3071 vpand ymm2, ymm2, ymm1 3072 vmovdqu ymmword ptr [r15 - 64], ymm2 3073 mov eax, dword ptr [rbx - 12] 3074 mov ecx, dword ptr [rbx - 20] 3075 mov edx, dword ptr [rbx - 16] 3076 mov esi, eax 3077 shld esi, edx, 4 3078 mov edi, edx 3079 shld edi, ecx, 8 3080 vmovd xmm2, edx 3081 vpinsrd xmm2, xmm2, esi, 1 3082 vpinsrd xmm2, xmm2, eax, 2 3083 vpinsrd xmm2, xmm2, eax, 3 3084 vmovd xmm3, ecx 3085 vpinsrd xmm3, xmm3, ecx, 1 3086 vpinsrd xmm3, xmm3, edi, 2 3087 vpinsrd xmm3, xmm3, edx, 3 3088 vinserti128 ymm2, ymm3, xmm2, 1 3089 vpsrlvd ymm2, ymm2, ymm0 3090 vpand ymm2, ymm2, ymm1 3091 vmovdqu ymmword ptr [r15 - 32], ymm2 3092 mov eax, dword ptr [rbx] 3093 mov ecx, dword ptr [rbx - 8] 3094 mov edx, dword ptr [rbx - 4] 3095 mov esi, eax 3096 shld esi, edx, 4 3097 mov edi, edx 3098 shld edi, ecx, 8 3099 vmovd xmm2, edx 3100 vpinsrd xmm2, xmm2, esi, 1 3101 vpinsrd xmm2, xmm2, eax, 2 3102 vpinsrd xmm2, xmm2, eax, 3 3103 vmovd xmm3, ecx 3104 vpinsrd xmm3, xmm3, ecx, 1 3105 vpinsrd xmm3, xmm3, edi, 2 3106 vpinsrd xmm3, xmm3, edx, 3 3107 vinserti128 ymm2, ymm3, xmm2, 1 3108 vpsrlvd ymm2, ymm2, ymm0 3109 vpand ymm2, ymm2, ymm1 3110 vmovdqu ymmword ptr [r15], ymm2 3111 sub r15, -128 3112 add rbx, 48 3113 add r8, -1 3114 jne .LBB0_128 3115 jmp .LBB0_147 3116 .LBB0_102: 3117 cmp edx, 32 3118 jl .LBB0_147 3119 # %bb.103: 3120 mov r8d, r14d 3121 add r15, 96 3122 add rbx, 108 3123 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_22] # ymm0 = [0,0,0,0,0,0,0,4] 3124 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_23] # ymm1 = [1152921500580315135,1152921500580315135,1152921500580315135,1152921500580315135] 3125 .p2align 4, 0x90 3126 .LBB0_104: # =>This Inner Loop Header: Depth=1 3127 mov r9d, dword ptr [rbx - 84] 3128 mov edx, dword ptr [rbx - 88] 3129 mov r10d, r9d 3130 shld r10d, edx, 24 3131 mov edi, dword ptr [rbx - 92] 3132 shld edx, edi, 20 3133 mov eax, dword ptr [rbx - 96] 3134 shld edi, eax, 16 3135 mov ecx, dword ptr [rbx - 100] 3136 shld eax, ecx, 12 3137 mov r11d, dword ptr [rbx - 108] 3138 mov esi, dword ptr [rbx - 104] 3139 shld ecx, esi, 8 3140 shld esi, r11d, 4 3141 vmovd xmm2, r11d 3142 vmovd xmm3, edi 3143 vpinsrd xmm2, xmm2, esi, 1 3144 vpinsrd xmm3, xmm3, edx, 1 3145 vpinsrd xmm2, xmm2, ecx, 2 3146 vpinsrd xmm3, xmm3, r10d, 2 3147 vpinsrd xmm2, xmm2, eax, 3 3148 vpinsrd xmm3, xmm3, r9d, 3 3149 vinserti128 ymm2, ymm2, xmm3, 1 3150 vpsrlvd ymm2, ymm2, ymm0 3151 vpand ymm2, ymm2, ymm1 3152 vmovdqu ymmword ptr [r15 - 96], ymm2 3153 mov r9d, dword ptr [rbx - 56] 3154 mov ecx, dword ptr [rbx - 60] 3155 mov r10d, r9d 3156 shld r10d, ecx, 24 3157 mov esi, dword ptr [rbx - 64] 3158 shld ecx, esi, 20 3159 mov edi, dword ptr [rbx - 68] 3160 shld esi, edi, 16 3161 mov eax, dword ptr [rbx - 72] 3162 shld edi, eax, 12 3163 mov r11d, dword ptr [rbx - 80] 3164 mov edx, dword ptr [rbx - 76] 3165 shld eax, edx, 8 3166 shld edx, r11d, 4 3167 vmovd xmm2, r11d 3168 vmovd xmm3, esi 3169 vpinsrd xmm2, xmm2, edx, 1 3170 vpinsrd xmm3, xmm3, ecx, 1 3171 vpinsrd xmm2, xmm2, eax, 2 3172 vpinsrd xmm3, xmm3, r10d, 2 3173 vpinsrd xmm2, xmm2, edi, 3 3174 vpinsrd xmm3, xmm3, r9d, 3 3175 vinserti128 ymm2, ymm2, xmm3, 1 3176 vpsrlvd ymm2, ymm2, ymm0 3177 vpand ymm2, ymm2, ymm1 3178 vmovdqu ymmword ptr [r15 - 64], ymm2 3179 mov r9d, dword ptr [rbx - 28] 3180 mov ecx, dword ptr [rbx - 32] 3181 mov r10d, r9d 3182 shld r10d, ecx, 24 3183 mov esi, dword ptr [rbx - 36] 3184 shld ecx, esi, 20 3185 mov edi, dword ptr [rbx - 40] 3186 shld esi, edi, 16 3187 mov eax, dword ptr [rbx - 44] 3188 shld edi, eax, 12 3189 mov r11d, dword ptr [rbx - 52] 3190 mov edx, dword ptr [rbx - 48] 3191 shld eax, edx, 8 3192 shld edx, r11d, 4 3193 vmovd xmm2, r11d 3194 vmovd xmm3, esi 3195 vpinsrd xmm2, xmm2, edx, 1 3196 vpinsrd xmm3, xmm3, ecx, 1 3197 vpinsrd xmm2, xmm2, eax, 2 3198 vpinsrd xmm3, xmm3, r10d, 2 3199 vpinsrd xmm2, xmm2, edi, 3 3200 vpinsrd xmm3, xmm3, r9d, 3 3201 vinserti128 ymm2, ymm2, xmm3, 1 3202 vpsrlvd ymm2, ymm2, ymm0 3203 vpand ymm2, ymm2, ymm1 3204 vmovdqu ymmword ptr [r15 - 32], ymm2 3205 mov r9d, dword ptr [rbx] 3206 mov ecx, dword ptr [rbx - 4] 3207 mov r10d, r9d 3208 shld r10d, ecx, 24 3209 mov esi, dword ptr [rbx - 8] 3210 shld ecx, esi, 20 3211 mov edi, dword ptr [rbx - 12] 3212 shld esi, edi, 16 3213 mov eax, dword ptr [rbx - 16] 3214 shld edi, eax, 12 3215 mov r11d, dword ptr [rbx - 24] 3216 mov edx, dword ptr [rbx - 20] 3217 shld eax, edx, 8 3218 shld edx, r11d, 4 3219 vmovd xmm2, r11d 3220 vmovd xmm3, esi 3221 vpinsrd xmm2, xmm2, edx, 1 3222 vpinsrd xmm3, xmm3, ecx, 1 3223 vpinsrd xmm2, xmm2, eax, 2 3224 vpinsrd xmm3, xmm3, r10d, 2 3225 vpinsrd xmm2, xmm2, edi, 3 3226 vpinsrd xmm3, xmm3, r9d, 3 3227 vinserti128 ymm2, ymm2, xmm3, 1 3228 vpsrlvd ymm2, ymm2, ymm0 3229 vpand ymm2, ymm2, ymm1 3230 vmovdqu ymmword ptr [r15], ymm2 3231 sub r15, -128 3232 add rbx, 112 3233 add r8, -1 3234 jne .LBB0_104 3235 jmp .LBB0_147 3236 .LBB0_141: 3237 cmp edx, 32 3238 jl .LBB0_147 3239 # %bb.142: 3240 mov eax, r14d 3241 add r15, 96 3242 xor ecx, ecx 3243 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_131] # ymm0 = [0,2,4,6,8,10,12,14] 3244 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_132] # ymm1 = [12884901891,12884901891,12884901891,12884901891] 3245 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_133] # ymm2 = [16,18,20,22,24,26,28,30] 3246 .p2align 4, 0x90 3247 .LBB0_143: # =>This Inner Loop Header: Depth=1 3248 vpbroadcastd ymm3, dword ptr [rbx + 8*rcx] 3249 vpsrlvd ymm3, ymm3, ymm0 3250 vpand ymm3, ymm3, ymm1 3251 vmovdqu ymmword ptr [r15 - 96], ymm3 3252 vpbroadcastd ymm3, dword ptr [rbx + 8*rcx] 3253 vpsrlvd ymm3, ymm3, ymm2 3254 vpand ymm3, ymm3, ymm1 3255 vmovdqu ymmword ptr [r15 - 64], ymm3 3256 vpbroadcastd ymm3, dword ptr [rbx + 8*rcx + 4] 3257 vpsrlvd ymm3, ymm3, ymm0 3258 vpand ymm3, ymm3, ymm1 3259 vmovdqu ymmword ptr [r15 - 32], ymm3 3260 vpbroadcastd ymm3, dword ptr [rbx + 8*rcx + 4] 3261 vpsrlvd ymm3, ymm3, ymm2 3262 vpand ymm3, ymm3, ymm1 3263 vmovdqu ymmword ptr [r15], ymm3 3264 add rcx, 1 3265 sub r15, -128 3266 cmp rax, rcx 3267 jne .LBB0_143 3268 jmp .LBB0_147 3269 .LBB0_117: 3270 cmp edx, 32 3271 jl .LBB0_147 3272 # %bb.118: 3273 mov r8d, r14d 3274 add r15, 96 3275 add rbx, 68 3276 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_72] # ymm0 = [0,0,4,0,8,0,12,0] 3277 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_73] # ymm1 = [1125895612137471,1125895612137471,1125895612137471,1125895612137471] 3278 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_74] # ymm2 = [0,2,0,6,0,10,0,14] 3279 .p2align 4, 0x90 3280 .LBB0_119: # =>This Inner Loop Header: Depth=1 3281 mov ecx, dword ptr [rbx - 56] 3282 mov r10d, dword ptr [rbx - 52] 3283 shld r10d, ecx, 2 3284 mov esi, dword ptr [rbx - 60] 3285 mov edi, ecx 3286 shld edi, esi, 6 3287 mov r9d, dword ptr [rbx - 68] 3288 mov edx, dword ptr [rbx - 64] 3289 mov eax, edx 3290 shld eax, r9d, 14 3291 vmovd xmm3, esi 3292 shld esi, edx, 10 3293 vpinsrd xmm3, xmm3, edi, 1 3294 vpinsrd xmm3, xmm3, ecx, 2 3295 vpinsrd xmm3, xmm3, r10d, 3 3296 vmovd xmm4, r9d 3297 vpinsrd xmm4, xmm4, eax, 1 3298 vpinsrd xmm4, xmm4, edx, 2 3299 vpinsrd xmm4, xmm4, esi, 3 3300 vinserti128 ymm3, ymm4, xmm3, 1 3301 vpsrlvd ymm3, ymm3, ymm0 3302 vpand ymm3, ymm3, ymm1 3303 vmovdqu ymmword ptr [r15 - 96], ymm3 3304 mov r9d, dword ptr [rbx - 36] 3305 mov r11d, dword ptr [rbx - 40] 3306 mov edx, r9d 3307 shld edx, r11d, 4 3308 mov r10d, dword ptr [rbx - 44] 3309 mov edi, r11d 3310 shld edi, r10d, 8 3311 mov eax, dword ptr [rbx - 52] 3312 mov esi, dword ptr [rbx - 48] 3313 mov ecx, r10d 3314 shld ecx, esi, 12 3315 shrd eax, esi, 16 3316 vmovd xmm3, edi 3317 vpinsrd xmm3, xmm3, r11d, 1 3318 vpinsrd xmm3, xmm3, edx, 2 3319 vpinsrd xmm3, xmm3, r9d, 3 3320 vmovd xmm4, eax 3321 vpinsrd xmm4, xmm4, esi, 1 3322 vpinsrd xmm4, xmm4, ecx, 2 3323 vpinsrd xmm4, xmm4, r10d, 3 3324 vinserti128 ymm3, ymm4, xmm3, 1 3325 vpsrlvd ymm3, ymm3, ymm2 3326 vpand ymm3, ymm3, ymm1 3327 vmovdqu ymmword ptr [r15 - 64], ymm3 3328 mov eax, dword ptr [rbx - 20] 3329 mov r10d, dword ptr [rbx - 16] 3330 shld r10d, eax, 2 3331 mov edx, dword ptr [rbx - 24] 3332 mov esi, eax 3333 shld esi, edx, 6 3334 mov r9d, dword ptr [rbx - 32] 3335 mov ecx, dword ptr [rbx - 28] 3336 mov edi, ecx 3337 shld edi, r9d, 14 3338 vmovd xmm3, edx 3339 shld edx, ecx, 10 3340 vpinsrd xmm3, xmm3, esi, 1 3341 vpinsrd xmm3, xmm3, eax, 2 3342 vpinsrd xmm3, xmm3, r10d, 3 3343 vmovd xmm4, r9d 3344 vpinsrd xmm4, xmm4, edi, 1 3345 vpinsrd xmm4, xmm4, ecx, 2 3346 vpinsrd xmm4, xmm4, edx, 3 3347 vinserti128 ymm3, ymm4, xmm3, 1 3348 vpsrlvd ymm3, ymm3, ymm0 3349 vpand ymm3, ymm3, ymm1 3350 vmovdqu ymmword ptr [r15 - 32], ymm3 3351 mov r9d, dword ptr [rbx] 3352 mov r11d, dword ptr [rbx - 4] 3353 mov edx, r9d 3354 shld edx, r11d, 4 3355 mov r10d, dword ptr [rbx - 8] 3356 mov edi, r11d 3357 shld edi, r10d, 8 3358 mov eax, dword ptr [rbx - 16] 3359 mov esi, dword ptr [rbx - 12] 3360 mov ecx, r10d 3361 shld ecx, esi, 12 3362 shrd eax, esi, 16 3363 vmovd xmm3, edi 3364 vpinsrd xmm3, xmm3, r11d, 1 3365 vpinsrd xmm3, xmm3, edx, 2 3366 vpinsrd xmm3, xmm3, r9d, 3 3367 vmovd xmm4, eax 3368 vpinsrd xmm4, xmm4, esi, 1 3369 vpinsrd xmm4, xmm4, ecx, 2 3370 vpinsrd xmm4, xmm4, r10d, 3 3371 vinserti128 ymm3, ymm4, xmm3, 1 3372 vpsrlvd ymm3, ymm3, ymm2 3373 vpand ymm3, ymm3, ymm1 3374 vmovdqu ymmword ptr [r15], ymm3 3375 sub r15, -128 3376 add rbx, 72 3377 add r8, -1 3378 jne .LBB0_119 3379 jmp .LBB0_147 3380 .LBB0_129: 3381 cmp edx, 32 3382 jl .LBB0_147 3383 # %bb.130: 3384 mov r8d, r14d 3385 add r15, 96 3386 add rbx, 36 3387 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_101] # ymm0 = [0,10,20,0,8,18,0,6] 3388 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_102] # ymm1 = [4393751544831,4393751544831,4393751544831,4393751544831] 3389 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_103] # ymm2 = [16,0,4,14,0,2,12,22] 3390 .p2align 4, 0x90 3391 .LBB0_131: # =>This Inner Loop Header: Depth=1 3392 mov ecx, dword ptr [rbx - 28] 3393 mov edx, dword ptr [rbx - 36] 3394 mov esi, dword ptr [rbx - 32] 3395 mov edi, ecx 3396 shld edi, esi, 4 3397 vmovd xmm3, esi 3398 vpinsrd xmm3, xmm3, esi, 1 3399 shld esi, edx, 2 3400 vpinsrd xmm3, xmm3, edi, 2 3401 vpinsrd xmm3, xmm3, ecx, 3 3402 vmovd xmm4, edx 3403 vpinsrd xmm4, xmm4, edx, 1 3404 vpinsrd xmm4, xmm4, edx, 2 3405 vpinsrd xmm4, xmm4, esi, 3 3406 vinserti128 ymm3, ymm4, xmm3, 1 3407 vpsrlvd ymm3, ymm3, ymm0 3408 vpand ymm3, ymm3, ymm1 3409 vmovdqu ymmword ptr [r15 - 96], ymm3 3410 mov ecx, dword ptr [rbx - 20] 3411 mov edx, dword ptr [rbx - 24] 3412 mov esi, ecx 3413 shld esi, edx, 8 3414 mov edi, dword ptr [rbx - 28] 3415 mov eax, edx 3416 shld eax, edi, 6 3417 vmovd xmm3, esi 3418 vpinsrd xmm3, xmm3, ecx, 1 3419 vpinsrd xmm3, xmm3, ecx, 2 3420 vpinsrd xmm3, xmm3, ecx, 3 3421 vmovd xmm4, edi 3422 vpinsrd xmm4, xmm4, eax, 1 3423 vpinsrd xmm4, xmm4, edx, 2 3424 vpinsrd xmm4, xmm4, edx, 3 3425 vinserti128 ymm3, ymm4, xmm3, 1 3426 vpsrlvd ymm3, ymm3, ymm2 3427 vpand ymm3, ymm3, ymm1 3428 vmovdqu ymmword ptr [r15 - 64], ymm3 3429 mov eax, dword ptr [rbx - 8] 3430 mov ecx, dword ptr [rbx - 16] 3431 mov edx, dword ptr [rbx - 12] 3432 mov esi, eax 3433 shld esi, edx, 4 3434 vmovd xmm3, edx 3435 vpinsrd xmm3, xmm3, edx, 1 3436 shld edx, ecx, 2 3437 vpinsrd xmm3, xmm3, esi, 2 3438 vpinsrd xmm3, xmm3, eax, 3 3439 vmovd xmm4, ecx 3440 vpinsrd xmm4, xmm4, ecx, 1 3441 vpinsrd xmm4, xmm4, ecx, 2 3442 vpinsrd xmm4, xmm4, edx, 3 3443 vinserti128 ymm3, ymm4, xmm3, 1 3444 vpsrlvd ymm3, ymm3, ymm0 3445 vpand ymm3, ymm3, ymm1 3446 vmovdqu ymmword ptr [r15 - 32], ymm3 3447 mov eax, dword ptr [rbx] 3448 mov ecx, dword ptr [rbx - 8] 3449 mov edx, dword ptr [rbx - 4] 3450 mov esi, eax 3451 shld esi, edx, 8 3452 mov edi, edx 3453 shld edi, ecx, 6 3454 vmovd xmm3, esi 3455 vpinsrd xmm3, xmm3, eax, 1 3456 vpinsrd xmm3, xmm3, eax, 2 3457 vpinsrd xmm3, xmm3, eax, 3 3458 vmovd xmm4, ecx 3459 vpinsrd xmm4, xmm4, edi, 1 3460 vpinsrd xmm4, xmm4, edx, 2 3461 vpinsrd xmm4, xmm4, edx, 3 3462 vinserti128 ymm3, ymm4, xmm3, 1 3463 vpsrlvd ymm3, ymm3, ymm2 3464 vpand ymm3, ymm3, ymm1 3465 vmovdqu ymmword ptr [r15], ymm3 3466 sub r15, -128 3467 add rbx, 40 3468 add r8, -1 3469 jne .LBB0_131 3470 jmp .LBB0_147 3471 .LBB0_105: 3472 cmp edx, 32 3473 jl .LBB0_147 3474 # %bb.106: 3475 mov r8d, r14d 3476 add r15, 96 3477 add rbx, 100 3478 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_34] # ymm0 = [288230371923853311,288230371923853311,288230371923853311,288230371923853311] 3479 vpbroadcastq xmm1, qword ptr [rip + .LCPI0_35] # xmm1 = [42949672976,42949672976] 3480 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_33] # ymm2 = [0,0,0,0,0,2,0,0] 3481 vpbroadcastq xmm3, qword ptr [rip + .LCPI0_36] # xmm3 = [94489280528,94489280528] 3482 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_37] # ymm4 = [0,0,4,0,0,0,0,6] 3483 .p2align 4, 0x90 3484 .LBB0_107: # =>This Inner Loop Header: Depth=1 3485 mov ecx, dword ptr [rbx - 80] 3486 mov r9d, dword ptr [rbx - 76] 3487 shld r9d, ecx, 10 3488 mov r11d, dword ptr [rbx - 84] 3489 shld ecx, r11d, 4 3490 mov edi, dword ptr [rbx - 88] 3491 mov esi, r11d 3492 shld esi, edi, 24 3493 mov edx, dword ptr [rbx - 92] 3494 shld edi, edx, 18 3495 mov r10d, dword ptr [rbx - 100] 3496 mov eax, dword ptr [rbx - 96] 3497 shld edx, eax, 12 3498 shld eax, r10d, 6 3499 vmovd xmm5, r10d 3500 vmovd xmm6, esi 3501 vpinsrd xmm5, xmm5, eax, 1 3502 vpinsrd xmm6, xmm6, r11d, 1 3503 vpinsrd xmm5, xmm5, edx, 2 3504 vpinsrd xmm6, xmm6, ecx, 2 3505 vpinsrd xmm5, xmm5, edi, 3 3506 vpinsrd xmm6, xmm6, r9d, 3 3507 vinserti128 ymm5, ymm5, xmm6, 1 3508 vpsrlvd ymm5, ymm5, ymm2 3509 vpand ymm5, ymm5, ymm0 3510 vmovdqu ymmword ptr [r15 - 96], ymm5 3511 mov r9d, dword ptr [rbx - 52] 3512 mov ecx, dword ptr [rbx - 56] 3513 mov edx, r9d 3514 shld edx, ecx, 20 3515 mov esi, dword ptr [rbx - 60] 3516 shld ecx, esi, 14 3517 mov edi, dword ptr [rbx - 68] 3518 mov eax, dword ptr [rbx - 64] 3519 shld esi, eax, 8 3520 shld eax, edi, 2 3521 vmovq xmm5, qword ptr [rbx - 76] # xmm5 = mem[0],zero 3522 vpsrlvd xmm6, xmm5, xmm1 3523 vpshufd xmm5, xmm5, 229 # xmm5 = xmm5[1,1,2,3] 3524 vpinsrd xmm5, xmm5, edi, 1 3525 vpsllvd xmm5, xmm5, xmm3 3526 vpor xmm5, xmm6, xmm5 3527 vmovd xmm6, esi 3528 vpinsrd xmm6, xmm6, ecx, 1 3529 vpinsrd xmm6, xmm6, edx, 2 3530 vpinsrd xmm6, xmm6, r9d, 3 3531 vpinsrd xmm5, xmm5, edi, 2 3532 vpinsrd xmm5, xmm5, eax, 3 3533 vinserti128 ymm5, ymm5, xmm6, 1 3534 vpsrlvd ymm5, ymm5, ymm4 3535 vpand ymm5, ymm5, ymm0 3536 vmovdqu ymmword ptr [r15 - 64], ymm5 3537 mov eax, dword ptr [rbx - 28] 3538 mov r9d, dword ptr [rbx - 24] 3539 shld r9d, eax, 10 3540 mov r11d, dword ptr [rbx - 32] 3541 shld eax, r11d, 4 3542 mov esi, dword ptr [rbx - 36] 3543 mov edi, r11d 3544 shld edi, esi, 24 3545 mov ecx, dword ptr [rbx - 40] 3546 shld esi, ecx, 18 3547 mov r10d, dword ptr [rbx - 48] 3548 mov edx, dword ptr [rbx - 44] 3549 shld ecx, edx, 12 3550 shld edx, r10d, 6 3551 vmovd xmm5, r10d 3552 vmovd xmm6, edi 3553 vpinsrd xmm5, xmm5, edx, 1 3554 vpinsrd xmm6, xmm6, r11d, 1 3555 vpinsrd xmm5, xmm5, ecx, 2 3556 vpinsrd xmm6, xmm6, eax, 2 3557 vpinsrd xmm5, xmm5, esi, 3 3558 vpinsrd xmm6, xmm6, r9d, 3 3559 vinserti128 ymm5, ymm5, xmm6, 1 3560 vpsrlvd ymm5, ymm5, ymm2 3561 vpand ymm5, ymm5, ymm0 3562 vmovdqu ymmword ptr [r15 - 32], ymm5 3563 mov r9d, dword ptr [rbx] 3564 mov ecx, dword ptr [rbx - 4] 3565 mov edx, r9d 3566 shld edx, ecx, 20 3567 mov esi, dword ptr [rbx - 8] 3568 shld ecx, esi, 14 3569 mov edi, dword ptr [rbx - 16] 3570 mov eax, dword ptr [rbx - 12] 3571 shld esi, eax, 8 3572 shld eax, edi, 2 3573 vmovq xmm5, qword ptr [rbx - 24] # xmm5 = mem[0],zero 3574 vpsrlvd xmm6, xmm5, xmm1 3575 vpshufd xmm5, xmm5, 229 # xmm5 = xmm5[1,1,2,3] 3576 vpinsrd xmm5, xmm5, edi, 1 3577 vpsllvd xmm5, xmm5, xmm3 3578 vpor xmm5, xmm6, xmm5 3579 vmovd xmm6, esi 3580 vpinsrd xmm6, xmm6, ecx, 1 3581 vpinsrd xmm6, xmm6, edx, 2 3582 vpinsrd xmm6, xmm6, r9d, 3 3583 vpinsrd xmm5, xmm5, edi, 2 3584 vpinsrd xmm5, xmm5, eax, 3 3585 vinserti128 ymm5, ymm5, xmm6, 1 3586 vpsrlvd ymm5, ymm5, ymm4 3587 vpand ymm5, ymm5, ymm0 3588 vmovdqu ymmword ptr [r15], ymm5 3589 sub r15, -128 3590 add rbx, 104 3591 add r8, -1 3592 jne .LBB0_107 3593 jmp .LBB0_147 3594 .LBB0_135: 3595 cmp edx, 32 3596 jl .LBB0_147 3597 # %bb.136: 3598 mov eax, r14d 3599 add r15, 96 3600 add rbx, 20 3601 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_116] # ymm0 = [0,6,12,18,24,0,4,10] 3602 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_117] # ymm1 = [270582939711,270582939711,270582939711,270582939711] 3603 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_118] # ymm2 = [16,22,0,2,8,14,20,26] 3604 .p2align 4, 0x90 3605 .LBB0_137: # =>This Inner Loop Header: Depth=1 3606 mov ecx, dword ptr [rbx - 20] 3607 mov edx, dword ptr [rbx - 16] 3608 mov esi, edx 3609 shld esi, ecx, 2 3610 vmovd xmm3, ecx 3611 vpbroadcastd xmm4, xmm3 3612 vpinsrd xmm3, xmm3, esi, 1 3613 vpinsrd xmm3, xmm3, edx, 2 3614 vpinsrd xmm3, xmm3, edx, 3 3615 vinserti128 ymm3, ymm4, xmm3, 1 3616 vpsrlvd ymm3, ymm3, ymm0 3617 vpand ymm3, ymm3, ymm1 3618 vmovdqu ymmword ptr [r15 - 96], ymm3 3619 mov ecx, dword ptr [rbx - 16] 3620 mov edx, dword ptr [rbx - 12] 3621 mov esi, edx 3622 shld esi, ecx, 4 3623 vmovd xmm3, ecx 3624 vpinsrd xmm3, xmm3, ecx, 1 3625 vpinsrd xmm3, xmm3, esi, 2 3626 vpinsrd xmm3, xmm3, edx, 3 3627 vmovd xmm4, edx 3628 vpbroadcastd xmm4, xmm4 3629 vinserti128 ymm3, ymm3, xmm4, 1 3630 vpsrlvd ymm3, ymm3, ymm2 3631 vpand ymm3, ymm3, ymm1 3632 vmovdqu ymmword ptr [r15 - 64], ymm3 3633 mov ecx, dword ptr [rbx - 8] 3634 mov edx, dword ptr [rbx - 4] 3635 mov esi, edx 3636 shld esi, ecx, 2 3637 vmovd xmm3, ecx 3638 vpinsrd xmm4, xmm3, esi, 1 3639 vpinsrd xmm4, xmm4, edx, 2 3640 vpbroadcastd xmm3, xmm3 3641 vpinsrd xmm4, xmm4, edx, 3 3642 vinserti128 ymm3, ymm3, xmm4, 1 3643 vpsrlvd ymm3, ymm3, ymm0 3644 vpand ymm3, ymm3, ymm1 3645 vmovdqu ymmword ptr [r15 - 32], ymm3 3646 mov ecx, dword ptr [rbx - 4] 3647 mov edx, dword ptr [rbx] 3648 mov esi, edx 3649 shld esi, ecx, 4 3650 vmovd xmm3, ecx 3651 vpinsrd xmm3, xmm3, ecx, 1 3652 vpinsrd xmm3, xmm3, esi, 2 3653 vpinsrd xmm3, xmm3, edx, 3 3654 vmovd xmm4, edx 3655 vpbroadcastd xmm4, xmm4 3656 vinserti128 ymm3, ymm3, xmm4, 1 3657 vpsrlvd ymm3, ymm3, ymm2 3658 vpand ymm3, ymm3, ymm1 3659 vmovdqu ymmword ptr [r15], ymm3 3660 sub r15, -128 3661 add rbx, 24 3662 add rax, -1 3663 jne .LBB0_137 3664 jmp .LBB0_147 3665 .LBB0_111: 3666 cmp edx, 32 3667 jl .LBB0_147 3668 # %bb.112: 3669 mov r8d, r14d 3670 add r15, 96 3671 add rbx, 84 3672 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_55] # ymm0 = [0,0,0,2,0,0,4,0] 3673 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_56] # ymm1 = [18014394218708991,18014394218708991,18014394218708991,18014394218708991] 3674 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_57] # ymm2 = [0,6,0,0,8,0,0,10] 3675 .p2align 4, 0x90 3676 .LBB0_113: # =>This Inner Loop Header: Depth=1 3677 mov r10d, dword ptr [rbx - 68] 3678 mov r9d, dword ptr [rbx - 64] 3679 shld r9d, r10d, 6 3680 mov esi, dword ptr [rbx - 72] 3681 mov edi, r10d 3682 shld edi, esi, 18 3683 mov edx, dword ptr [rbx - 76] 3684 shld esi, edx, 8 3685 mov r11d, dword ptr [rbx - 84] 3686 mov ecx, dword ptr [rbx - 80] 3687 mov eax, edx 3688 shld eax, ecx, 20 3689 shld ecx, r11d, 10 3690 vmovd xmm3, r11d 3691 vmovd xmm4, esi 3692 vpinsrd xmm3, xmm3, ecx, 1 3693 vpinsrd xmm4, xmm4, edi, 1 3694 vpinsrd xmm3, xmm3, eax, 2 3695 vpinsrd xmm4, xmm4, r10d, 2 3696 vpinsrd xmm3, xmm3, edx, 3 3697 vpinsrd xmm4, xmm4, r9d, 3 3698 vinserti128 ymm3, ymm3, xmm4, 1 3699 vpsrlvd ymm3, ymm3, ymm0 3700 vpand ymm3, ymm3, ymm1 3701 vmovdqu ymmword ptr [r15 - 96], ymm3 3702 mov r9d, dword ptr [rbx - 44] 3703 mov ecx, dword ptr [rbx - 48] 3704 mov r10d, r9d 3705 shld r10d, ecx, 12 3706 mov esi, dword ptr [rbx - 52] 3707 shld ecx, esi, 2 3708 mov edi, dword ptr [rbx - 56] 3709 vmovd xmm3, esi 3710 shld esi, edi, 14 3711 mov eax, dword ptr [rbx - 64] 3712 mov edx, dword ptr [rbx - 60] 3713 shld edi, edx, 4 3714 shrd eax, edx, 16 3715 vpinsrd xmm3, xmm3, ecx, 1 3716 vmovd xmm4, eax 3717 vpinsrd xmm3, xmm3, r10d, 2 3718 vpinsrd xmm4, xmm4, edx, 1 3719 vpinsrd xmm3, xmm3, r9d, 3 3720 vpinsrd xmm4, xmm4, edi, 2 3721 vpinsrd xmm4, xmm4, esi, 3 3722 vinserti128 ymm3, ymm4, xmm3, 1 3723 vpsrlvd ymm3, ymm3, ymm2 3724 vpand ymm3, ymm3, ymm1 3725 vmovdqu ymmword ptr [r15 - 64], ymm3 3726 mov r10d, dword ptr [rbx - 24] 3727 mov r9d, dword ptr [rbx - 20] 3728 shld r9d, r10d, 6 3729 mov edx, dword ptr [rbx - 28] 3730 mov esi, r10d 3731 shld esi, edx, 18 3732 mov ecx, dword ptr [rbx - 32] 3733 shld edx, ecx, 8 3734 mov r11d, dword ptr [rbx - 40] 3735 mov eax, dword ptr [rbx - 36] 3736 mov edi, ecx 3737 shld edi, eax, 20 3738 shld eax, r11d, 10 3739 vmovd xmm3, r11d 3740 vmovd xmm4, edx 3741 vpinsrd xmm3, xmm3, eax, 1 3742 vpinsrd xmm4, xmm4, esi, 1 3743 vpinsrd xmm3, xmm3, edi, 2 3744 vpinsrd xmm4, xmm4, r10d, 2 3745 vpinsrd xmm3, xmm3, ecx, 3 3746 vpinsrd xmm4, xmm4, r9d, 3 3747 vinserti128 ymm3, ymm3, xmm4, 1 3748 vpsrlvd ymm3, ymm3, ymm0 3749 vpand ymm3, ymm3, ymm1 3750 vmovdqu ymmword ptr [r15 - 32], ymm3 3751 mov r9d, dword ptr [rbx] 3752 mov ecx, dword ptr [rbx - 4] 3753 mov r10d, r9d 3754 shld r10d, ecx, 12 3755 mov esi, dword ptr [rbx - 8] 3756 shld ecx, esi, 2 3757 mov edi, dword ptr [rbx - 12] 3758 vmovd xmm3, esi 3759 shld esi, edi, 14 3760 mov eax, dword ptr [rbx - 20] 3761 mov edx, dword ptr [rbx - 16] 3762 shld edi, edx, 4 3763 shrd eax, edx, 16 3764 vpinsrd xmm3, xmm3, ecx, 1 3765 vmovd xmm4, eax 3766 vpinsrd xmm3, xmm3, r10d, 2 3767 vpinsrd xmm4, xmm4, edx, 1 3768 vpinsrd xmm3, xmm3, r9d, 3 3769 vpinsrd xmm4, xmm4, edi, 2 3770 vpinsrd xmm4, xmm4, esi, 3 3771 vinserti128 ymm3, ymm4, xmm3, 1 3772 vpsrlvd ymm3, ymm3, ymm2 3773 vpand ymm3, ymm3, ymm1 3774 vmovdqu ymmword ptr [r15], ymm3 3775 sub r15, -128 3776 add rbx, 88 3777 add r8, -1 3778 jne .LBB0_113 3779 jmp .LBB0_147 3780 .LBB0_123: 3781 cmp edx, 32 3782 jl .LBB0_147 3783 # %bb.124: 3784 mov r8d, r14d 3785 add r15, 96 3786 add rbx, 52 3787 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_86] # ymm0 = [0,14,0,10,0,6,0,2] 3788 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_87] # ymm1 = [70364449226751,70364449226751,70364449226751,70364449226751] 3789 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_88] # ymm2 = [16,0,12,0,8,0,4,18] 3790 .p2align 4, 0x90 3791 .LBB0_125: # =>This Inner Loop Header: Depth=1 3792 mov r9d, dword ptr [rbx - 40] 3793 mov ecx, dword ptr [rbx - 44] 3794 mov esi, r9d 3795 shld esi, ecx, 12 3796 mov edi, dword ptr [rbx - 52] 3797 mov r10d, dword ptr [rbx - 48] 3798 mov edx, ecx 3799 shld edx, r10d, 8 3800 mov eax, r10d 3801 shld eax, edi, 4 3802 vmovd xmm3, edx 3803 vpinsrd xmm3, xmm3, ecx, 1 3804 vpinsrd xmm3, xmm3, esi, 2 3805 vpinsrd xmm3, xmm3, r9d, 3 3806 vmovd xmm4, edi 3807 vpinsrd xmm4, xmm4, edi, 1 3808 vpinsrd xmm4, xmm4, eax, 2 3809 vpinsrd xmm4, xmm4, r10d, 3 3810 vinserti128 ymm3, ymm4, xmm3, 1 3811 vpsrlvd ymm3, ymm3, ymm0 3812 vpand ymm3, ymm3, ymm1 3813 vmovdqu ymmword ptr [r15 - 96], ymm3 3814 mov eax, dword ptr [rbx - 28] 3815 mov ecx, dword ptr [rbx - 32] 3816 mov edx, eax 3817 shld edx, ecx, 10 3818 mov r9d, dword ptr [rbx - 40] 3819 mov esi, dword ptr [rbx - 36] 3820 vmovd xmm3, ecx 3821 shld ecx, esi, 6 3822 mov edi, esi 3823 shld edi, r9d, 2 3824 vmovd xmm4, r9d 3825 vpinsrd xmm4, xmm4, edi, 1 3826 vpinsrd xmm4, xmm4, esi, 2 3827 vpinsrd xmm4, xmm4, ecx, 3 3828 vpinsrd xmm3, xmm3, edx, 1 3829 vpinsrd xmm3, xmm3, eax, 2 3830 vpinsrd xmm3, xmm3, eax, 3 3831 vinserti128 ymm3, ymm4, xmm3, 1 3832 vpsrlvd ymm3, ymm3, ymm2 3833 vpand ymm3, ymm3, ymm1 3834 vmovdqu ymmword ptr [r15 - 64], ymm3 3835 mov r9d, dword ptr [rbx - 12] 3836 mov eax, dword ptr [rbx - 16] 3837 mov edx, r9d 3838 shld edx, eax, 12 3839 mov esi, dword ptr [rbx - 24] 3840 mov r10d, dword ptr [rbx - 20] 3841 mov ecx, eax 3842 shld ecx, r10d, 8 3843 mov edi, r10d 3844 shld edi, esi, 4 3845 vmovd xmm3, ecx 3846 vpinsrd xmm3, xmm3, eax, 1 3847 vpinsrd xmm3, xmm3, edx, 2 3848 vpinsrd xmm3, xmm3, r9d, 3 3849 vmovd xmm4, esi 3850 vpinsrd xmm4, xmm4, esi, 1 3851 vpinsrd xmm4, xmm4, edi, 2 3852 vpinsrd xmm4, xmm4, r10d, 3 3853 vinserti128 ymm3, ymm4, xmm3, 1 3854 vpsrlvd ymm3, ymm3, ymm0 3855 vpand ymm3, ymm3, ymm1 3856 vmovdqu ymmword ptr [r15 - 32], ymm3 3857 mov r9d, dword ptr [rbx] 3858 mov ecx, dword ptr [rbx - 4] 3859 mov edx, r9d 3860 shld edx, ecx, 10 3861 mov eax, dword ptr [rbx - 8] 3862 vmovd xmm3, ecx 3863 shld ecx, eax, 6 3864 mov edi, dword ptr [rbx - 12] 3865 mov esi, eax 3866 shld esi, edi, 2 3867 vmovd xmm4, edi 3868 vpinsrd xmm4, xmm4, esi, 1 3869 vpinsrd xmm4, xmm4, eax, 2 3870 vpinsrd xmm4, xmm4, ecx, 3 3871 vpinsrd xmm3, xmm3, edx, 1 3872 vpinsrd xmm3, xmm3, r9d, 2 3873 vpinsrd xmm3, xmm3, r9d, 3 3874 vinserti128 ymm3, ymm4, xmm3, 1 3875 vpsrlvd ymm3, ymm3, ymm2 3876 vpand ymm3, ymm3, ymm1 3877 vmovdqu ymmword ptr [r15], ymm3 3878 sub r15, -128 3879 add rbx, 56 3880 add r8, -1 3881 jne .LBB0_125 3882 jmp .LBB0_147 3883 .LBB0_99: 3884 cmp edx, 32 3885 jl .LBB0_147 3886 # %bb.100: 3887 mov r8d, r14d 3888 add r15, 96 3889 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_8] # ymm0 = [4611686015206162431,4611686015206162431,4611686015206162431,4611686015206162431] 3890 add rbx, 116 3891 vmovdqa xmm1, xmmword ptr [rip + .LCPI0_9] # xmm1 = [16,14,12,10] 3892 vmovdqa xmm2, xmmword ptr [rip + .LCPI0_10] # xmm2 = [16,18,20,22] 3893 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_11] # ymm3 = [0,0,0,0,0,0,0,2] 3894 .p2align 4, 0x90 3895 .LBB0_101: # =>This Inner Loop Header: Depth=1 3896 mov r11d, dword ptr [rbx - 92] 3897 mov r9d, dword ptr [rbx - 88] 3898 shld r9d, r11d, 14 3899 mov esi, dword ptr [rbx - 96] 3900 shld r11d, esi, 12 3901 mov edi, dword ptr [rbx - 100] 3902 shld esi, edi, 10 3903 mov eax, dword ptr [rbx - 104] 3904 shld edi, eax, 8 3905 mov edx, dword ptr [rbx - 108] 3906 shld eax, edx, 6 3907 mov r10d, dword ptr [rbx - 116] 3908 mov ecx, dword ptr [rbx - 112] 3909 shld edx, ecx, 4 3910 shld ecx, r10d, 2 3911 vmovd xmm4, r10d 3912 vmovd xmm5, edi 3913 vpinsrd xmm4, xmm4, ecx, 1 3914 vpinsrd xmm5, xmm5, esi, 1 3915 vpinsrd xmm4, xmm4, edx, 2 3916 vpinsrd xmm5, xmm5, r11d, 2 3917 vpinsrd xmm4, xmm4, eax, 3 3918 vpinsrd xmm5, xmm5, r9d, 3 3919 vinserti128 ymm4, ymm4, xmm5, 1 3920 vpand ymm4, ymm4, ymm0 3921 vmovdqu ymmword ptr [r15 - 96], ymm4 3922 mov eax, dword ptr [rbx - 60] 3923 mov ecx, dword ptr [rbx - 64] 3924 mov edx, eax 3925 shld edx, ecx, 28 3926 mov esi, dword ptr [rbx - 68] 3927 mov edi, dword ptr [rbx - 72] 3928 shld ecx, esi, 26 3929 shld esi, edi, 24 3930 vmovdqu xmm4, xmmword ptr [rbx - 88] 3931 vpsrlvd xmm5, xmm4, xmm1 3932 vpshufd xmm4, xmm4, 249 # xmm4 = xmm4[1,2,3,3] 3933 vpinsrd xmm4, xmm4, edi, 3 3934 vmovd xmm6, esi 3935 vpinsrd xmm6, xmm6, ecx, 1 3936 vpinsrd xmm6, xmm6, edx, 2 3937 vpsllvd xmm4, xmm4, xmm2 3938 vpinsrd xmm6, xmm6, eax, 3 3939 vpor xmm4, xmm5, xmm4 3940 vinserti128 ymm4, ymm4, xmm6, 1 3941 vpsrlvd ymm4, ymm4, ymm3 3942 vpand ymm4, ymm4, ymm0 3943 vmovdqu ymmword ptr [r15 - 64], ymm4 3944 mov r11d, dword ptr [rbx - 32] 3945 mov r9d, dword ptr [rbx - 28] 3946 shld r9d, r11d, 14 3947 mov edx, dword ptr [rbx - 36] 3948 shld r11d, edx, 12 3949 mov esi, dword ptr [rbx - 40] 3950 shld edx, esi, 10 3951 mov edi, dword ptr [rbx - 44] 3952 shld esi, edi, 8 3953 mov ecx, dword ptr [rbx - 48] 3954 shld edi, ecx, 6 3955 mov r10d, dword ptr [rbx - 56] 3956 mov eax, dword ptr [rbx - 52] 3957 shld ecx, eax, 4 3958 shld eax, r10d, 2 3959 vmovd xmm4, r10d 3960 vmovd xmm5, esi 3961 vpinsrd xmm4, xmm4, eax, 1 3962 vpinsrd xmm5, xmm5, edx, 1 3963 vpinsrd xmm4, xmm4, ecx, 2 3964 vpinsrd xmm5, xmm5, r11d, 2 3965 vpinsrd xmm4, xmm4, edi, 3 3966 vpinsrd xmm5, xmm5, r9d, 3 3967 vinserti128 ymm4, ymm4, xmm5, 1 3968 vpand ymm4, ymm4, ymm0 3969 vmovdqu ymmword ptr [r15 - 32], ymm4 3970 mov eax, dword ptr [rbx] 3971 mov ecx, dword ptr [rbx - 4] 3972 mov edx, eax 3973 shld edx, ecx, 28 3974 mov esi, dword ptr [rbx - 8] 3975 shld ecx, esi, 26 3976 mov edi, dword ptr [rbx - 12] 3977 vmovdqu xmm4, xmmword ptr [rbx - 28] 3978 shld esi, edi, 24 3979 vpsrlvd xmm5, xmm4, xmm1 3980 vpshufd xmm4, xmm4, 249 # xmm4 = xmm4[1,2,3,3] 3981 vpinsrd xmm4, xmm4, edi, 3 3982 vmovd xmm6, esi 3983 vpinsrd xmm6, xmm6, ecx, 1 3984 vpsllvd xmm4, xmm4, xmm2 3985 vpinsrd xmm6, xmm6, edx, 2 3986 vpinsrd xmm6, xmm6, eax, 3 3987 vpor xmm4, xmm5, xmm4 3988 vinserti128 ymm4, ymm4, xmm6, 1 3989 vpsrlvd ymm4, ymm4, ymm3 3990 vpand ymm4, ymm4, ymm0 3991 vmovdqu ymmword ptr [r15], ymm4 3992 sub r15, -128 3993 add rbx, 120 3994 add r8, -1 3995 jne .LBB0_101 3996 .LBB0_147: 3997 shl r14d, 5 3998 mov eax, r14d 3999 lea rsp, [rbp - 32] 4000 pop rbx 4001 pop r12 4002 pop r14 4003 pop r15 4004 pop rbp 4005 vzeroupper 4006 ret 4007 .Lfunc_end0: 4008 .size unpack32_avx2, .Lfunc_end0-unpack32_avx2 4009 # -- End function 4010 .ident "Debian clang version 11.1.0-++20210428103820+1fdec59bffc1-1~exp1~20210428204437.162" 4011 .section ".note.GNU-stack","",@progbits 4012 .addrsig