github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/utils/_lib/bit_packing_avx2.s (about)

     1  	.text
     2  	.intel_syntax noprefix
     3  	.file	"bit_packing_avx2.c"
     4  	.section	.rodata.cst8,"aM",@progbits,8
     5  	.p2align	3                               # -- Begin function unpack32_avx2
     6  .LCPI0_0:
     7  	.quad	9223372034707292159             # 0x7fffffff7fffffff
     8  .LCPI0_8:
     9  	.quad	4611686015206162431             # 0x3fffffff3fffffff
    10  .LCPI0_12:
    11  	.quad	2305843005455597567             # 0x1fffffff1fffffff
    12  .LCPI0_23:
    13  	.quad	1152921500580315135             # 0xfffffff0fffffff
    14  .LCPI0_25:
    15  	.quad	576460748142673919              # 0x7ffffff07ffffff
    16  .LCPI0_34:
    17  	.quad	288230371923853311              # 0x3ffffff03ffffff
    18  .LCPI0_35:
    19  	.quad	42949672976                     # 0xa00000010
    20  .LCPI0_36:
    21  	.quad	94489280528                     # 0x1600000010
    22  .LCPI0_38:
    23  	.quad	144115183814443007              # 0x1ffffff01ffffff
    24  .LCPI0_49:
    25  	.quad	36028792732385279               # 0x7fffff007fffff
    26  .LCPI0_56:
    27  	.quad	18014394218708991               # 0x3fffff003fffff
    28  .LCPI0_59:
    29  	.quad	9007194961870847                # 0x1fffff001fffff
    30  .LCPI0_66:
    31  	.quad	4503595333451775                # 0xfffff000fffff
    32  .LCPI0_68:
    33  	.quad	2251795519242239                # 0x7ffff0007ffff
    34  .LCPI0_73:
    35  	.quad	1125895612137471                # 0x3ffff0003ffff
    36  .LCPI0_76:
    37  	.quad	562945658585087                 # 0x1ffff0001ffff
    38  .LCPI0_80:
    39  	.quad	68719476736                     # 0x1000000000
    40  .LCPI0_82:
    41  	.quad	140733193420799                 # 0x7fff00007fff
    42  .LCPI0_87:
    43  	.quad	70364449226751                  # 0x3fff00003fff
    44  .LCPI0_90:
    45  	.quad	35180077129727                  # 0x1fff00001fff
    46  .LCPI0_95:
    47  	.quad	17587891081215                  # 0xfff00000fff
    48  .LCPI0_97:
    49  	.quad	8791798056959                   # 0x7ff000007ff
    50  .LCPI0_102:
    51  	.quad	4393751544831                   # 0x3ff000003ff
    52  .LCPI0_105:
    53  	.quad	2194728288767                   # 0x1ff000001ff
    54  .LCPI0_112:
    55  	.quad	545460846719                    # 0x7f0000007f
    56  .LCPI0_117:
    57  	.quad	270582939711                    # 0x3f0000003f
    58  .LCPI0_120:
    59  	.quad	133143986207                    # 0x1f0000001f
    60  .LCPI0_125:
    61  	.quad	64424509455                     # 0xf0000000f
    62  .LCPI0_127:
    63  	.quad	30064771079                     # 0x700000007
    64  .LCPI0_132:
    65  	.quad	12884901891                     # 0x300000003
    66  .LCPI0_135:
    67  	.quad	4294967297                      # 0x100000001
    68  	.section	.rodata.cst32,"aM",@progbits,32
    69  	.p2align	5
    70  .LCPI0_1:
    71  	.long	24                              # 0x18
    72  	.long	23                              # 0x17
    73  	.long	22                              # 0x16
    74  	.long	21                              # 0x15
    75  	.long	20                              # 0x14
    76  	.long	19                              # 0x13
    77  	.long	18                              # 0x12
    78  	.long	17                              # 0x11
    79  .LCPI0_2:
    80  	.long	8                               # 0x8
    81  	.long	9                               # 0x9
    82  	.long	10                              # 0xa
    83  	.long	11                              # 0xb
    84  	.long	12                              # 0xc
    85  	.long	13                              # 0xd
    86  	.long	14                              # 0xe
    87  	.long	15                              # 0xf
    88  .LCPI0_3:
    89  	.long	16                              # 0x10
    90  	.long	15                              # 0xf
    91  	.long	14                              # 0xe
    92  	.long	13                              # 0xd
    93  	.long	12                              # 0xc
    94  	.long	11                              # 0xb
    95  	.long	10                              # 0xa
    96  	.long	9                               # 0x9
    97  .LCPI0_4:
    98  	.long	16                              # 0x10
    99  	.long	17                              # 0x11
   100  	.long	18                              # 0x12
   101  	.long	19                              # 0x13
   102  	.long	20                              # 0x14
   103  	.long	21                              # 0x15
   104  	.long	22                              # 0x16
   105  	.long	23                              # 0x17
   106  .LCPI0_7:
   107  	.long	0                               # 0x0
   108  	.long	0                               # 0x0
   109  	.long	0                               # 0x0
   110  	.long	0                               # 0x0
   111  	.long	0                               # 0x0
   112  	.long	0                               # 0x0
   113  	.long	0                               # 0x0
   114  	.long	1                               # 0x1
   115  .LCPI0_11:
   116  	.long	0                               # 0x0
   117  	.long	0                               # 0x0
   118  	.long	0                               # 0x0
   119  	.long	0                               # 0x0
   120  	.long	0                               # 0x0
   121  	.long	0                               # 0x0
   122  	.long	0                               # 0x0
   123  	.long	2                               # 0x2
   124  .LCPI0_15:
   125  	.long	0                               # 0x0
   126  	.long	0                               # 0x0
   127  	.long	2                               # 0x2
   128  	.long	0                               # 0x0
   129  	.long	0                               # 0x0
   130  	.long	0                               # 0x0
   131  	.long	0                               # 0x0
   132  	.long	0                               # 0x0
   133  .LCPI0_18:
   134  	.long	0                               # 0x0
   135  	.long	0                               # 0x0
   136  	.long	0                               # 0x0
   137  	.long	0                               # 0x0
   138  	.long	0                               # 0x0
   139  	.long	1                               # 0x1
   140  	.long	0                               # 0x0
   141  	.long	0                               # 0x0
   142  .LCPI0_21:
   143  	.long	0                               # 0x0
   144  	.long	0                               # 0x0
   145  	.long	0                               # 0x0
   146  	.long	0                               # 0x0
   147  	.long	0                               # 0x0
   148  	.long	0                               # 0x0
   149  	.long	0                               # 0x0
   150  	.long	3                               # 0x3
   151  .LCPI0_22:
   152  	.long	0                               # 0x0
   153  	.long	0                               # 0x0
   154  	.long	0                               # 0x0
   155  	.long	0                               # 0x0
   156  	.long	0                               # 0x0
   157  	.long	0                               # 0x0
   158  	.long	0                               # 0x0
   159  	.long	4                               # 0x4
   160  .LCPI0_24:
   161  	.long	0                               # 0x0
   162  	.long	0                               # 0x0
   163  	.long	0                               # 0x0
   164  	.long	0                               # 0x0
   165  	.long	0                               # 0x0
   166  	.long	0                               # 0x0
   167  	.long	2                               # 0x2
   168  	.long	0                               # 0x0
   169  .LCPI0_28:
   170  	.long	0                               # 0x0
   171  	.long	0                               # 0x0
   172  	.long	0                               # 0x0
   173  	.long	0                               # 0x0
   174  	.long	4                               # 0x4
   175  	.long	0                               # 0x0
   176  	.long	0                               # 0x0
   177  	.long	0                               # 0x0
   178  .LCPI0_31:
   179  	.long	0                               # 0x0
   180  	.long	0                               # 0x0
   181  	.long	0                               # 0x0
   182  	.long	1                               # 0x1
   183  	.long	0                               # 0x0
   184  	.long	0                               # 0x0
   185  	.long	0                               # 0x0
   186  	.long	0                               # 0x0
   187  .LCPI0_32:
   188  	.long	0                               # 0x0
   189  	.long	3                               # 0x3
   190  	.long	0                               # 0x0
   191  	.long	0                               # 0x0
   192  	.long	0                               # 0x0
   193  	.long	0                               # 0x0
   194  	.long	0                               # 0x0
   195  	.long	5                               # 0x5
   196  .LCPI0_33:
   197  	.long	0                               # 0x0
   198  	.long	0                               # 0x0
   199  	.long	0                               # 0x0
   200  	.long	0                               # 0x0
   201  	.long	0                               # 0x0
   202  	.long	2                               # 0x2
   203  	.long	0                               # 0x0
   204  	.long	0                               # 0x0
   205  .LCPI0_37:
   206  	.long	0                               # 0x0
   207  	.long	0                               # 0x0
   208  	.long	4                               # 0x4
   209  	.long	0                               # 0x0
   210  	.long	0                               # 0x0
   211  	.long	0                               # 0x0
   212  	.long	0                               # 0x0
   213  	.long	6                               # 0x6
   214  .LCPI0_39:
   215  	.long	0                               # 0x0
   216  	.long	1                               # 0x1
   217  	.long	0                               # 0x0
   218  	.long	0                               # 0x0
   219  	.long	0                               # 0x0
   220  	.long	5                               # 0x5
   221  	.long	0                               # 0x0
   222  	.long	0                               # 0x0
   223  .LCPI0_42:
   224  	.long	0                               # 0x0
   225  	.long	0                               # 0x0
   226  	.long	2                               # 0x2
   227  	.long	0                               # 0x0
   228  	.long	0                               # 0x0
   229  	.long	0                               # 0x0
   230  	.long	6                               # 0x6
   231  	.long	0                               # 0x0
   232  .LCPI0_45:
   233  	.long	0                               # 0x0
   234  	.long	0                               # 0x0
   235  	.long	0                               # 0x0
   236  	.long	3                               # 0x3
   237  	.long	0                               # 0x0
   238  	.long	0                               # 0x0
   239  	.long	0                               # 0x0
   240  	.long	7                               # 0x7
   241  .LCPI0_48:
   242  	.long	0                               # 0x0
   243  	.long	0                               # 0x0
   244  	.long	0                               # 0x0
   245  	.long	5                               # 0x5
   246  	.long	0                               # 0x0
   247  	.long	0                               # 0x0
   248  	.long	0                               # 0x0
   249  	.long	1                               # 0x1
   250  .LCPI0_52:
   251  	.long	0                               # 0x0
   252  	.long	0                               # 0x0
   253  	.long	6                               # 0x6
   254  	.long	0                               # 0x0
   255  	.long	0                               # 0x0
   256  	.long	0                               # 0x0
   257  	.long	2                               # 0x2
   258  	.long	0                               # 0x0
   259  .LCPI0_53:
   260  	.long	0                               # 0x0
   261  	.long	7                               # 0x7
   262  	.long	0                               # 0x0
   263  	.long	0                               # 0x0
   264  	.long	0                               # 0x0
   265  	.long	3                               # 0x3
   266  	.long	0                               # 0x0
   267  	.long	0                               # 0x0
   268  .LCPI0_54:
   269  	.long	8                               # 0x8
   270  	.long	0                               # 0x0
   271  	.long	0                               # 0x0
   272  	.long	0                               # 0x0
   273  	.long	4                               # 0x4
   274  	.long	0                               # 0x0
   275  	.long	0                               # 0x0
   276  	.long	9                               # 0x9
   277  .LCPI0_55:
   278  	.long	0                               # 0x0
   279  	.long	0                               # 0x0
   280  	.long	0                               # 0x0
   281  	.long	2                               # 0x2
   282  	.long	0                               # 0x0
   283  	.long	0                               # 0x0
   284  	.long	4                               # 0x4
   285  	.long	0                               # 0x0
   286  .LCPI0_57:
   287  	.long	0                               # 0x0
   288  	.long	6                               # 0x6
   289  	.long	0                               # 0x0
   290  	.long	0                               # 0x0
   291  	.long	8                               # 0x8
   292  	.long	0                               # 0x0
   293  	.long	0                               # 0x0
   294  	.long	10                              # 0xa
   295  .LCPI0_58:
   296  	.long	0                               # 0x0
   297  	.long	0                               # 0x0
   298  	.long	10                              # 0xa
   299  	.long	0                               # 0x0
   300  	.long	0                               # 0x0
   301  	.long	9                               # 0x9
   302  	.long	0                               # 0x0
   303  	.long	0                               # 0x0
   304  .LCPI0_60:
   305  	.long	8                               # 0x8
   306  	.long	0                               # 0x0
   307  	.long	0                               # 0x0
   308  	.long	7                               # 0x7
   309  	.long	0                               # 0x0
   310  	.long	0                               # 0x0
   311  	.long	6                               # 0x6
   312  	.long	0                               # 0x0
   313  .LCPI0_61:
   314  	.long	0                               # 0x0
   315  	.long	5                               # 0x5
   316  	.long	0                               # 0x0
   317  	.long	0                               # 0x0
   318  	.long	4                               # 0x4
   319  	.long	0                               # 0x0
   320  	.long	0                               # 0x0
   321  	.long	3                               # 0x3
   322  .LCPI0_64:
   323  	.long	0                               # 0x0
   324  	.long	0                               # 0x0
   325  	.long	2                               # 0x2
   326  	.long	0                               # 0x0
   327  	.long	0                               # 0x0
   328  	.long	1                               # 0x1
   329  	.long	0                               # 0x0
   330  	.long	11                              # 0xb
   331  .LCPI0_65:
   332  	.long	0                               # 0x0
   333  	.long	0                               # 0x0
   334  	.long	8                               # 0x8
   335  	.long	0                               # 0x0
   336  	.long	0                               # 0x0
   337  	.long	4                               # 0x4
   338  	.long	0                               # 0x0
   339  	.long	12                              # 0xc
   340  .LCPI0_67:
   341  	.long	0                               # 0x0
   342  	.long	0                               # 0x0
   343  	.long	6                               # 0x6
   344  	.long	0                               # 0x0
   345  	.long	12                              # 0xc
   346  	.long	0                               # 0x0
   347  	.long	0                               # 0x0
   348  	.long	5                               # 0x5
   349  .LCPI0_69:
   350  	.long	0                               # 0x0
   351  	.long	11                              # 0xb
   352  	.long	0                               # 0x0
   353  	.long	0                               # 0x0
   354  	.long	4                               # 0x4
   355  	.long	0                               # 0x0
   356  	.long	10                              # 0xa
   357  	.long	0                               # 0x0
   358  .LCPI0_70:
   359  	.long	0                               # 0x0
   360  	.long	3                               # 0x3
   361  	.long	0                               # 0x0
   362  	.long	9                               # 0x9
   363  	.long	0                               # 0x0
   364  	.long	0                               # 0x0
   365  	.long	2                               # 0x2
   366  	.long	0                               # 0x0
   367  .LCPI0_71:
   368  	.long	8                               # 0x8
   369  	.long	0                               # 0x0
   370  	.long	0                               # 0x0
   371  	.long	1                               # 0x1
   372  	.long	0                               # 0x0
   373  	.long	7                               # 0x7
   374  	.long	0                               # 0x0
   375  	.long	13                              # 0xd
   376  .LCPI0_72:
   377  	.long	0                               # 0x0
   378  	.long	0                               # 0x0
   379  	.long	4                               # 0x4
   380  	.long	0                               # 0x0
   381  	.long	8                               # 0x8
   382  	.long	0                               # 0x0
   383  	.long	12                              # 0xc
   384  	.long	0                               # 0x0
   385  .LCPI0_74:
   386  	.long	0                               # 0x0
   387  	.long	2                               # 0x2
   388  	.long	0                               # 0x0
   389  	.long	6                               # 0x6
   390  	.long	0                               # 0x0
   391  	.long	10                              # 0xa
   392  	.long	0                               # 0x0
   393  	.long	14                              # 0xe
   394  .LCPI0_75:
   395  	.long	0                               # 0x0
   396  	.long	0                               # 0x0
   397  	.long	2                               # 0x2
   398  	.long	0                               # 0x0
   399  	.long	4                               # 0x4
   400  	.long	0                               # 0x0
   401  	.long	6                               # 0x6
   402  	.long	0                               # 0x0
   403  .LCPI0_77:
   404  	.long	8                               # 0x8
   405  	.long	0                               # 0x0
   406  	.long	10                              # 0xa
   407  	.long	0                               # 0x0
   408  	.long	12                              # 0xc
   409  	.long	0                               # 0x0
   410  	.long	14                              # 0xe
   411  	.long	0                               # 0x0
   412  .LCPI0_78:
   413  	.long	0                               # 0x0
   414  	.long	1                               # 0x1
   415  	.long	0                               # 0x0
   416  	.long	3                               # 0x3
   417  	.long	0                               # 0x0
   418  	.long	5                               # 0x5
   419  	.long	0                               # 0x0
   420  	.long	7                               # 0x7
   421  .LCPI0_79:
   422  	.long	0                               # 0x0
   423  	.long	9                               # 0x9
   424  	.long	0                               # 0x0
   425  	.long	11                              # 0xb
   426  	.long	0                               # 0x0
   427  	.long	13                              # 0xd
   428  	.long	0                               # 0x0
   429  	.long	15                              # 0xf
   430  .LCPI0_81:
   431  	.long	0                               # 0x0
   432  	.long	15                              # 0xf
   433  	.long	0                               # 0x0
   434  	.long	13                              # 0xd
   435  	.long	0                               # 0x0
   436  	.long	11                              # 0xb
   437  	.long	0                               # 0x0
   438  	.long	9                               # 0x9
   439  .LCPI0_83:
   440  	.long	0                               # 0x0
   441  	.long	7                               # 0x7
   442  	.long	0                               # 0x0
   443  	.long	5                               # 0x5
   444  	.long	0                               # 0x0
   445  	.long	3                               # 0x3
   446  	.long	0                               # 0x0
   447  	.long	1                               # 0x1
   448  .LCPI0_84:
   449  	.long	16                              # 0x10
   450  	.long	0                               # 0x0
   451  	.long	14                              # 0xe
   452  	.long	0                               # 0x0
   453  	.long	12                              # 0xc
   454  	.long	0                               # 0x0
   455  	.long	10                              # 0xa
   456  	.long	0                               # 0x0
   457  .LCPI0_85:
   458  	.long	8                               # 0x8
   459  	.long	0                               # 0x0
   460  	.long	6                               # 0x6
   461  	.long	0                               # 0x0
   462  	.long	4                               # 0x4
   463  	.long	0                               # 0x0
   464  	.long	2                               # 0x2
   465  	.long	17                              # 0x11
   466  .LCPI0_86:
   467  	.long	0                               # 0x0
   468  	.long	14                              # 0xe
   469  	.long	0                               # 0x0
   470  	.long	10                              # 0xa
   471  	.long	0                               # 0x0
   472  	.long	6                               # 0x6
   473  	.long	0                               # 0x0
   474  	.long	2                               # 0x2
   475  .LCPI0_88:
   476  	.long	16                              # 0x10
   477  	.long	0                               # 0x0
   478  	.long	12                              # 0xc
   479  	.long	0                               # 0x0
   480  	.long	8                               # 0x8
   481  	.long	0                               # 0x0
   482  	.long	4                               # 0x4
   483  	.long	18                              # 0x12
   484  .LCPI0_89:
   485  	.long	0                               # 0x0
   486  	.long	13                              # 0xd
   487  	.long	0                               # 0x0
   488  	.long	7                               # 0x7
   489  	.long	0                               # 0x0
   490  	.long	1                               # 0x1
   491  	.long	14                              # 0xe
   492  	.long	0                               # 0x0
   493  .LCPI0_91:
   494  	.long	8                               # 0x8
   495  	.long	0                               # 0x0
   496  	.long	2                               # 0x2
   497  	.long	15                              # 0xf
   498  	.long	0                               # 0x0
   499  	.long	9                               # 0x9
   500  	.long	0                               # 0x0
   501  	.long	3                               # 0x3
   502  .LCPI0_92:
   503  	.long	16                              # 0x10
   504  	.long	0                               # 0x0
   505  	.long	10                              # 0xa
   506  	.long	0                               # 0x0
   507  	.long	4                               # 0x4
   508  	.long	17                              # 0x11
   509  	.long	0                               # 0x0
   510  	.long	11                              # 0xb
   511  .LCPI0_93:
   512  	.long	0                               # 0x0
   513  	.long	5                               # 0x5
   514  	.long	18                              # 0x12
   515  	.long	0                               # 0x0
   516  	.long	12                              # 0xc
   517  	.long	0                               # 0x0
   518  	.long	6                               # 0x6
   519  	.long	19                              # 0x13
   520  .LCPI0_94:
   521  	.long	0                               # 0x0
   522  	.long	12                              # 0xc
   523  	.long	0                               # 0x0
   524  	.long	4                               # 0x4
   525  	.long	16                              # 0x10
   526  	.long	0                               # 0x0
   527  	.long	8                               # 0x8
   528  	.long	20                              # 0x14
   529  .LCPI0_96:
   530  	.long	0                               # 0x0
   531  	.long	11                              # 0xb
   532  	.long	0                               # 0x0
   533  	.long	1                               # 0x1
   534  	.long	12                              # 0xc
   535  	.long	0                               # 0x0
   536  	.long	2                               # 0x2
   537  	.long	13                              # 0xd
   538  .LCPI0_98:
   539  	.long	0                               # 0x0
   540  	.long	3                               # 0x3
   541  	.long	14                              # 0xe
   542  	.long	0                               # 0x0
   543  	.long	4                               # 0x4
   544  	.long	15                              # 0xf
   545  	.long	0                               # 0x0
   546  	.long	5                               # 0x5
   547  .LCPI0_99:
   548  	.long	16                              # 0x10
   549  	.long	0                               # 0x0
   550  	.long	6                               # 0x6
   551  	.long	17                              # 0x11
   552  	.long	0                               # 0x0
   553  	.long	7                               # 0x7
   554  	.long	18                              # 0x12
   555  	.long	0                               # 0x0
   556  .LCPI0_100:
   557  	.long	8                               # 0x8
   558  	.long	19                              # 0x13
   559  	.long	0                               # 0x0
   560  	.long	9                               # 0x9
   561  	.long	20                              # 0x14
   562  	.long	0                               # 0x0
   563  	.long	10                              # 0xa
   564  	.long	21                              # 0x15
   565  .LCPI0_101:
   566  	.long	0                               # 0x0
   567  	.long	10                              # 0xa
   568  	.long	20                              # 0x14
   569  	.long	0                               # 0x0
   570  	.long	8                               # 0x8
   571  	.long	18                              # 0x12
   572  	.long	0                               # 0x0
   573  	.long	6                               # 0x6
   574  .LCPI0_103:
   575  	.long	16                              # 0x10
   576  	.long	0                               # 0x0
   577  	.long	4                               # 0x4
   578  	.long	14                              # 0xe
   579  	.long	0                               # 0x0
   580  	.long	2                               # 0x2
   581  	.long	12                              # 0xc
   582  	.long	22                              # 0x16
   583  .LCPI0_104:
   584  	.long	0                               # 0x0
   585  	.long	9                               # 0x9
   586  	.long	18                              # 0x12
   587  	.long	0                               # 0x0
   588  	.long	4                               # 0x4
   589  	.long	13                              # 0xd
   590  	.long	22                              # 0x16
   591  	.long	0                               # 0x0
   592  .LCPI0_106:
   593  	.long	8                               # 0x8
   594  	.long	17                              # 0x11
   595  	.long	0                               # 0x0
   596  	.long	3                               # 0x3
   597  	.long	12                              # 0xc
   598  	.long	21                              # 0x15
   599  	.long	0                               # 0x0
   600  	.long	7                               # 0x7
   601  .LCPI0_107:
   602  	.long	16                              # 0x10
   603  	.long	0                               # 0x0
   604  	.long	2                               # 0x2
   605  	.long	11                              # 0xb
   606  	.long	20                              # 0x14
   607  	.long	0                               # 0x0
   608  	.long	6                               # 0x6
   609  	.long	15                              # 0xf
   610  .LCPI0_108:
   611  	.long	0                               # 0x0
   612  	.long	1                               # 0x1
   613  	.long	10                              # 0xa
   614  	.long	19                              # 0x13
   615  	.long	0                               # 0x0
   616  	.long	5                               # 0x5
   617  	.long	14                              # 0xe
   618  	.long	23                              # 0x17
   619  .LCPI0_111:
   620  	.long	0                               # 0x0
   621  	.long	7                               # 0x7
   622  	.long	14                              # 0xe
   623  	.long	21                              # 0x15
   624  	.long	0                               # 0x0
   625  	.long	3                               # 0x3
   626  	.long	10                              # 0xa
   627  	.long	17                              # 0x11
   628  .LCPI0_113:
   629  	.long	24                              # 0x18
   630  	.long	0                               # 0x0
   631  	.long	6                               # 0x6
   632  	.long	13                              # 0xd
   633  	.long	20                              # 0x14
   634  	.long	0                               # 0x0
   635  	.long	2                               # 0x2
   636  	.long	9                               # 0x9
   637  .LCPI0_114:
   638  	.long	16                              # 0x10
   639  	.long	23                              # 0x17
   640  	.long	0                               # 0x0
   641  	.long	5                               # 0x5
   642  	.long	12                              # 0xc
   643  	.long	19                              # 0x13
   644  	.long	0                               # 0x0
   645  	.long	1                               # 0x1
   646  .LCPI0_115:
   647  	.long	8                               # 0x8
   648  	.long	15                              # 0xf
   649  	.long	22                              # 0x16
   650  	.long	0                               # 0x0
   651  	.long	4                               # 0x4
   652  	.long	11                              # 0xb
   653  	.long	18                              # 0x12
   654  	.long	25                              # 0x19
   655  .LCPI0_116:                             # eight dwords stepping by 6 modulo 32. NOTE(review): presumably per-lane vpsrlvd counts for a 6-bit case; the consumer is outside this excerpt - confirm
   656  	.long	0                               # 0x0
   657  	.long	6                               # 0x6
   658  	.long	12                              # 0xc
   659  	.long	18                              # 0x12
   660  	.long	24                              # 0x18
   661  	.long	0                               # 0x0 (the 6-step sequence would be at 30 here, i.e. crossing a 32-bit boundary)
   662  	.long	4                               # 0x4
   663  	.long	10                              # 0xa
   664  .LCPI0_118:                             # continues the 6-step sequence of .LCPI0_116 (10 + 6 = 16); last entry 26 = 32 - 6
   665  	.long	16                              # 0x10
   666  	.long	22                              # 0x16
   667  	.long	0                               # 0x0
   668  	.long	2                               # 0x2
   669  	.long	8                               # 0x8
   670  	.long	14                              # 0xe
   671  	.long	20                              # 0x14
   672  	.long	26                              # 0x1a
   673  .LCPI0_119:                             # width-5 path: loaded into ymm1 and used as vpsrlvd lane counts for outputs 0-7
   674  	.long	0                               # 0x0
   675  	.long	5                               # 0x5
   676  	.long	10                              # 0xa
   677  	.long	15                              # 0xf
   678  	.long	20                              # 0x14
   679  	.long	25                              # 0x19
   680  	.long	0                               # 0x0 (this lane is fed a value already aligned to bit 0 by shld)
   681  	.long	3                               # 0x3
   682  .LCPI0_121:                             # width-5 path: loaded into ymm2, vpsrlvd lane counts for outputs 8-15
   683  	.long	8                               # 0x8
   684  	.long	13                              # 0xd
   685  	.long	18                              # 0x12
   686  	.long	23                              # 0x17
   687  	.long	0                               # 0x0 (lane pre-aligned by shld)
   688  	.long	1                               # 0x1
   689  	.long	6                               # 0x6
   690  	.long	11                              # 0xb
   691  .LCPI0_122:                             # width-5 path: loaded into ymm3, vpsrlvd lane counts for outputs 16-23
   692  	.long	16                              # 0x10
   693  	.long	21                              # 0x15
   694  	.long	26                              # 0x1a
   695  	.long	0                               # 0x0 (lane pre-aligned by shld)
   696  	.long	4                               # 0x4
   697  	.long	9                               # 0x9
   698  	.long	14                              # 0xe
   699  	.long	19                              # 0x13
   700  .LCPI0_123:                             # width-5 path: loaded into ymm4, vpsrlvd lane counts for outputs 24-31
   701  	.long	24                              # 0x18
   702  	.long	0                               # 0x0 (lane pre-aligned by shld)
   703  	.long	2                               # 0x2
   704  	.long	7                               # 0x7
   705  	.long	12                              # 0xc
   706  	.long	17                              # 0x11
   707  	.long	22                              # 0x16
   708  	.long	27                              # 0x1b
   709  .LCPI0_124:                             # eight dwords 0,4,...,28. NOTE(review): presumably per-lane vpsrlvd counts for a 4-bit case; the consumer is outside this excerpt - confirm
   710  	.long	0                               # 0x0
   711  	.long	4                               # 0x4
   712  	.long	8                               # 0x8
   713  	.long	12                              # 0xc
   714  	.long	16                              # 0x10
   715  	.long	20                              # 0x14
   716  	.long	24                              # 0x18
   717  	.long	28                              # 0x1c
   718  .LCPI0_126:                             # eight dwords stepping by 3. NOTE(review): presumably per-lane vpsrlvd counts for a 3-bit case; the consumer is outside this excerpt - confirm
   719  	.long	0                               # 0x0
   720  	.long	3                               # 0x3
   721  	.long	6                               # 0x6
   722  	.long	9                               # 0x9
   723  	.long	12                              # 0xc
   724  	.long	15                              # 0xf
   725  	.long	18                              # 0x12
   726  	.long	21                              # 0x15
   727  .LCPI0_128:                             # continues the 3-step sequence of .LCPI0_126 (21 + 3 = 24), modulo 32
   728  	.long	24                              # 0x18
   729  	.long	27                              # 0x1b
   730  	.long	0                               # 0x0 (the 3-step sequence would be at 30 here, i.e. crossing a 32-bit boundary)
   731  	.long	1                               # 0x1
   732  	.long	4                               # 0x4
   733  	.long	7                               # 0x7
   734  	.long	10                              # 0xa
   735  	.long	13                              # 0xd
   736  .LCPI0_129:                             # continues the 3-step sequence of .LCPI0_128 (13 + 3 = 16), modulo 32
   737  	.long	16                              # 0x10
   738  	.long	19                              # 0x13
   739  	.long	22                              # 0x16
   740  	.long	25                              # 0x19
   741  	.long	28                              # 0x1c
   742  	.long	0                               # 0x0
   743  	.long	2                               # 0x2
   744  	.long	5                               # 0x5
   745  .LCPI0_130:                             # continues the 3-step sequence of .LCPI0_129 (5 + 3 = 8); last entry 29 = 32 - 3
   746  	.long	8                               # 0x8
   747  	.long	11                              # 0xb
   748  	.long	14                              # 0xe
   749  	.long	17                              # 0x11
   750  	.long	20                              # 0x14
   751  	.long	23                              # 0x17
   752  	.long	26                              # 0x1a
   753  	.long	29                              # 0x1d
   754  .LCPI0_131:                             # eight dwords 0,2,...,14. NOTE(review): presumably per-lane vpsrlvd counts for a 2-bit case; the consumer is outside this excerpt - confirm
   755  	.long	0                               # 0x0
   756  	.long	2                               # 0x2
   757  	.long	4                               # 0x4
   758  	.long	6                               # 0x6
   759  	.long	8                               # 0x8
   760  	.long	10                              # 0xa
   761  	.long	12                              # 0xc
   762  	.long	14                              # 0xe
   763  .LCPI0_133:                             # continues the 2-step sequence of .LCPI0_131: 16,18,...,30
   764  	.long	16                              # 0x10
   765  	.long	18                              # 0x12
   766  	.long	20                              # 0x14
   767  	.long	22                              # 0x16
   768  	.long	24                              # 0x18
   769  	.long	26                              # 0x1a
   770  	.long	28                              # 0x1c
   771  	.long	30                              # 0x1e
   772  .LCPI0_134:                             # width-1 path: loaded into ymm1 and used as vpsrlvd lane counts for outputs 0-7
   773  	.long	0                               # 0x0
   774  	.long	1                               # 0x1
   775  	.long	2                               # 0x2
   776  	.long	3                               # 0x3
   777  	.long	4                               # 0x4
   778  	.long	5                               # 0x5
   779  	.long	6                               # 0x6
   780  	.long	7                               # 0x7
   781  .LCPI0_136:                             # width-1 path: loaded into ymm4, vpsrlvd lane counts for outputs 24-31 (counts 8-23 come from .LCPI0_2 / .LCPI0_4, defined outside this excerpt)
   782  	.long	24                              # 0x18
   783  	.long	25                              # 0x19
   784  	.long	26                              # 0x1a
   785  	.long	27                              # 0x1b
   786  	.long	28                              # 0x1c
   787  	.long	29                              # 0x1d
   788  	.long	30                              # 0x1e
   789  	.long	31                              # 0x1f
   790  	.section	.rodata.cst16,"aM",@progbits,16
   791  	.p2align	4
   792  .LCPI0_5:                               # four dwords; lane-wise, .LCPI0_5 + .LCPI0_6 = 32. NOTE(review): presumably a right-shift/left-shift count pair; the consumer is outside this excerpt - confirm
   793  	.long	8                               # 0x8
   794  	.long	7                               # 0x7
   795  	.long	6                               # 0x6
   796  	.long	5                               # 0x5
   797  .LCPI0_6:                               # 32 minus the corresponding lane of .LCPI0_5
   798  	.long	24                              # 0x18
   799  	.long	25                              # 0x19
   800  	.long	26                              # 0x1a
   801  	.long	27                              # 0x1b
   802  .LCPI0_9:                               # four dwords; lane-wise, .LCPI0_9 + .LCPI0_10 = 32. NOTE(review): presumably a right-shift/left-shift count pair; the consumer is outside this excerpt - confirm
   803  	.long	16                              # 0x10
   804  	.long	14                              # 0xe
   805  	.long	12                              # 0xc
   806  	.long	10                              # 0xa
   807  .LCPI0_10:                              # 32 minus the corresponding lane of .LCPI0_9
   808  	.long	16                              # 0x10
   809  	.long	18                              # 0x12
   810  	.long	20                              # 0x14
   811  	.long	22                              # 0x16
   812  .LCPI0_13:                              # two meaningful dwords plus 8 bytes of zero fill; lanes 0-1 of .LCPI0_13 + .LCPI0_14 = 32. NOTE(review): presumably a right-shift/left-shift count pair; the consumer is outside this excerpt - confirm
   813  	.long	8                               # 0x8
   814  	.long	5                               # 0x5
   815  	.zero	4
   816  	.zero	4
   817  .LCPI0_14:                              # 32 minus the corresponding lane of .LCPI0_13 (lanes 0-1)
   818  	.long	24                              # 0x18
   819  	.long	27                              # 0x1b
   820  	.zero	4
   821  	.zero	4
   822  .LCPI0_16:                              # four dwords; lane-wise, .LCPI0_16 + .LCPI0_17 = 32. NOTE(review): presumably a right-shift/left-shift count pair; the consumer is outside this excerpt - confirm
   823  	.long	16                              # 0x10
   824  	.long	13                              # 0xd
   825  	.long	10                              # 0xa
   826  	.long	7                               # 0x7
   827  .LCPI0_17:                              # 32 minus the corresponding lane of .LCPI0_16
   828  	.long	16                              # 0x10
   829  	.long	19                              # 0x13
   830  	.long	22                              # 0x16
   831  	.long	25                              # 0x19
   832  .LCPI0_19:                              # four dwords; lane-wise, .LCPI0_19 + .LCPI0_20 = 32. NOTE(review): presumably a right-shift/left-shift count pair; the consumer is outside this excerpt - confirm
   833  	.long	24                              # 0x18
   834  	.long	21                              # 0x15
   835  	.long	18                              # 0x12
   836  	.long	15                              # 0xf
   837  .LCPI0_20:                              # 32 minus the corresponding lane of .LCPI0_19
   838  	.long	8                               # 0x8
   839  	.long	11                              # 0xb
   840  	.long	14                              # 0xe
   841  	.long	17                              # 0x11
   842  .LCPI0_26:                              # four dwords; lane-wise, .LCPI0_26 + .LCPI0_27 = 32. NOTE(review): presumably a right-shift/left-shift count pair; the consumer is outside this excerpt - confirm
   843  	.long	24                              # 0x18
   844  	.long	19                              # 0x13
   845  	.long	14                              # 0xe
   846  	.long	9                               # 0x9
   847  .LCPI0_27:                              # 32 minus the corresponding lane of .LCPI0_26
   848  	.long	8                               # 0x8
   849  	.long	13                              # 0xd
   850  	.long	18                              # 0x12
   851  	.long	23                              # 0x17
   852  .LCPI0_29:                              # two meaningful dwords plus 8 bytes of zero fill; lanes 0-1 of .LCPI0_29 + .LCPI0_30 = 32. NOTE(review): presumably a right-shift/left-shift count pair; the consumer is outside this excerpt - confirm
   853  	.long	16                              # 0x10
   854  	.long	11                              # 0xb
   855  	.zero	4
   856  	.zero	4
   857  .LCPI0_30:                              # 32 minus the corresponding lane of .LCPI0_29 (lanes 0-1)
   858  	.long	16                              # 0x10
   859  	.long	21                              # 0x15
   860  	.zero	4
   861  	.zero	4
   862  .LCPI0_40:                              # width-25 path: loaded into xmm11 and used as vpsrlvd counts on two adjacent input words (lanes 0-1; lanes 2-3 are zero fill)
   863  	.long	16                              # 0x10
   864  	.long	9                               # 0x9
   865  	.zero	4
   866  	.zero	4
   867  .LCPI0_41:                              # width-25 path: loaded into xmm4, the matching vpsllvd counts (32 - 16, 32 - 9); the two shifted vectors are OR-ed together
   868  	.long	16                              # 0x10
   869  	.long	23                              # 0x17
   870  	.zero	4
   871  	.zero	4
   872  .LCPI0_43:                              # width-25 path: loaded into xmm6 and used as vpsrlvd counts on two adjacent input words (lanes 0-1; lanes 2-3 are zero fill)
   873  	.long	24                              # 0x18
   874  	.long	17                              # 0x11
   875  	.zero	4
   876  	.zero	4
   877  .LCPI0_44:                              # width-25 path: loaded into xmm7, the matching vpsllvd counts (32 - 24, 32 - 17); the two shifted vectors are OR-ed together
   878  	.long	8                               # 0x8
   879  	.long	15                              # 0xf
   880  	.zero	4
   881  	.zero	4
   882  .LCPI0_46:                              # four dwords [0,0,0,8]. NOTE(review): consumer is outside this excerpt; presumably per-lane shift counts - confirm
   883  	.long	0                               # 0x0
   884  	.long	0                               # 0x0
   885  	.long	0                               # 0x0
   886  	.long	8                               # 0x8
   887  .LCPI0_50:                              # two meaningful dwords plus 8 bytes of zero fill; lanes 0-1 of .LCPI0_50 + .LCPI0_51 = 32. NOTE(review): presumably a right-shift/left-shift count pair; the consumer is outside this excerpt - confirm
   888  	.long	24                              # 0x18
   889  	.long	15                              # 0xf
   890  	.zero	4
   891  	.zero	4
   892  .LCPI0_51:                              # 32 minus the corresponding lane of .LCPI0_50 (lanes 0-1)
   893  	.long	8                               # 0x8
   894  	.long	17                              # 0x11
   895  	.zero	4
   896  	.zero	4
   897  .LCPI0_62:                              # width-21 path: loaded into xmm4 before the loop. NOTE(review): the instruction that uses it is outside this excerpt; presumably vpsrlvd counts paired with .LCPI0_63 - confirm
   898  	.long	24                              # 0x18
   899  	.long	13                              # 0xd
   900  	.zero	4
   901  	.zero	4
   902  .LCPI0_63:                              # width-21 path: loaded into xmm5; lanes 0-1 are 32 minus the lanes of .LCPI0_62
   903  	.long	8                               # 0x8
   904  	.long	19                              # 0x13
   905  	.zero	4
   906  	.zero	4
   907  .LCPI0_109:                             # four dwords 0,8,16,24 (byte positions within a dword). NOTE(review): presumably per-lane shift counts for an 8-bit case; the consumer is outside this excerpt - confirm
   908  	.long	0                               # 0x0
   909  	.long	8                               # 0x8
   910  	.long	16                              # 0x10
   911  	.long	24                              # 0x18
   912  	.section	.rodata.cst4,"aM",@progbits,4
   913  	.p2align	2
   914  .LCPI0_47:                              # single dword with the low 24 bits set. NOTE(review): presumably a broadcast mask; the consumer is outside this excerpt - confirm
   915  	.long	16777215                        # 0xffffff
   916  .LCPI0_110:                             # single dword with the low 8 bits set. NOTE(review): presumably a broadcast mask; the consumer is outside this excerpt - confirm
   917  	.long	255                             # 0xff
   918  	.text
   919  	.globl	unpack32_avx2
   920  	.p2align	4, 0x90
   921  	.type	unpack32_avx2,@function
   922  unpack32_avx2:                          # @unpack32_avx2
   923  # %bb.0:                                # entry. Roles as used below: rdi = load base, rsi = store base, edx = value count, ecx = bit width selecting the code path
   924  	push	rbp
   925  	mov	rbp, rsp
   926  	push	r15                             # r15, r14, r12 and rbx are overwritten below, so they are saved here
   927  	push	r14
   928  	push	r12
   929  	push	rbx
   930  	and	rsp, -16                        # round rsp down to a 16-byte boundary
   931                                          # kill: def $edx killed $edx def $rdx
   932  	mov	r15, rsi                        # r15 = store pointer used by the paths below
   933  	mov	rbx, rdi                        # rbx = load pointer used by the paths below
   934  	lea	r14d, [rdx + 31]                # r14d = edx / 32 truncated toward zero:
   935  	test	edx, edx                        #   edx + 31 is kept only when edx is negative,
   936  	cmovns	r14d, edx                       #   otherwise edx itself is used,
   937  	sar	r14d, 5                         #   then an arithmetic shift right by 5
   938  	cmp	ecx, 15                         # compare tree on the bit width in ecx
   939  	jle	.LBB0_1                         # ecx <= 15 (signed)
   940  # %bb.48:
   941  	cmp	ecx, 23
   942  	jle	.LBB0_49                        # 16..23
   943  # %bb.72:
   944  	cmp	ecx, 27
   945  	jle	.LBB0_73                        # 24..27
   946  # %bb.84:
   947  	cmp	ecx, 29
   948  	jle	.LBB0_85                        # 28..29
   949  # %bb.90:
   950  	cmp	ecx, 30
   951  	je	.LBB0_99                        # 30
   952  # %bb.91:
   953  	cmp	ecx, 31
   954  	je	.LBB0_96                        # 31
   955  # %bb.92:
   956  	cmp	ecx, 32
   957  	jne	.LBB0_147                       # ecx > 32: no handler. NOTE(review): .LBB0_147 is outside this excerpt; presumably the common return path
   958  # %bb.93:
   959  	cmp	edx, 32
   960  	jl	.LBB0_147                       # fewer than 32 values: skip the loop
   961  # %bb.94:
   962  	mov	r12d, r14d                      # r12 = iteration counter (one iteration per 32 values)
   963  	.p2align	4, 0x90
   964  .LBB0_95:                               # =>This Inner Loop Header: Depth=1
   965  	mov	edx, 128                        # width 32: 128 bytes are copied per iteration
   966  	mov	rdi, r15                        # first call argument = store pointer
   967  	mov	rsi, rbx                        # second call argument = load pointer
   968  	call	clib·_memcpy(SB)
   969  	sub	rbx, -128                       # rbx += 128
   970  	sub	r15, -128                       # r15 += 128
   971  	add	r12, -1
   972  	jne	.LBB0_95
   973  	jmp	.LBB0_147
   974  .LBB0_1:                                # reached with ecx <= 15 (signed)
   975  	cmp	ecx, 7
   976  	jg	.LBB0_25                        # 8..15
   977  # %bb.2:
   978  	cmp	ecx, 3
   979  	jg	.LBB0_14                        # 4..7
   980  # %bb.3:
   981  	cmp	ecx, 1
   982  	jg	.LBB0_9                         # 2..3
   983  # %bb.4:
   984  	test	ecx, ecx
   985  	je	.LBB0_144                       # 0
   986  # %bb.5:
   987  	cmp	ecx, 1
   988  	jne	.LBB0_147                       # only negative values remain here: no handler
   989  # %bb.6:
   990  	cmp	edx, 32
   991  	jl	.LBB0_147                       # fewer than 32 values: skip the loop
   992  # %bb.7:
   993  	mov	eax, r14d                       # rax = number of 32-value groups
   994  	add	r15, 96                         # bias r15 so the four stores use offsets -96, -64, -32, 0
   995  	xor	ecx, ecx                        # rcx = index of the current input dword
   996  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_135] # ymm0 = [4294967297,4294967297,4294967297,4294967297]
   997  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_134] # ymm1 = [0,1,2,3,4,5,6,7]
   998  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_2] # ymm2 = [8,9,10,11,12,13,14,15]
   999  	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_4] # ymm3 = [16,17,18,19,20,21,22,23]
  1000  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_136] # ymm4 = [24,25,26,27,28,29,30,31]
  1001  	.p2align	4, 0x90
  1002  .LBB0_8:                                # =>This Inner Loop Header: Depth=1
  1003  	vpbroadcastd	ymm5, dword ptr [rbx + 4*rcx]   # replicate one input dword into all 8 lanes
  1004  	vpsrlvd	ymm5, ymm5, ymm1                # lane i is shifted right by i, so bit i lands at bit 0
  1005  	vpand	ymm5, ymm5, ymm0                # mask with ymm0 (4294967297 = 0x100000001 per qword, i.e. 1 per dword)
  1006  	vmovdqu	ymmword ptr [r15 - 96], ymm5    # outputs 0..7 of this group
  1007  	vpbroadcastd	ymm5, dword ptr [rbx + 4*rcx]
  1008  	vpsrlvd	ymm5, ymm5, ymm2
  1009  	vpand	ymm5, ymm5, ymm0
  1010  	vmovdqu	ymmword ptr [r15 - 64], ymm5    # outputs 8..15
  1011  	vpbroadcastd	ymm5, dword ptr [rbx + 4*rcx]
  1012  	vpsrlvd	ymm5, ymm5, ymm3
  1013  	vpand	ymm5, ymm5, ymm0
  1014  	vmovdqu	ymmword ptr [r15 - 32], ymm5    # outputs 16..23
  1015  	vpbroadcastd	ymm5, dword ptr [rbx + 4*rcx]
  1016  	vpsrlvd	ymm5, ymm5, ymm4
  1017  	vpand	ymm5, ymm5, ymm0
  1018  	vmovdqu	ymmword ptr [r15], ymm5         # outputs 24..31
  1019  	add	rcx, 1                          # next input dword
  1020  	sub	r15, -128                       # r15 += 128 (32 output dwords)
  1021  	cmp	rax, rcx
  1022  	jne	.LBB0_8
  1023  	jmp	.LBB0_147
  1024  .LBB0_49:                               # reached with 16 <= ecx <= 23
  1025  	cmp	ecx, 19
  1026  	jg	.LBB0_61                        # 20..23
  1027  # %bb.50:
  1028  	cmp	ecx, 17
  1029  	jg	.LBB0_56                        # 18..19
  1030  # %bb.51:
  1031  	cmp	ecx, 16
  1032  	je	.LBB0_120                       # 16
  1033  # %bb.52:
  1034  	cmp	ecx, 17
  1035  	jne	.LBB0_147
  1036  # %bb.53:
  1037  	cmp	edx, 32
  1038  	jl	.LBB0_147                       # fewer than 32 values: skip the loop
  1039  # %bb.54:
  1040  	mov	r8d, r14d                       # r8 = number of 32-value groups
  1041  	add	r15, 96                         # bias: stores use r15 - 96 .. r15
  1042  	add	rbx, 64                         # bias: loads use rbx - 64 .. rbx (17 dwords per group)
  1043  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_76] # ymm0 = [562945658585087,562945658585087,562945658585087,562945658585087]
  1044  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_75] # ymm1 = [0,0,2,0,4,0,6,0]
  1045  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_77] # ymm2 = [8,0,10,0,12,0,14,0]
  1046  	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_78] # ymm3 = [0,1,0,3,0,5,0,7]
  1047  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_79] # ymm4 = [0,9,0,11,0,13,0,15]
  1048  	.p2align	4, 0x90
  1049  .LBB0_55:                               # =>This Inner Loop Header: Depth=1
  1050  	mov	ecx, dword ptr [rbx - 52]
  1051  	mov	r10d, dword ptr [rbx - 48]
  1052  	shld	r10d, ecx, 9                    # shld: (dst << n) | (src >> (32 - n)); aligns a field that spans two input dwords to bit 0
  1053  	mov	esi, dword ptr [rbx - 56]
  1054  	mov	edi, ecx
  1055  	shld	edi, esi, 11
  1056  	mov	r9d, dword ptr [rbx - 64]
  1057  	mov	edx, dword ptr [rbx - 60]
  1058  	mov	eax, edx
  1059  	shld	eax, r9d, 15
  1060  	vmovd	xmm5, esi                       # captures esi before the shld below modifies it
  1061  	shld	esi, edx, 13
  1062  	vpinsrd	xmm5, xmm5, edi, 1
  1063  	vpinsrd	xmm5, xmm5, ecx, 2
  1064  	vpinsrd	xmm5, xmm5, r10d, 3
  1065  	vmovd	xmm6, r9d
  1066  	vpinsrd	xmm6, xmm6, eax, 1
  1067  	vpinsrd	xmm6, xmm6, edx, 2
  1068  	vpinsrd	xmm6, xmm6, esi, 3
  1069  	vinserti128	ymm5, ymm6, xmm5, 1     # ymm5 = xmm6 (low half) : xmm5 (high half)
  1070  	vpsrlvd	ymm5, ymm5, ymm1                # per-lane right shift; the shld-aligned lanes have count 0
  1071  	vpand	ymm5, ymm5, ymm0                # keep the low 17 bits of each lane (mask 0x1ffff per dword)
  1072  	vmovdqu	ymmword ptr [r15 - 96], ymm5    # outputs 0..7 of this group
  1073  	mov	eax, dword ptr [rbx - 36]
  1074  	mov	r10d, dword ptr [rbx - 32]
  1075  	shld	r10d, eax, 1
  1076  	mov	edx, dword ptr [rbx - 40]
  1077  	mov	esi, eax
  1078  	shld	esi, edx, 3
  1079  	mov	r9d, dword ptr [rbx - 48]
  1080  	mov	ecx, dword ptr [rbx - 44]
  1081  	mov	edi, ecx
  1082  	shld	edi, r9d, 7
  1083  	vmovd	xmm5, edx
  1084  	shld	edx, ecx, 5
  1085  	vpinsrd	xmm5, xmm5, esi, 1
  1086  	vpinsrd	xmm5, xmm5, eax, 2
  1087  	vpinsrd	xmm5, xmm5, r10d, 3
  1088  	vmovd	xmm6, r9d
  1089  	vpinsrd	xmm6, xmm6, edi, 1
  1090  	vpinsrd	xmm6, xmm6, ecx, 2
  1091  	vpinsrd	xmm6, xmm6, edx, 3
  1092  	vinserti128	ymm5, ymm6, xmm5, 1
  1093  	vpsrlvd	ymm5, ymm5, ymm2
  1094  	vpand	ymm5, ymm5, ymm0
  1095  	vmovdqu	ymmword ptr [r15 - 64], ymm5    # outputs 8..15
  1096  	mov	r9d, dword ptr [rbx - 16]
  1097  	mov	r11d, dword ptr [rbx - 20]
  1098  	mov	edx, r9d
  1099  	shld	edx, r11d, 10
  1100  	mov	r10d, dword ptr [rbx - 24]
  1101  	mov	edi, r11d
  1102  	shld	edi, r10d, 12
  1103  	mov	eax, dword ptr [rbx - 28]
  1104  	mov	esi, r10d
  1105  	shld	esi, eax, 14
  1106  	mov	ecx, dword ptr [rbx - 32]
  1107  	shrd	ecx, eax, 16                    # shrd: (dst >> n) | (src << (32 - n)); same alignment done from the low side
  1108  	vmovd	xmm5, edi
  1109  	vpinsrd	xmm5, xmm5, r11d, 1
  1110  	vpinsrd	xmm5, xmm5, edx, 2
  1111  	vpinsrd	xmm5, xmm5, r9d, 3
  1112  	vmovd	xmm6, ecx
  1113  	vpinsrd	xmm6, xmm6, eax, 1
  1114  	vpinsrd	xmm6, xmm6, esi, 2
  1115  	vpinsrd	xmm6, xmm6, r10d, 3
  1116  	vinserti128	ymm5, ymm6, xmm5, 1
  1117  	vpsrlvd	ymm5, ymm5, ymm3
  1118  	vpand	ymm5, ymm5, ymm0
  1119  	vmovdqu	ymmword ptr [r15 - 32], ymm5    # outputs 16..23
  1120  	mov	r9d, dword ptr [rbx]
  1121  	mov	r11d, dword ptr [rbx - 4]
  1122  	mov	edx, r9d
  1123  	shld	edx, r11d, 2
  1124  	mov	r10d, dword ptr [rbx - 8]
  1125  	mov	edi, r11d
  1126  	shld	edi, r10d, 4
  1127  	mov	eax, dword ptr [rbx - 16]
  1128  	mov	esi, dword ptr [rbx - 12]
  1129  	mov	ecx, r10d
  1130  	shld	ecx, esi, 6
  1131  	shrd	eax, esi, 24
  1132  	vmovd	xmm5, edi
  1133  	vpinsrd	xmm5, xmm5, r11d, 1
  1134  	vpinsrd	xmm5, xmm5, edx, 2
  1135  	vpinsrd	xmm5, xmm5, r9d, 3
  1136  	vmovd	xmm6, eax
  1137  	vpinsrd	xmm6, xmm6, esi, 1
  1138  	vpinsrd	xmm6, xmm6, ecx, 2
  1139  	vpinsrd	xmm6, xmm6, r10d, 3
  1140  	vinserti128	ymm5, ymm6, xmm5, 1
  1141  	vpsrlvd	ymm5, ymm5, ymm4
  1142  	vpand	ymm5, ymm5, ymm0
  1143  	vmovdqu	ymmword ptr [r15], ymm5         # outputs 24..31
  1144  	sub	r15, -128                       # r15 += 128 (32 output dwords)
  1145  	add	rbx, 68                         # advance past the 17 input dwords of this group
  1146  	add	r8, -1
  1147  	jne	.LBB0_55
  1148  	jmp	.LBB0_147
  1149  .LBB0_25:                               # reached with 8 <= ecx <= 15
  1150  	cmp	ecx, 11
  1151  	jg	.LBB0_37                        # 12..15
  1152  # %bb.26:
  1153  	cmp	ecx, 9
  1154  	jg	.LBB0_32                        # 10..11
  1155  # %bb.27:
  1156  	cmp	ecx, 8
  1157  	je	.LBB0_132                       # 8
  1158  # %bb.28:
  1159  	cmp	ecx, 9
  1160  	jne	.LBB0_147
  1161  # %bb.29:
  1162  	cmp	edx, 32
  1163  	jl	.LBB0_147                       # fewer than 32 values: skip the loop
  1164  # %bb.30:
  1165  	mov	r8d, r14d                       # r8 = number of 32-value groups
  1166  	add	r15, 96                         # bias: stores use r15 - 96 .. r15
  1167  	add	rbx, 32                         # bias: loads use rbx - 32 .. rbx (9 dwords per group)
  1168  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_105] # ymm0 = [2194728288767,2194728288767,2194728288767,2194728288767]
  1169  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_104] # ymm1 = [0,9,18,0,4,13,22,0]
  1170  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_106] # ymm2 = [8,17,0,3,12,21,0,7]
  1171  	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_107] # ymm3 = [16,0,2,11,20,0,6,15]
  1172  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_108] # ymm4 = [0,1,10,19,0,5,14,23]
  1173  	.p2align	4, 0x90
  1174  .LBB0_31:                               # =>This Inner Loop Header: Depth=1
  1175  	mov	ecx, dword ptr [rbx - 32]
  1176  	mov	edx, dword ptr [rbx - 28]
  1177  	mov	esi, dword ptr [rbx - 24]
  1178  	shld	esi, edx, 1                     # shld: (dst << n) | (src >> (32 - n)); aligns a field that spans two input dwords to bit 0
  1179  	vmovd	xmm5, edx
  1180  	vpinsrd	xmm5, xmm5, edx, 1
  1181  	vpinsrd	xmm5, xmm5, edx, 2              # edx is inserted unmodified before the shld below changes it
  1182  	shld	edx, ecx, 5
  1183  	vpinsrd	xmm5, xmm5, esi, 3
  1184  	vmovd	xmm6, ecx
  1185  	vpinsrd	xmm6, xmm6, ecx, 1
  1186  	vpinsrd	xmm6, xmm6, ecx, 2
  1187  	vpinsrd	xmm6, xmm6, edx, 3
  1188  	vinserti128	ymm5, ymm6, xmm5, 1     # ymm5 = xmm6 (low half) : xmm5 (high half)
  1189  	vpsrlvd	ymm5, ymm5, ymm1                # per-lane right shift; the shld-aligned lanes have count 0
  1190  	vpand	ymm5, ymm5, ymm0                # keep the low 9 bits of each lane (2194728288767 = 0x1ff000001ff per qword)
  1191  	vmovdqu	ymmword ptr [r15 - 96], ymm5    # outputs 0..7 of this group
  1192  	mov	ecx, dword ptr [rbx - 16]
  1193  	mov	edx, dword ptr [rbx - 24]
  1194  	mov	esi, dword ptr [rbx - 20]
  1195  	mov	edi, ecx
  1196  	shld	edi, esi, 2
  1197  	mov	eax, esi
  1198  	shld	eax, edx, 6
  1199  	vmovd	xmm5, esi
  1200  	vpinsrd	xmm5, xmm5, esi, 1
  1201  	vpinsrd	xmm5, xmm5, edi, 2
  1202  	vpinsrd	xmm5, xmm5, ecx, 3
  1203  	vmovd	xmm6, edx
  1204  	vpinsrd	xmm6, xmm6, edx, 1
  1205  	vpinsrd	xmm6, xmm6, eax, 2
  1206  	vpinsrd	xmm6, xmm6, esi, 3
  1207  	vinserti128	ymm5, ymm6, xmm5, 1
  1208  	vpsrlvd	ymm5, ymm5, ymm2
  1209  	vpand	ymm5, ymm5, ymm0
  1210  	vmovdqu	ymmword ptr [r15 - 64], ymm5    # outputs 8..15
  1211  	mov	eax, dword ptr [rbx - 8]
  1212  	mov	ecx, dword ptr [rbx - 16]
  1213  	mov	edx, dword ptr [rbx - 12]
  1214  	mov	esi, eax
  1215  	shld	esi, edx, 3
  1216  	mov	edi, edx
  1217  	shld	edi, ecx, 7
  1218  	vmovd	xmm5, edx
  1219  	vpinsrd	xmm5, xmm5, esi, 1
  1220  	vpinsrd	xmm5, xmm5, eax, 2
  1221  	vpinsrd	xmm5, xmm5, eax, 3
  1222  	vmovd	xmm6, ecx
  1223  	vpinsrd	xmm6, xmm6, edi, 1
  1224  	vpinsrd	xmm6, xmm6, edx, 2
  1225  	vpinsrd	xmm6, xmm6, edx, 3
  1226  	vinserti128	ymm5, ymm6, xmm5, 1
  1227  	vpsrlvd	ymm5, ymm5, ymm3
  1228  	vpand	ymm5, ymm5, ymm0
  1229  	vmovdqu	ymmword ptr [r15 - 32], ymm5    # outputs 16..23
  1230  	mov	eax, dword ptr [rbx]
  1231  	mov	ecx, dword ptr [rbx - 8]
  1232  	mov	edx, dword ptr [rbx - 4]
  1233  	mov	esi, eax
  1234  	shld	esi, edx, 4
  1235  	shrd	ecx, edx, 24                    # shrd: (dst >> n) | (src << (32 - n)); same alignment done from the low side
  1236  	vmovd	xmm5, esi
  1237  	vpinsrd	xmm5, xmm5, eax, 1
  1238  	vpinsrd	xmm5, xmm5, eax, 2
  1239  	vpinsrd	xmm5, xmm5, eax, 3
  1240  	vmovd	xmm6, ecx
  1241  	vpinsrd	xmm6, xmm6, edx, 1
  1242  	vpinsrd	xmm6, xmm6, edx, 2
  1243  	vpinsrd	xmm6, xmm6, edx, 3
  1244  	vinserti128	ymm5, ymm6, xmm5, 1
  1245  	vpsrlvd	ymm5, ymm5, ymm4
  1246  	vpand	ymm5, ymm5, ymm0
  1247  	vmovdqu	ymmword ptr [r15], ymm5         # outputs 24..31
  1248  	sub	r15, -128                       # r15 += 128 (32 output dwords)
  1249  	add	rbx, 36                         # advance past the 9 input dwords of this group
  1250  	add	r8, -1
  1251  	jne	.LBB0_31
  1252  	jmp	.LBB0_147
  1253  .LBB0_73:                               # reached with 24 <= ecx <= 27
  1254  	cmp	ecx, 25
  1255  	jg	.LBB0_79                        # 26..27
  1256  # %bb.74:
  1257  	cmp	ecx, 24
  1258  	je	.LBB0_108                       # 24
  1259  # %bb.75:
  1260  	cmp	ecx, 25
  1261  	jne	.LBB0_147
  1262  # %bb.76:
  1263  	cmp	edx, 32
  1264  	jl	.LBB0_147                       # fewer than 32 values: skip the loop
  1265  # %bb.77:
  1266  	mov	r8d, r14d                       # r8 = number of 32-value groups
  1267  	add	r15, 96                         # bias: stores use r15 - 96 .. r15
  1268  	add	rbx, 96                         # bias: loads use rbx - 96 .. rbx (25 dwords per group)
  1269  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_38] # ymm0 = [144115183814443007,144115183814443007,144115183814443007,144115183814443007]
  1270  	vmovdqa	ymm9, ymmword ptr [rip + .LCPI0_28] # ymm9 = [0,0,0,0,4,0,0,0]
  1271  	vmovdqa	ymm10, ymmword ptr [rip + .LCPI0_39] # ymm10 = [0,1,0,0,0,5,0,0]
  1272  	vmovdqa	xmm11, xmmword ptr [rip + .LCPI0_40] # xmm11 = <16,9,u,u>
  1273  	vmovdqa	xmm4, xmmword ptr [rip + .LCPI0_41] # xmm4 = <16,23,u,u>
  1274  	vmovdqa	ymm5, ymmword ptr [rip + .LCPI0_42] # ymm5 = [0,0,2,0,0,0,6,0]
  1275  	vmovdqa	xmm6, xmmword ptr [rip + .LCPI0_43] # xmm6 = <24,17,u,u>
  1276  	vmovdqa	xmm7, xmmword ptr [rip + .LCPI0_44] # xmm7 = <8,15,u,u>
  1277  	vmovdqa	ymm8, ymmword ptr [rip + .LCPI0_45] # ymm8 = [0,0,0,3,0,0,0,7]
  1278  	.p2align	4, 0x90
  1279  .LBB0_78:                               # =>This Inner Loop Header: Depth=1
  1280  	mov	ecx, dword ptr [rbx - 76]
  1281  	mov	r9d, dword ptr [rbx - 72]
  1282  	shld	r9d, ecx, 17                    # shld: (dst << n) | (src >> (32 - n)); aligns a field that spans two input dwords to bit 0
  1283  	mov	esi, dword ptr [rbx - 80]
  1284  	shld	ecx, esi, 10
  1285  	mov	edi, dword ptr [rbx - 84]
  1286  	shld	esi, edi, 3
  1287  	mov	eax, dword ptr [rbx - 88]
  1288  	vmovd	xmm1, edi                       # captures edi before the shld below modifies it
  1289  	shld	edi, eax, 21
  1290  	mov	r10d, dword ptr [rbx - 96]
  1291  	mov	edx, dword ptr [rbx - 92]
  1292  	shld	eax, edx, 14
  1293  	shld	edx, r10d, 7
  1294  	vpinsrd	xmm1, xmm1, esi, 1
  1295  	vmovd	xmm2, r10d
  1296  	vpinsrd	xmm1, xmm1, ecx, 2
  1297  	vpinsrd	xmm2, xmm2, edx, 1
  1298  	vpinsrd	xmm1, xmm1, r9d, 3
  1299  	vpinsrd	xmm2, xmm2, eax, 2
  1300  	vpinsrd	xmm2, xmm2, edi, 3
  1301  	vinserti128	ymm1, ymm2, xmm1, 1     # ymm1 = xmm2 (low half) : xmm1 (high half)
  1302  	vpsrlvd	ymm1, ymm1, ymm9                # per-lane right shift; pre-aligned lanes have count 0
  1303  	vpand	ymm1, ymm1, ymm0                # keep the low 25 bits of each lane (mask 0x1ffffff per dword)
  1304  	vmovdqu	ymmword ptr [r15 - 96], ymm1    # outputs 0..7 of this group
  1305  	mov	r11d, dword ptr [rbx - 52]
  1306  	mov	r9d, dword ptr [rbx - 48]
  1307  	shld	r9d, r11d, 9
  1308  	mov	r10d, dword ptr [rbx - 56]
  1309  	shld	r11d, r10d, 2
  1310  	mov	esi, dword ptr [rbx - 60]
  1311  	mov	edi, r10d
  1312  	mov	ecx, dword ptr [rbx - 64]
  1313  	shld	edi, esi, 20
  1314  	mov	edx, dword ptr [rbx - 72]
  1315  	mov	eax, dword ptr [rbx - 68]
  1316  	shld	esi, ecx, 13
  1317  	shrd	edx, eax, 8                     # shrd: (dst >> n) | (src << (32 - n)); same alignment done from the low side
  1318  	shld	ecx, eax, 6
  1319  	vmovd	xmm1, edi
  1320  	vpinsrd	xmm1, xmm1, r10d, 1
  1321  	vmovd	xmm2, edx
  1322  	vpinsrd	xmm1, xmm1, r11d, 2
  1323  	vpinsrd	xmm2, xmm2, eax, 1
  1324  	vpinsrd	xmm1, xmm1, r9d, 3
  1325  	vpinsrd	xmm2, xmm2, ecx, 2
  1326  	vpinsrd	xmm2, xmm2, esi, 3
  1327  	vinserti128	ymm1, ymm2, xmm1, 1
  1328  	vpsrlvd	ymm1, ymm1, ymm10
  1329  	vpand	ymm1, ymm1, ymm0
  1330  	vmovdqu	ymmword ptr [r15 - 64], ymm1    # outputs 8..15
  1331  	mov	eax, dword ptr [rbx - 28]
  1332  	mov	r9d, dword ptr [rbx - 24]
  1333  	shld	r9d, eax, 1
  1334  	mov	edx, dword ptr [rbx - 32]
  1335  	mov	esi, eax
  1336  	shld	esi, edx, 19
  1337  	mov	edi, dword ptr [rbx - 40]
  1338  	mov	ecx, dword ptr [rbx - 36]
  1339  	shld	edx, ecx, 12
  1340  	shld	ecx, edi, 5
  1341  	vmovq	xmm1, qword ptr [rbx - 48]      # xmm1 = mem[0],zero
  1342  	vpsrlvd	xmm2, xmm1, xmm11               # low parts: the two loaded dwords shifted right by 16 and 9
  1343  	vpshufd	xmm1, xmm1, 229                 # xmm1 = xmm1[1,1,2,3]
  1344  	vpinsrd	xmm1, xmm1, edi, 1              # lanes 0-1 now hold the dword following each loaded dword
  1345  	vpsllvd	xmm1, xmm1, xmm4                # high parts: shifted left by 16 and 23
  1346  	vpor	xmm1, xmm2, xmm1                # lanes 0-1 = two fields that span dword boundaries, aligned to bit 0
  1347  	vmovd	xmm2, edx
  1348  	vpinsrd	xmm2, xmm2, esi, 1
  1349  	vpinsrd	xmm2, xmm2, eax, 2
  1350  	vpinsrd	xmm2, xmm2, r9d, 3
  1351  	vpinsrd	xmm1, xmm1, edi, 2
  1352  	vpinsrd	xmm1, xmm1, ecx, 3
  1353  	vinserti128	ymm1, ymm1, xmm2, 1
  1354  	vpsrlvd	ymm1, ymm1, ymm5
  1355  	vpand	ymm1, ymm1, ymm0
  1356  	vmovdqu	ymmword ptr [r15 - 32], ymm1    # outputs 16..23
  1357  	mov	r9d, dword ptr [rbx]
  1358  	mov	ecx, dword ptr [rbx - 4]
  1359  	mov	edx, r9d
  1360  	shld	edx, ecx, 18
  1361  	mov	esi, dword ptr [rbx - 8]
  1362  	shld	ecx, esi, 11
  1363  	mov	r10d, dword ptr [rbx - 16]
  1364  	mov	edi, dword ptr [rbx - 12]
  1365  	shld	esi, edi, 4
  1366  	mov	eax, edi
  1367  	shld	eax, r10d, 22
  1368  	vmovq	xmm1, qword ptr [rbx - 24]      # xmm1 = mem[0],zero
  1369  	vpsrlvd	xmm2, xmm1, xmm6                # low parts: shifted right by 24 and 17
  1370  	vpshufd	xmm1, xmm1, 229                 # xmm1 = xmm1[1,1,2,3]
  1371  	vpinsrd	xmm1, xmm1, r10d, 1
  1372  	vpsllvd	xmm1, xmm1, xmm7                # high parts: shifted left by 8 and 15
  1373  	vmovd	xmm3, esi
  1374  	vpinsrd	xmm3, xmm3, ecx, 1
  1375  	vpor	xmm1, xmm2, xmm1
  1376  	vpinsrd	xmm2, xmm3, edx, 2
  1377  	vpinsrd	xmm2, xmm2, r9d, 3
  1378  	vpinsrd	xmm1, xmm1, eax, 2
  1379  	vpinsrd	xmm1, xmm1, edi, 3
  1380  	vinserti128	ymm1, ymm1, xmm2, 1
  1381  	vpsrlvd	ymm1, ymm1, ymm8
  1382  	vpand	ymm1, ymm1, ymm0
  1383  	vmovdqu	ymmword ptr [r15], ymm1         # outputs 24..31
  1384  	sub	r15, -128                       # r15 += 128 (32 output dwords)
  1385  	add	rbx, 100                        # advance past the 25 input dwords of this group
  1386  	add	r8, -1
  1387  	jne	.LBB0_78
  1388  	jmp	.LBB0_147
  1389  .LBB0_14:                               # reached with 4 <= ecx <= 7
  1390  	cmp	ecx, 5
  1391  	jg	.LBB0_20                        # 6..7
  1392  # %bb.15:
  1393  	cmp	ecx, 4
  1394  	je	.LBB0_138                       # 4
  1395  # %bb.16:
  1396  	cmp	ecx, 5
  1397  	jne	.LBB0_147
  1398  # %bb.17:
  1399  	cmp	edx, 32
  1400  	jl	.LBB0_147                       # fewer than 32 values: skip the loop
  1401  # %bb.18:
  1402  	mov	eax, r14d                       # rax = number of 32-value groups
  1403  	add	r15, 96                         # bias: stores use r15 - 96 .. r15
  1404  	add	rbx, 16                         # bias: loads use rbx - 16 .. rbx (5 dwords per group)
  1405  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_120] # ymm0 = [133143986207,133143986207,133143986207,133143986207]
  1406  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_119] # ymm1 = [0,5,10,15,20,25,0,3]
  1407  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_121] # ymm2 = [8,13,18,23,0,1,6,11]
  1408  	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_122] # ymm3 = [16,21,26,0,4,9,14,19]
  1409  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_123] # ymm4 = [24,0,2,7,12,17,22,27]
  1410  	.p2align	4, 0x90
  1411  .LBB0_19:                               # =>This Inner Loop Header: Depth=1
  1412  	mov	ecx, dword ptr [rbx - 16]       # first input dword of the group
  1413  	mov	edx, dword ptr [rbx - 12]       # second input dword
  1414  	mov	esi, edx
  1415  	shld	esi, ecx, 2                     # esi = (second << 2) | (first >> 30): the field spanning both dwords, aligned to bit 0
  1416  	vmovd	xmm5, ecx
  1417  	vpbroadcastd	xmm6, xmm5              # xmm6 = first dword in all four lanes
  1418  	vpinsrd	xmm5, xmm5, ecx, 1
  1419  	vpinsrd	xmm5, xmm5, esi, 2
  1420  	vpinsrd	xmm5, xmm5, edx, 3
  1421  	vinserti128	ymm5, ymm6, xmm5, 1     # ymm5 = xmm6 (low half) : xmm5 (high half)
  1422  	vpsrlvd	ymm5, ymm5, ymm1                # per-lane right shift; the shld-aligned lane has count 0
  1423  	vpand	ymm5, ymm5, ymm0                # keep the low 5 bits of each lane (133143986207 = 0x1f0000001f per qword)
  1424  	vmovdqu	ymmword ptr [r15 - 96], ymm5    # outputs 0..7 of this group
  1425  	mov	ecx, dword ptr [rbx - 12]
  1426  	mov	edx, dword ptr [rbx - 8]
  1427  	mov	esi, edx
  1428  	shld	esi, ecx, 4
  1429  	vmovd	xmm5, ecx
  1430  	vpbroadcastd	xmm5, xmm5
  1431  	vmovd	xmm6, esi
  1432  	vpinsrd	xmm6, xmm6, edx, 1
  1433  	vpinsrd	xmm6, xmm6, edx, 2
  1434  	vpinsrd	xmm6, xmm6, edx, 3
  1435  	vinserti128	ymm5, ymm5, xmm6, 1
  1436  	vpsrlvd	ymm5, ymm5, ymm2
  1437  	vpand	ymm5, ymm5, ymm0
  1438  	vmovdqu	ymmword ptr [r15 - 64], ymm5    # outputs 8..15
  1439  	mov	ecx, dword ptr [rbx - 8]
  1440  	mov	edx, dword ptr [rbx - 4]
  1441  	vmovd	xmm5, edx                       # captures edx before the shld below modifies it
  1442  	shld	edx, ecx, 1
  1443  	vmovd	xmm6, ecx
  1444  	vpinsrd	xmm6, xmm6, ecx, 1
  1445  	vpinsrd	xmm6, xmm6, ecx, 2
  1446  	vpinsrd	xmm6, xmm6, edx, 3
  1447  	vpbroadcastd	xmm5, xmm5
  1448  	vinserti128	ymm5, ymm6, xmm5, 1
  1449  	vpsrlvd	ymm5, ymm5, ymm3
  1450  	vpand	ymm5, ymm5, ymm0
  1451  	vmovdqu	ymmword ptr [r15 - 32], ymm5    # outputs 16..23
  1452  	mov	ecx, dword ptr [rbx - 4]
  1453  	mov	edx, dword ptr [rbx]
  1454  	mov	esi, edx
  1455  	shld	esi, ecx, 3
  1456  	vmovd	xmm5, ecx
  1457  	vpinsrd	xmm5, xmm5, esi, 1
  1458  	vpinsrd	xmm5, xmm5, edx, 2
  1459  	vpinsrd	xmm5, xmm5, edx, 3
  1460  	vmovd	xmm6, edx
  1461  	vpbroadcastd	xmm6, xmm6
  1462  	vinserti128	ymm5, ymm5, xmm6, 1
  1463  	vpsrlvd	ymm5, ymm5, ymm4
  1464  	vpand	ymm5, ymm5, ymm0
  1465  	vmovdqu	ymmword ptr [r15], ymm5         # outputs 24..31
  1466  	sub	r15, -128                       # r15 += 128 (32 output dwords)
  1467  	add	rbx, 20                         # advance past the 5 input dwords of this group
  1468  	add	rax, -1
  1469  	jne	.LBB0_19
  1470  	jmp	.LBB0_147
  1471  .LBB0_61:
  1472  	cmp	ecx, 21
  1473  	jg	.LBB0_67
  1474  # %bb.62:
  1475  	cmp	ecx, 20
  1476  	je	.LBB0_114
  1477  # %bb.63:
  1478  	cmp	ecx, 21
  1479  	jne	.LBB0_147
  1480  # %bb.64:
  1481  	cmp	edx, 32
  1482  	jl	.LBB0_147
  1483  # %bb.65:
  1484  	mov	r8d, r14d
  1485  	add	r15, 96
  1486  	add	rbx, 80
  1487  	vmovdqa	ymm8, ymmword ptr [rip + .LCPI0_58] # ymm8 = [0,0,10,0,0,9,0,0]
  1488  	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_59] # ymm1 = [9007194961870847,9007194961870847,9007194961870847,9007194961870847]
  1489  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_60] # ymm2 = [8,0,0,7,0,0,6,0]
  1490  	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_61] # ymm3 = [0,5,0,0,4,0,0,3]
  1491  	vmovdqa	xmm4, xmmword ptr [rip + .LCPI0_62] # xmm4 = <24,13,u,u>
  1492  	vmovdqa	xmm5, xmmword ptr [rip + .LCPI0_63] # xmm5 = <8,19,u,u>
  1493  	vmovdqa	ymm6, ymmword ptr [rip + .LCPI0_64] # ymm6 = [0,0,2,0,0,1,0,11]
  1494  	.p2align	4, 0x90
  1495  .LBB0_66:                               # =>This Inner Loop Header: Depth=1
  1496  	mov	ecx, dword ptr [rbx - 64]
  1497  	mov	r9d, dword ptr [rbx - 60]
  1498  	shld	r9d, ecx, 13
  1499  	mov	r11d, dword ptr [rbx - 68]
  1500  	shld	ecx, r11d, 2
  1501  	mov	edi, dword ptr [rbx - 72]
  1502  	mov	esi, r11d
  1503  	shld	esi, edi, 12
  1504  	mov	r10d, dword ptr [rbx - 80]
  1505  	mov	eax, dword ptr [rbx - 76]
  1506  	shld	edi, eax, 1
  1507  	mov	edx, eax
  1508  	shld	edx, r10d, 11
  1509  	vmovd	xmm7, r10d
  1510  	vmovd	xmm0, esi
  1511  	vpinsrd	xmm7, xmm7, edx, 1
  1512  	vpinsrd	xmm0, xmm0, r11d, 1
  1513  	vpinsrd	xmm7, xmm7, eax, 2
  1514  	vpinsrd	xmm0, xmm0, ecx, 2
  1515  	vpinsrd	xmm7, xmm7, edi, 3
  1516  	vpinsrd	xmm0, xmm0, r9d, 3
  1517  	vinserti128	ymm0, ymm7, xmm0, 1
  1518  	vpsrlvd	ymm0, ymm0, ymm8
  1519  	vpand	ymm0, ymm0, ymm1
  1520  	vmovdqu	ymmword ptr [r15 - 96], ymm0
  1521  	mov	r10d, dword ptr [rbx - 44]
  1522  	mov	r9d, dword ptr [rbx - 40]
  1523  	shld	r9d, r10d, 5
  1524  	mov	edx, dword ptr [rbx - 48]
  1525  	mov	esi, r10d
  1526  	shld	esi, edx, 15
  1527  	mov	ecx, dword ptr [rbx - 52]
  1528  	shld	edx, ecx, 4
  1529  	mov	r11d, dword ptr [rbx - 60]
  1530  	mov	eax, dword ptr [rbx - 56]
  1531  	mov	edi, ecx
  1532  	shld	edi, eax, 14
  1533  	shld	eax, r11d, 3
  1534  	vmovd	xmm0, r11d
  1535  	vmovd	xmm7, edx
  1536  	vpinsrd	xmm0, xmm0, eax, 1
  1537  	vpinsrd	xmm7, xmm7, esi, 1
  1538  	vpinsrd	xmm0, xmm0, edi, 2
  1539  	vpinsrd	xmm7, xmm7, r10d, 2
  1540  	vpinsrd	xmm0, xmm0, ecx, 3
  1541  	vpinsrd	xmm7, xmm7, r9d, 3
  1542  	vinserti128	ymm0, ymm0, xmm7, 1
  1543  	vpsrlvd	ymm0, ymm0, ymm2
  1544  	vpand	ymm0, ymm0, ymm1
  1545  	vmovdqu	ymmword ptr [r15 - 64], ymm0
  1546  	mov	r9d, dword ptr [rbx - 20]
  1547  	mov	ecx, dword ptr [rbx - 24]
  1548  	mov	r10d, r9d
  1549  	shld	r10d, ecx, 18
  1550  	mov	esi, dword ptr [rbx - 28]
  1551  	shld	ecx, esi, 7
  1552  	mov	edi, dword ptr [rbx - 32]
  1553  	vmovd	xmm0, esi
  1554  	shld	esi, edi, 17
  1555  	mov	eax, dword ptr [rbx - 40]
  1556  	mov	edx, dword ptr [rbx - 36]
  1557  	shld	edi, edx, 6
  1558  	shrd	eax, edx, 16
  1559  	vpinsrd	xmm0, xmm0, ecx, 1
  1560  	vmovd	xmm7, eax
  1561  	vpinsrd	xmm0, xmm0, r10d, 2
  1562  	vpinsrd	xmm7, xmm7, edx, 1
  1563  	vpinsrd	xmm0, xmm0, r9d, 3
  1564  	vpinsrd	xmm7, xmm7, edi, 2
  1565  	vpinsrd	xmm7, xmm7, esi, 3
  1566  	vinserti128	ymm0, ymm7, xmm0, 1
  1567  	vpsrlvd	ymm0, ymm0, ymm3
  1568  	vpand	ymm0, ymm0, ymm1
  1569  	vmovdqu	ymmword ptr [r15 - 32], ymm0
  1570  	mov	r9d, dword ptr [rbx]
  1571  	mov	eax, dword ptr [rbx - 4]
  1572  	mov	edx, r9d
  1573  	shld	edx, eax, 10
  1574  	mov	esi, dword ptr [rbx - 12]
  1575  	mov	edi, dword ptr [rbx - 8]
  1576  	mov	ecx, eax
  1577  	shld	ecx, edi, 20
  1578  	shld	edi, esi, 9
  1579  	vmovq	xmm0, qword ptr [rbx - 20]      # xmm0 = mem[0],zero
  1580  	vpsrlvd	xmm7, xmm0, xmm4
  1581  	vpshufd	xmm0, xmm0, 229                 # xmm0 = xmm0[1,1,2,3]
  1582  	vpinsrd	xmm0, xmm0, esi, 1
  1583  	vpsllvd	xmm0, xmm0, xmm5
  1584  	vpor	xmm0, xmm7, xmm0
  1585  	vmovd	xmm7, ecx
  1586  	vpinsrd	xmm7, xmm7, eax, 1
  1587  	vpinsrd	xmm7, xmm7, edx, 2
  1588  	vpinsrd	xmm7, xmm7, r9d, 3
  1589  	vpinsrd	xmm0, xmm0, esi, 2
  1590  	vpinsrd	xmm0, xmm0, edi, 3
  1591  	vinserti128	ymm0, ymm0, xmm7, 1
  1592  	vpsrlvd	ymm0, ymm0, ymm6
  1593  	vpand	ymm0, ymm0, ymm1
  1594  	vmovdqu	ymmword ptr [r15], ymm0
  1595  	sub	r15, -128
  1596  	add	rbx, 84
  1597  	add	r8, -1
  1598  	jne	.LBB0_66
  1599  	jmp	.LBB0_147
  1600  .LBB0_37:
  1601  	cmp	ecx, 13
  1602  	jg	.LBB0_43
  1603  # %bb.38:
  1604  	cmp	ecx, 12
  1605  	je	.LBB0_126
  1606  # %bb.39:
  1607  	cmp	ecx, 13
  1608  	jne	.LBB0_147
  1609  # %bb.40:
  1610  	cmp	edx, 32
  1611  	jl	.LBB0_147
  1612  # %bb.41:
  1613  	mov	r8d, r14d
  1614  	add	r15, 96
  1615  	add	rbx, 48
  1616  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_90] # ymm0 = [35180077129727,35180077129727,35180077129727,35180077129727]
  1617  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_89] # ymm1 = [0,13,0,7,0,1,14,0]
  1618  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_91] # ymm2 = [8,0,2,15,0,9,0,3]
  1619  	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_92] # ymm3 = [16,0,10,0,4,17,0,11]
  1620  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_93] # ymm4 = [0,5,18,0,12,0,6,19]
  1621  	.p2align	4, 0x90
  1622  .LBB0_42:                               # =>This Inner Loop Header: Depth=1
  1623  	mov	eax, dword ptr [rbx - 40]
  1624  	mov	r9d, dword ptr [rbx - 36]
  1625  	shld	r9d, eax, 5
  1626  	mov	esi, dword ptr [rbx - 48]
  1627  	mov	edx, dword ptr [rbx - 44]
  1628  	mov	ecx, eax
  1629  	shld	ecx, edx, 12
  1630  	mov	edi, edx
  1631  	shld	edi, esi, 6
  1632  	vmovd	xmm5, ecx
  1633  	vpinsrd	xmm5, xmm5, eax, 1
  1634  	vpinsrd	xmm5, xmm5, eax, 2
  1635  	vpinsrd	xmm5, xmm5, r9d, 3
  1636  	vmovd	xmm6, esi
  1637  	vpinsrd	xmm6, xmm6, esi, 1
  1638  	vpinsrd	xmm6, xmm6, edi, 2
  1639  	vpinsrd	xmm6, xmm6, edx, 3
  1640  	vinserti128	ymm5, ymm6, xmm5, 1
  1641  	vpsrlvd	ymm5, ymm5, ymm1
  1642  	vpand	ymm5, ymm5, ymm0
  1643  	vmovdqu	ymmword ptr [r15 - 96], ymm5
  1644  	mov	r9d, dword ptr [rbx - 24]
  1645  	mov	ecx, dword ptr [rbx - 28]
  1646  	mov	edx, r9d
  1647  	shld	edx, ecx, 10
  1648  	mov	esi, dword ptr [rbx - 32]
  1649  	mov	edi, ecx
  1650  	shld	edi, esi, 4
  1651  	mov	r10d, dword ptr [rbx - 36]
  1652  	mov	eax, esi
  1653  	shld	eax, r10d, 11
  1654  	vmovd	xmm5, edi
  1655  	vpinsrd	xmm5, xmm5, ecx, 1
  1656  	vpinsrd	xmm5, xmm5, edx, 2
  1657  	vpinsrd	xmm5, xmm5, r9d, 3
  1658  	vmovd	xmm6, r10d
  1659  	vpinsrd	xmm6, xmm6, eax, 1
  1660  	vpinsrd	xmm6, xmm6, esi, 2
  1661  	vpinsrd	xmm6, xmm6, esi, 3
  1662  	vinserti128	ymm5, ymm6, xmm5, 1
  1663  	vpsrlvd	ymm5, ymm5, ymm2
  1664  	vpand	ymm5, ymm5, ymm0
  1665  	vmovdqu	ymmword ptr [r15 - 64], ymm5
  1666  	mov	r9d, dword ptr [rbx - 12]
  1667  	mov	ecx, dword ptr [rbx - 16]
  1668  	mov	edx, r9d
  1669  	shld	edx, ecx, 2
  1670  	mov	esi, dword ptr [rbx - 24]
  1671  	mov	eax, dword ptr [rbx - 20]
  1672  	vmovd	xmm5, ecx
  1673  	vpinsrd	xmm5, xmm5, ecx, 1
  1674  	shld	ecx, eax, 9
  1675  	mov	edi, eax
  1676  	shld	edi, esi, 3
  1677  	vpinsrd	xmm5, xmm5, edx, 2
  1678  	vpinsrd	xmm5, xmm5, r9d, 3
  1679  	vmovd	xmm6, esi
  1680  	vpinsrd	xmm6, xmm6, edi, 1
  1681  	vpinsrd	xmm6, xmm6, eax, 2
  1682  	vpinsrd	xmm6, xmm6, ecx, 3
  1683  	vinserti128	ymm5, ymm6, xmm5, 1
  1684  	vpsrlvd	ymm5, ymm5, ymm3
  1685  	vpand	ymm5, ymm5, ymm0
  1686  	vmovdqu	ymmword ptr [r15 - 32], ymm5
  1687  	mov	eax, dword ptr [rbx]
  1688  	mov	ecx, dword ptr [rbx - 4]
  1689  	mov	edx, eax
  1690  	shld	edx, ecx, 7
  1691  	mov	esi, dword ptr [rbx - 8]
  1692  	vmovd	xmm5, ecx
  1693  	shld	ecx, esi, 1
  1694  	mov	edi, dword ptr [rbx - 12]
  1695  	shrd	edi, esi, 24
  1696  	vmovd	xmm6, edi
  1697  	vpinsrd	xmm6, xmm6, esi, 1
  1698  	vpinsrd	xmm6, xmm6, esi, 2
  1699  	vpinsrd	xmm6, xmm6, ecx, 3
  1700  	vpinsrd	xmm5, xmm5, edx, 1
  1701  	vpinsrd	xmm5, xmm5, eax, 2
  1702  	vpinsrd	xmm5, xmm5, eax, 3
  1703  	vinserti128	ymm5, ymm6, xmm5, 1
  1704  	vpsrlvd	ymm5, ymm5, ymm4
  1705  	vpand	ymm5, ymm5, ymm0
  1706  	vmovdqu	ymmword ptr [r15], ymm5
  1707  	sub	r15, -128
  1708  	add	rbx, 52
  1709  	add	r8, -1
  1710  	jne	.LBB0_42
  1711  	jmp	.LBB0_147
  1712  .LBB0_85:
  1713  	cmp	ecx, 28
  1714  	je	.LBB0_102
  1715  # %bb.86:
  1716  	cmp	ecx, 29
  1717  	jne	.LBB0_147
  1718  # %bb.87:
  1719  	cmp	edx, 32
  1720  	jl	.LBB0_147
  1721  # %bb.88:
  1722  	mov	r8d, r14d
  1723  	add	r15, 96
  1724  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_12] # ymm0 = [2305843005455597567,2305843005455597567,2305843005455597567,2305843005455597567]
  1725  	vmovdqa	xmm8, xmmword ptr [rip + .LCPI0_13] # xmm8 = <8,5,u,u>
  1726  	vmovdqa	xmm10, xmmword ptr [rip + .LCPI0_14] # xmm10 = <24,27,u,u>
  1727  	vmovdqa	ymm11, ymmword ptr [rip + .LCPI0_15] # ymm11 = [0,0,2,0,0,0,0,0]
  1728  	vmovdqa	xmm12, xmmword ptr [rip + .LCPI0_16] # xmm12 = [16,13,10,7]
  1729  	vmovdqa	xmm5, xmmword ptr [rip + .LCPI0_17] # xmm5 = [16,19,22,25]
  1730  	vmovdqa	ymm6, ymmword ptr [rip + .LCPI0_18] # ymm6 = [0,0,0,0,0,1,0,0]
  1731  	vmovdqa	xmm7, xmmword ptr [rip + .LCPI0_19] # xmm7 = [24,21,18,15]
  1732  	vmovdqa	xmm1, xmmword ptr [rip + .LCPI0_20] # xmm1 = [8,11,14,17]
  1733  	vmovdqa	ymm9, ymmword ptr [rip + .LCPI0_21] # ymm9 = [0,0,0,0,0,0,0,3]
  1734  	.p2align	4, 0x90
  1735  .LBB0_89:                               # =>This Inner Loop Header: Depth=1
  1736  	mov	r11d, dword ptr [rbx + 24]
  1737  	mov	r9d, dword ptr [rbx + 28]
  1738  	shld	r9d, r11d, 21
  1739  	mov	esi, dword ptr [rbx + 20]
  1740  	shld	r11d, esi, 18
  1741  	mov	edi, dword ptr [rbx + 16]
  1742  	shld	esi, edi, 15
  1743  	mov	eax, dword ptr [rbx + 12]
  1744  	shld	edi, eax, 12
  1745  	mov	edx, dword ptr [rbx + 8]
  1746  	shld	eax, edx, 9
  1747  	mov	r10d, dword ptr [rbx]
  1748  	mov	ecx, dword ptr [rbx + 4]
  1749  	shld	edx, ecx, 6
  1750  	shld	ecx, r10d, 3
  1751  	vmovd	xmm2, r10d
  1752  	vmovd	xmm3, edi
  1753  	vpinsrd	xmm2, xmm2, ecx, 1
  1754  	vpinsrd	xmm3, xmm3, esi, 1
  1755  	vpinsrd	xmm2, xmm2, edx, 2
  1756  	vpinsrd	xmm3, xmm3, r11d, 2
  1757  	vpinsrd	xmm2, xmm2, eax, 3
  1758  	vpinsrd	xmm3, xmm3, r9d, 3
  1759  	vinserti128	ymm2, ymm2, xmm3, 1
  1760  	vpand	ymm2, ymm2, ymm0
  1761  	vmovdqu	ymmword ptr [r15 - 96], ymm2
  1762  	mov	eax, dword ptr [rbx + 52]
  1763  	mov	r9d, dword ptr [rbx + 56]
  1764  	shld	r9d, eax, 13
  1765  	mov	edx, dword ptr [rbx + 48]
  1766  	shld	eax, edx, 10
  1767  	mov	esi, dword ptr [rbx + 44]
  1768  	shld	edx, esi, 7
  1769  	mov	edi, dword ptr [rbx + 36]
  1770  	mov	ecx, dword ptr [rbx + 40]
  1771  	shld	esi, ecx, 4
  1772  	shld	ecx, edi, 1
  1773  	vmovq	xmm2, qword ptr [rbx + 28]      # xmm2 = mem[0],zero
  1774  	vpsrlvd	xmm3, xmm2, xmm8
  1775  	vpshufd	xmm2, xmm2, 229                 # xmm2 = xmm2[1,1,2,3]
  1776  	vpinsrd	xmm2, xmm2, edi, 1
  1777  	vpsllvd	xmm2, xmm2, xmm10
  1778  	vpor	xmm2, xmm3, xmm2
  1779  	vmovd	xmm3, esi
  1780  	vpinsrd	xmm3, xmm3, edx, 1
  1781  	vpinsrd	xmm3, xmm3, eax, 2
  1782  	vpinsrd	xmm3, xmm3, r9d, 3
  1783  	vpinsrd	xmm2, xmm2, edi, 2
  1784  	vpinsrd	xmm2, xmm2, ecx, 3
  1785  	vinserti128	ymm2, ymm2, xmm3, 1
  1786  	vpsrlvd	ymm2, ymm2, ymm11
  1787  	vpand	ymm2, ymm2, ymm0
  1788  	vmovdqu	ymmword ptr [r15 - 64], ymm2
  1789  	mov	eax, dword ptr [rbx + 80]
  1790  	mov	ecx, dword ptr [rbx + 84]
  1791  	shld	ecx, eax, 5
  1792  	mov	edx, dword ptr [rbx + 76]
  1793  	mov	esi, dword ptr [rbx + 72]
  1794  	shld	eax, edx, 2
  1795  	mov	edi, edx
  1796  	shld	edi, esi, 28
  1797  	vmovdqu	xmm2, xmmword ptr [rbx + 56]
  1798  	vpsrlvd	xmm3, xmm2, xmm12
  1799  	vpshufd	xmm2, xmm2, 249                 # xmm2 = xmm2[1,2,3,3]
  1800  	vpinsrd	xmm2, xmm2, esi, 3
  1801  	vmovd	xmm4, edi
  1802  	vpinsrd	xmm4, xmm4, edx, 1
  1803  	vpinsrd	xmm4, xmm4, eax, 2
  1804  	vpsllvd	xmm2, xmm2, xmm5
  1805  	vpinsrd	xmm4, xmm4, ecx, 3
  1806  	vpor	xmm2, xmm3, xmm2
  1807  	vinserti128	ymm2, ymm2, xmm4, 1
  1808  	vpsrlvd	ymm2, ymm2, ymm6
  1809  	vpand	ymm2, ymm2, ymm0
  1810  	vmovdqu	ymmword ptr [r15 - 32], ymm2
  1811  	mov	eax, dword ptr [rbx + 112]
  1812  	mov	ecx, dword ptr [rbx + 108]
  1813  	mov	edx, eax
  1814  	shld	edx, ecx, 26
  1815  	mov	esi, dword ptr [rbx + 104]
  1816  	shld	ecx, esi, 23
  1817  	mov	edi, dword ptr [rbx + 100]
  1818  	vmovdqu	xmm2, xmmword ptr [rbx + 84]
  1819  	shld	esi, edi, 20
  1820  	vpsrlvd	xmm3, xmm2, xmm7
  1821  	vpshufd	xmm2, xmm2, 249                 # xmm2 = xmm2[1,2,3,3]
  1822  	vpinsrd	xmm2, xmm2, edi, 3
  1823  	vmovd	xmm4, esi
  1824  	vpinsrd	xmm4, xmm4, ecx, 1
  1825  	vpsllvd	xmm2, xmm2, xmm1
  1826  	vpinsrd	xmm4, xmm4, edx, 2
  1827  	vpinsrd	xmm4, xmm4, eax, 3
  1828  	vpor	xmm2, xmm3, xmm2
  1829  	vinserti128	ymm2, ymm2, xmm4, 1
  1830  	vpsrlvd	ymm2, ymm2, ymm9
  1831  	vpand	ymm2, ymm2, ymm0
  1832  	vmovdqu	ymmword ptr [r15], ymm2
  1833  	add	rbx, 116
  1834  	sub	r15, -128
  1835  	add	r8, -1
  1836  	jne	.LBB0_89
  1837  	jmp	.LBB0_147
  1838  .LBB0_9:
  1839  	cmp	ecx, 2
  1840  	je	.LBB0_141
  1841  # %bb.10:
  1842  	cmp	ecx, 3
  1843  	jne	.LBB0_147
  1844  # %bb.11:
  1845  	cmp	edx, 32
  1846  	jl	.LBB0_147
  1847  # %bb.12:
  1848  	mov	eax, r14d
  1849  	add	r15, 96
  1850  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_127] # ymm0 = [30064771079,30064771079,30064771079,30064771079]
  1851  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_126] # ymm1 = [0,3,6,9,12,15,18,21]
  1852  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_128] # ymm2 = [24,27,0,1,4,7,10,13]
  1853  	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_129] # ymm3 = [16,19,22,25,28,0,2,5]
  1854  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_130] # ymm4 = [8,11,14,17,20,23,26,29]
  1855  	.p2align	4, 0x90
  1856  .LBB0_13:                               # =>This Inner Loop Header: Depth=1
  1857  	vpbroadcastd	ymm5, dword ptr [rbx]
  1858  	vpsrlvd	ymm5, ymm5, ymm1
  1859  	vpand	ymm5, ymm5, ymm0
  1860  	vmovdqu	ymmword ptr [r15 - 96], ymm5
  1861  	mov	ecx, dword ptr [rbx]
  1862  	mov	edx, dword ptr [rbx + 4]
  1863  	mov	esi, edx
  1864  	shld	esi, ecx, 2
  1865  	vmovd	xmm5, ecx
  1866  	vpinsrd	xmm5, xmm5, ecx, 1
  1867  	vpinsrd	xmm5, xmm5, esi, 2
  1868  	vpinsrd	xmm5, xmm5, edx, 3
  1869  	vmovd	xmm6, edx
  1870  	vpbroadcastd	xmm6, xmm6
  1871  	vinserti128	ymm5, ymm5, xmm6, 1
  1872  	vpsrlvd	ymm5, ymm5, ymm2
  1873  	vpand	ymm5, ymm5, ymm0
  1874  	vmovdqu	ymmword ptr [r15 - 64], ymm5
  1875  	mov	ecx, dword ptr [rbx + 4]
  1876  	mov	edx, dword ptr [rbx + 8]
  1877  	mov	esi, edx
  1878  	shld	esi, ecx, 1
  1879  	vmovd	xmm5, ecx
  1880  	vpbroadcastd	xmm6, xmm5
  1881  	vpinsrd	xmm5, xmm5, esi, 1
  1882  	vpinsrd	xmm5, xmm5, edx, 2
  1883  	vpinsrd	xmm5, xmm5, edx, 3
  1884  	vinserti128	ymm5, ymm6, xmm5, 1
  1885  	vpsrlvd	ymm5, ymm5, ymm3
  1886  	vpand	ymm5, ymm5, ymm0
  1887  	vmovdqu	ymmword ptr [r15 - 32], ymm5
  1888  	vpbroadcastd	ymm5, dword ptr [rbx + 8]
  1889  	vpsrlvd	ymm5, ymm5, ymm4
  1890  	vpand	ymm5, ymm5, ymm0
  1891  	vmovdqu	ymmword ptr [r15], ymm5
  1892  	sub	r15, -128
  1893  	add	rbx, 12
  1894  	add	rax, -1
  1895  	jne	.LBB0_13
  1896  	jmp	.LBB0_147
  1897  .LBB0_56:
  1898  	cmp	ecx, 18
  1899  	je	.LBB0_117
  1900  # %bb.57:
  1901  	cmp	ecx, 19
  1902  	jne	.LBB0_147
  1903  # %bb.58:
  1904  	cmp	edx, 32
  1905  	jl	.LBB0_147
  1906  # %bb.59:
  1907  	mov	r8d, r14d
  1908  	add	r15, 96
  1909  	add	rbx, 72
  1910  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_68] # ymm0 = [2251795519242239,2251795519242239,2251795519242239,2251795519242239]
  1911  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_67] # ymm1 = [0,0,6,0,12,0,0,5]
  1912  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_69] # ymm2 = [0,11,0,0,4,0,10,0]
  1913  	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_70] # ymm3 = [0,3,0,9,0,0,2,0]
  1914  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_71] # ymm4 = [8,0,0,1,0,7,0,13]
  1915  	.p2align	4, 0x90
  1916  .LBB0_60:                               # =>This Inner Loop Header: Depth=1
  1917  	mov	r9d, dword ptr [rbx - 56]
  1918  	mov	edx, dword ptr [rbx - 60]
  1919  	mov	esi, r9d
  1920  	shld	esi, edx, 14
  1921  	mov	edi, dword ptr [rbx - 64]
  1922  	mov	r10d, dword ptr [rbx - 72]
  1923  	shld	edx, edi, 1
  1924  	mov	eax, dword ptr [rbx - 68]
  1925  	mov	ecx, eax
  1926  	shld	ecx, r10d, 13
  1927  	vmovd	xmm5, edi
  1928  	shld	edi, eax, 7
  1929  	vpinsrd	xmm5, xmm5, edx, 1
  1930  	vmovd	xmm6, r10d
  1931  	vpinsrd	xmm5, xmm5, esi, 2
  1932  	vpinsrd	xmm6, xmm6, ecx, 1
  1933  	vpinsrd	xmm5, xmm5, r9d, 3
  1934  	vpinsrd	xmm6, xmm6, eax, 2
  1935  	vpinsrd	xmm6, xmm6, edi, 3
  1936  	vinserti128	ymm5, ymm6, xmm5, 1
  1937  	vpsrlvd	ymm5, ymm5, ymm1
  1938  	vpand	ymm5, ymm5, ymm0
  1939  	vmovdqu	ymmword ptr [r15 - 96], ymm5
  1940  	mov	r10d, dword ptr [rbx - 40]
  1941  	mov	r9d, dword ptr [rbx - 36]
  1942  	shld	r9d, r10d, 3
  1943  	mov	edx, dword ptr [rbx - 44]
  1944  	mov	esi, r10d
  1945  	shld	esi, edx, 9
  1946  	mov	edi, dword ptr [rbx - 48]
  1947  	vmovd	xmm5, edx
  1948  	shld	edx, edi, 15
  1949  	mov	ecx, dword ptr [rbx - 56]
  1950  	mov	eax, dword ptr [rbx - 52]
  1951  	shld	edi, eax, 2
  1952  	shrd	ecx, eax, 24
  1953  	vpinsrd	xmm5, xmm5, esi, 1
  1954  	vmovd	xmm6, ecx
  1955  	vpinsrd	xmm5, xmm5, r10d, 2
  1956  	vpinsrd	xmm6, xmm6, eax, 1
  1957  	vpinsrd	xmm5, xmm5, r9d, 3
  1958  	vpinsrd	xmm6, xmm6, edi, 2
  1959  	vpinsrd	xmm6, xmm6, edx, 3
  1960  	vinserti128	ymm5, ymm6, xmm5, 1
  1961  	vpsrlvd	ymm5, ymm5, ymm2
  1962  	vpand	ymm5, ymm5, ymm0
  1963  	vmovdqu	ymmword ptr [r15 - 64], ymm5
  1964  	mov	r10d, dword ptr [rbx - 20]
  1965  	mov	r9d, dword ptr [rbx - 16]
  1966  	shld	r9d, r10d, 11
  1967  	mov	edx, dword ptr [rbx - 24]
  1968  	mov	esi, r10d
  1969  	mov	r11d, dword ptr [rbx - 28]
  1970  	shld	esi, edx, 17
  1971  	mov	ecx, dword ptr [rbx - 36]
  1972  	mov	eax, dword ptr [rbx - 32]
  1973  	shld	edx, r11d, 4
  1974  	mov	edi, r11d
  1975  	shld	edi, eax, 10
  1976  	shrd	ecx, eax, 16
  1977  	vmovd	xmm5, edx
  1978  	vpinsrd	xmm5, xmm5, esi, 1
  1979  	vmovd	xmm6, ecx
  1980  	vpinsrd	xmm5, xmm5, r10d, 2
  1981  	vpinsrd	xmm6, xmm6, eax, 1
  1982  	vpinsrd	xmm5, xmm5, r9d, 3
  1983  	vpinsrd	xmm6, xmm6, edi, 2
  1984  	vpinsrd	xmm6, xmm6, r11d, 3
  1985  	vinserti128	ymm5, ymm6, xmm5, 1
  1986  	vpsrlvd	ymm5, ymm5, ymm3
  1987  	vpand	ymm5, ymm5, ymm0
  1988  	vmovdqu	ymmword ptr [r15 - 32], ymm5
  1989  	mov	r9d, dword ptr [rbx]
  1990  	mov	r11d, dword ptr [rbx - 4]
  1991  	mov	edx, r9d
  1992  	shld	edx, r11d, 6
  1993  	mov	ecx, dword ptr [rbx - 8]
  1994  	mov	edi, r11d
  1995  	shld	edi, ecx, 12
  1996  	mov	r10d, dword ptr [rbx - 16]
  1997  	mov	eax, dword ptr [rbx - 12]
  1998  	mov	esi, ecx
  1999  	shld	esi, eax, 18
  2000  	shld	eax, r10d, 5
  2001  	vmovd	xmm5, r10d
  2002  	vmovd	xmm6, edi
  2003  	vpinsrd	xmm5, xmm5, eax, 1
  2004  	vpinsrd	xmm6, xmm6, r11d, 1
  2005  	vpinsrd	xmm5, xmm5, esi, 2
  2006  	vpinsrd	xmm6, xmm6, edx, 2
  2007  	vpinsrd	xmm5, xmm5, ecx, 3
  2008  	vpinsrd	xmm6, xmm6, r9d, 3
  2009  	vinserti128	ymm5, ymm5, xmm6, 1
  2010  	vpsrlvd	ymm5, ymm5, ymm4
  2011  	vpand	ymm5, ymm5, ymm0
  2012  	vmovdqu	ymmword ptr [r15], ymm5
  2013  	sub	r15, -128
  2014  	add	rbx, 76
  2015  	add	r8, -1
  2016  	jne	.LBB0_60
  2017  	jmp	.LBB0_147
  2018  .LBB0_32:
  2019  	cmp	ecx, 10
  2020  	je	.LBB0_129
  2021  # %bb.33:
  2022  	cmp	ecx, 11
  2023  	jne	.LBB0_147
  2024  # %bb.34:
  2025  	cmp	edx, 32
  2026  	jl	.LBB0_147
  2027  # %bb.35:
  2028  	mov	r8d, r14d
  2029  	add	r15, 96
  2030  	add	rbx, 40
  2031  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_97] # ymm0 = [8791798056959,8791798056959,8791798056959,8791798056959]
  2032  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_96] # ymm1 = [0,11,0,1,12,0,2,13]
  2033  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_98] # ymm2 = [0,3,14,0,4,15,0,5]
  2034  	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_99] # ymm3 = [16,0,6,17,0,7,18,0]
  2035  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_100] # ymm4 = [8,19,0,9,20,0,10,21]
  2036  	.p2align	4, 0x90
  2037  .LBB0_36:                               # =>This Inner Loop Header: Depth=1
  2038  	mov	ecx, dword ptr [rbx - 32]
  2039  	mov	edx, dword ptr [rbx - 40]
  2040  	mov	esi, dword ptr [rbx - 36]
  2041  	mov	edi, ecx
  2042  	shld	edi, esi, 9
  2043  	mov	eax, esi
  2044  	shld	eax, edx, 10
  2045  	vmovd	xmm5, esi
  2046  	vpinsrd	xmm5, xmm5, edi, 1
  2047  	vpinsrd	xmm5, xmm5, ecx, 2
  2048  	vpinsrd	xmm5, xmm5, ecx, 3
  2049  	vmovd	xmm6, edx
  2050  	vpinsrd	xmm6, xmm6, edx, 1
  2051  	vpinsrd	xmm6, xmm6, eax, 2
  2052  	vpinsrd	xmm6, xmm6, esi, 3
  2053  	vinserti128	ymm5, ymm6, xmm5, 1
  2054  	vpsrlvd	ymm5, ymm5, ymm1
  2055  	vpand	ymm5, ymm5, ymm0
  2056  	vmovdqu	ymmword ptr [r15 - 96], ymm5
  2057  	mov	eax, dword ptr [rbx - 20]
  2058  	mov	ecx, dword ptr [rbx - 24]
  2059  	mov	edx, eax
  2060  	shld	edx, ecx, 6
  2061  	mov	esi, dword ptr [rbx - 32]
  2062  	mov	edi, dword ptr [rbx - 28]
  2063  	vmovd	xmm5, ecx
  2064  	vpinsrd	xmm5, xmm5, ecx, 1
  2065  	shld	ecx, edi, 7
  2066  	shrd	esi, edi, 24
  2067  	vpinsrd	xmm5, xmm5, edx, 2
  2068  	vpinsrd	xmm5, xmm5, eax, 3
  2069  	vmovd	xmm6, esi
  2070  	vpinsrd	xmm6, xmm6, edi, 1
  2071  	vpinsrd	xmm6, xmm6, edi, 2
  2072  	vpinsrd	xmm6, xmm6, ecx, 3
  2073  	vinserti128	ymm5, ymm6, xmm5, 1
  2074  	vpsrlvd	ymm5, ymm5, ymm2
  2075  	vpand	ymm5, ymm5, ymm0
  2076  	vmovdqu	ymmword ptr [r15 - 64], ymm5
  2077  	mov	eax, dword ptr [rbx - 12]
  2078  	mov	ecx, dword ptr [rbx - 8]
  2079  	shld	ecx, eax, 3
  2080  	mov	r9d, dword ptr [rbx - 20]
  2081  	mov	esi, dword ptr [rbx - 16]
  2082  	mov	edi, eax
  2083  	shld	edi, esi, 4
  2084  	mov	edx, esi
  2085  	shld	edx, r9d, 5
  2086  	vmovd	xmm5, edi
  2087  	vpinsrd	xmm5, xmm5, eax, 1
  2088  	vpinsrd	xmm5, xmm5, eax, 2
  2089  	vpinsrd	xmm5, xmm5, ecx, 3
  2090  	vmovd	xmm6, r9d
  2091  	vpinsrd	xmm6, xmm6, edx, 1
  2092  	vpinsrd	xmm6, xmm6, esi, 2
  2093  	vpinsrd	xmm6, xmm6, esi, 3
  2094  	vinserti128	ymm5, ymm6, xmm5, 1
  2095  	vpsrlvd	ymm5, ymm5, ymm3
  2096  	vpand	ymm5, ymm5, ymm0
  2097  	vmovdqu	ymmword ptr [r15 - 32], ymm5
  2098  	mov	eax, dword ptr [rbx]
  2099  	mov	ecx, dword ptr [rbx - 8]
  2100  	mov	edx, dword ptr [rbx - 4]
  2101  	mov	esi, eax
  2102  	shld	esi, edx, 1
  2103  	mov	edi, edx
  2104  	shld	edi, ecx, 2
  2105  	vmovd	xmm5, edx
  2106  	vpinsrd	xmm5, xmm5, esi, 1
  2107  	vpinsrd	xmm5, xmm5, eax, 2
  2108  	vpinsrd	xmm5, xmm5, eax, 3
  2109  	vmovd	xmm6, ecx
  2110  	vpinsrd	xmm6, xmm6, ecx, 1
  2111  	vpinsrd	xmm6, xmm6, edi, 2
  2112  	vpinsrd	xmm6, xmm6, edx, 3
  2113  	vinserti128	ymm5, ymm6, xmm5, 1
  2114  	vpsrlvd	ymm5, ymm5, ymm4
  2115  	vpand	ymm5, ymm5, ymm0
  2116  	vmovdqu	ymmword ptr [r15], ymm5
  2117  	sub	r15, -128
  2118  	add	rbx, 44
  2119  	add	r8, -1
  2120  	jne	.LBB0_36
  2121  	jmp	.LBB0_147
  2122  .LBB0_79:
  2123  	cmp	ecx, 26
  2124  	je	.LBB0_105
  2125  # %bb.80:
  2126  	cmp	ecx, 27
  2127  	jne	.LBB0_147
  2128  # %bb.81:
  2129  	cmp	edx, 32
  2130  	jl	.LBB0_147
  2131  # %bb.82:
  2132  	mov	r8d, r14d
  2133  	add	r15, 96
  2134  	add	rbx, 104
  2135  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_25] # ymm0 = [576460748142673919,576460748142673919,576460748142673919,576460748142673919]
  2136  	vmovdqa	ymm9, ymmword ptr [rip + .LCPI0_24] # ymm9 = [0,0,0,0,0,0,2,0]
  2137  	vmovdqa	xmm10, xmmword ptr [rip + .LCPI0_26] # xmm10 = [24,19,14,9]
  2138  	vmovdqa	xmm11, xmmword ptr [rip + .LCPI0_27] # xmm11 = [8,13,18,23]
  2139  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_28] # ymm4 = [0,0,0,0,4,0,0,0]
  2140  	vmovdqa	xmm5, xmmword ptr [rip + .LCPI0_29] # xmm5 = <16,11,u,u>
  2141  	vmovdqa	xmm6, xmmword ptr [rip + .LCPI0_30] # xmm6 = <16,21,u,u>
  2142  	vmovdqa	ymm7, ymmword ptr [rip + .LCPI0_31] # ymm7 = [0,0,0,1,0,0,0,0]
  2143  	vmovdqa	ymm8, ymmword ptr [rip + .LCPI0_32] # ymm8 = [0,3,0,0,0,0,0,5]
  2144  	.p2align	4, 0x90
  2145  .LBB0_83:                               # =>This Inner Loop Header: Depth=1
  2146  	mov	r10d, dword ptr [rbx - 84]
  2147  	mov	r9d, dword ptr [rbx - 80]
  2148  	shld	r9d, r10d, 3
  2149  	mov	esi, dword ptr [rbx - 88]
  2150  	mov	edi, r10d
  2151  	shld	edi, esi, 25
  2152  	mov	eax, dword ptr [rbx - 92]
  2153  	shld	esi, eax, 20
  2154  	mov	edx, dword ptr [rbx - 96]
  2155  	shld	eax, edx, 15
  2156  	mov	r11d, dword ptr [rbx - 104]
  2157  	mov	ecx, dword ptr [rbx - 100]
  2158  	shld	edx, ecx, 10
  2159  	shld	ecx, r11d, 5
  2160  	vmovd	xmm1, r11d
  2161  	vmovd	xmm2, esi
  2162  	vpinsrd	xmm1, xmm1, ecx, 1
  2163  	vpinsrd	xmm2, xmm2, edi, 1
  2164  	vpinsrd	xmm1, xmm1, edx, 2
  2165  	vpinsrd	xmm2, xmm2, r10d, 2
  2166  	vpinsrd	xmm1, xmm1, eax, 3
  2167  	vpinsrd	xmm2, xmm2, r9d, 3
  2168  	vinserti128	ymm1, ymm1, xmm2, 1
  2169  	vpsrlvd	ymm1, ymm1, ymm9
  2170  	vpand	ymm1, ymm1, ymm0
  2171  	vmovdqu	ymmword ptr [r15 - 96], ymm1
  2172  	mov	eax, dword ptr [rbx - 56]
  2173  	mov	ecx, dword ptr [rbx - 52]
  2174  	shld	ecx, eax, 11
  2175  	mov	edx, dword ptr [rbx - 60]
  2176  	mov	esi, dword ptr [rbx - 64]
  2177  	shld	eax, edx, 6
  2178  	shld	edx, esi, 1
  2179  	vmovdqu	xmm1, xmmword ptr [rbx - 80]
  2180  	vpsrlvd	xmm2, xmm1, xmm10
  2181  	vpshufd	xmm1, xmm1, 249                 # xmm1 = xmm1[1,2,3,3]
  2182  	vmovd	xmm3, esi
  2183  	vpinsrd	xmm1, xmm1, esi, 3
  2184  	vpinsrd	xmm3, xmm3, edx, 1
  2185  	vpinsrd	xmm3, xmm3, eax, 2
  2186  	vpsllvd	xmm1, xmm1, xmm11
  2187  	vpinsrd	xmm3, xmm3, ecx, 3
  2188  	vpor	xmm1, xmm2, xmm1
  2189  	vinserti128	ymm1, ymm1, xmm3, 1
  2190  	vpsrlvd	ymm1, ymm1, ymm4
  2191  	vpand	ymm1, ymm1, ymm0
  2192  	vmovdqu	ymmword ptr [r15 - 64], ymm1
  2193  	mov	eax, dword ptr [rbx - 28]
  2194  	mov	r9d, dword ptr [rbx - 24]
  2195  	shld	r9d, eax, 19
  2196  	mov	edx, dword ptr [rbx - 32]
  2197  	shld	eax, edx, 14
  2198  	mov	esi, dword ptr [rbx - 36]
  2199  	shld	edx, esi, 9
  2200  	mov	r10d, dword ptr [rbx - 44]
  2201  	mov	edi, dword ptr [rbx - 40]
  2202  	shld	esi, edi, 4
  2203  	mov	ecx, edi
  2204  	shld	ecx, r10d, 26
  2205  	vmovq	xmm1, qword ptr [rbx - 52]      # xmm1 = mem[0],zero
  2206  	vpsrlvd	xmm2, xmm1, xmm5
  2207  	vpshufd	xmm1, xmm1, 229                 # xmm1 = xmm1[1,1,2,3]
  2208  	vpinsrd	xmm1, xmm1, r10d, 1
  2209  	vpsllvd	xmm1, xmm1, xmm6
  2210  	vmovd	xmm3, esi
  2211  	vpinsrd	xmm3, xmm3, edx, 1
  2212  	vpor	xmm1, xmm2, xmm1
  2213  	vpinsrd	xmm2, xmm3, eax, 2
  2214  	vpinsrd	xmm2, xmm2, r9d, 3
  2215  	vpinsrd	xmm1, xmm1, ecx, 2
  2216  	vpinsrd	xmm1, xmm1, edi, 3
  2217  	vinserti128	ymm1, ymm1, xmm2, 1
  2218  	vpsrlvd	ymm1, ymm1, ymm7
  2219  	vpand	ymm1, ymm1, ymm0
  2220  	vmovdqu	ymmword ptr [r15 - 32], ymm1
  2221  	mov	r9d, dword ptr [rbx]
  2222  	mov	r11d, dword ptr [rbx - 4]
  2223  	mov	r10d, r9d
  2224  	shld	r10d, r11d, 22
  2225  	mov	esi, dword ptr [rbx - 8]
  2226  	shld	r11d, esi, 17
  2227  	mov	edi, dword ptr [rbx - 12]
  2228  	mov	eax, dword ptr [rbx - 16]
  2229  	shld	esi, edi, 12
  2230  	mov	edx, dword ptr [rbx - 24]
  2231  	mov	ecx, dword ptr [rbx - 20]
  2232  	shld	edi, eax, 7
  2233  	shrd	edx, ecx, 8
  2234  	shld	eax, ecx, 2
  2235  	vmovd	xmm1, esi
  2236  	vpinsrd	xmm1, xmm1, r11d, 1
  2237  	vmovd	xmm2, edx
  2238  	vpinsrd	xmm1, xmm1, r10d, 2
  2239  	vpinsrd	xmm2, xmm2, ecx, 1
  2240  	vpinsrd	xmm1, xmm1, r9d, 3
  2241  	vpinsrd	xmm2, xmm2, eax, 2
  2242  	vpinsrd	xmm2, xmm2, edi, 3
  2243  	vinserti128	ymm1, ymm2, xmm1, 1
  2244  	vpsrlvd	ymm1, ymm1, ymm8
  2245  	vpand	ymm1, ymm1, ymm0
  2246  	vmovdqu	ymmword ptr [r15], ymm1
  2247  	sub	r15, -128
  2248  	add	rbx, 108
  2249  	add	r8, -1
  2250  	jne	.LBB0_83
  2251  	jmp	.LBB0_147
  2252  .LBB0_20:
  2253  	cmp	ecx, 6
  2254  	je	.LBB0_135
  2255  # %bb.21:
  2256  	cmp	ecx, 7
  2257  	jne	.LBB0_147
  2258  # %bb.22:
  2259  	cmp	edx, 32
  2260  	jl	.LBB0_147
  2261  # %bb.23:
  2262  	mov	r8d, r14d
  2263  	add	r15, 96
  2264  	add	rbx, 24
  2265  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_112] # ymm0 = [545460846719,545460846719,545460846719,545460846719]
  2266  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_111] # ymm1 = [0,7,14,21,0,3,10,17]
  2267  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_113] # ymm2 = [24,0,6,13,20,0,2,9]
  2268  	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_114] # ymm3 = [16,23,0,5,12,19,0,1]
  2269  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_115] # ymm4 = [8,15,22,0,4,11,18,25]
  2270  	.p2align	4, 0x90
  2271  .LBB0_24:                               # =>This Inner Loop Header: Depth=1
  2272  	mov	ecx, dword ptr [rbx - 24]
  2273  	mov	edx, dword ptr [rbx - 20]
  2274  	mov	esi, edx
  2275  	shld	esi, ecx, 4
  2276  	vmovd	xmm5, ecx
  2277  	vmovd	xmm6, esi
  2278  	vpinsrd	xmm6, xmm6, edx, 1
  2279  	vpinsrd	xmm6, xmm6, edx, 2
  2280  	vpinsrd	xmm6, xmm6, edx, 3
  2281  	vpbroadcastd	xmm5, xmm5
  2282  	vinserti128	ymm5, ymm5, xmm6, 1
  2283  	vpsrlvd	ymm5, ymm5, ymm1
  2284  	vpand	ymm5, ymm5, ymm0
  2285  	vmovdqu	ymmword ptr [r15 - 96], ymm5
  2286  	mov	ecx, dword ptr [rbx - 12]
  2287  	mov	edx, dword ptr [rbx - 20]
  2288  	mov	esi, dword ptr [rbx - 16]
  2289  	mov	edi, ecx
  2290  	shld	edi, esi, 5
  2291  	mov	eax, esi
  2292  	shld	eax, edx, 1
  2293  	vmovd	xmm5, esi
  2294  	vpinsrd	xmm5, xmm5, edi, 1
  2295  	vpinsrd	xmm5, xmm5, ecx, 2
  2296  	vpinsrd	xmm5, xmm5, ecx, 3
  2297  	vmovd	xmm6, edx
  2298  	vpinsrd	xmm6, xmm6, eax, 1
  2299  	vpinsrd	xmm6, xmm6, esi, 2
  2300  	vpinsrd	xmm6, xmm6, esi, 3
  2301  	vinserti128	ymm5, ymm6, xmm5, 1
  2302  	vpsrlvd	ymm5, ymm5, ymm2
  2303  	vpand	ymm5, ymm5, ymm0
  2304  	vmovdqu	ymmword ptr [r15 - 64], ymm5
  2305  	mov	eax, dword ptr [rbx - 4]
  2306  	mov	ecx, dword ptr [rbx - 12]
  2307  	mov	edx, dword ptr [rbx - 8]
  2308  	mov	esi, eax
  2309  	shld	esi, edx, 6
  2310  	mov	edi, edx
  2311  	shld	edi, ecx, 2
  2312  	vmovd	xmm5, edx
  2313  	vpinsrd	xmm5, xmm5, edx, 1
  2314  	vpinsrd	xmm5, xmm5, esi, 2
  2315  	vpinsrd	xmm5, xmm5, eax, 3
  2316  	vmovd	xmm6, ecx
  2317  	vpinsrd	xmm6, xmm6, ecx, 1
  2318  	vpinsrd	xmm6, xmm6, edi, 2
  2319  	vpinsrd	xmm6, xmm6, edx, 3
  2320  	vinserti128	ymm5, ymm6, xmm5, 1
  2321  	vpsrlvd	ymm5, ymm5, ymm3
  2322  	vpand	ymm5, ymm5, ymm0
  2323  	vmovdqu	ymmword ptr [r15 - 32], ymm5
  2324  	mov	eax, dword ptr [rbx - 4]
  2325  	mov	ecx, dword ptr [rbx]
  2326  	mov	edx, ecx
  2327  	shld	edx, eax, 3
  2328  	vmovd	xmm5, ecx
  2329  	vmovd	xmm6, eax
  2330  	vpinsrd	xmm6, xmm6, eax, 1
  2331  	vpinsrd	xmm6, xmm6, eax, 2
  2332  	vpinsrd	xmm6, xmm6, edx, 3
  2333  	vpbroadcastd	xmm5, xmm5
  2334  	vinserti128	ymm5, ymm6, xmm5, 1
  2335  	vpsrlvd	ymm5, ymm5, ymm4
  2336  	vpand	ymm5, ymm5, ymm0
  2337  	vmovdqu	ymmword ptr [r15], ymm5
  2338  	sub	r15, -128
  2339  	add	rbx, 28
  2340  	add	r8, -1
  2341  	jne	.LBB0_24
  2342  	jmp	.LBB0_147
  2343  .LBB0_67:
  2344  	cmp	ecx, 22
  2345  	je	.LBB0_111
  2346  # %bb.68:
  2347  	cmp	ecx, 23
  2348  	jne	.LBB0_147
  2349  # %bb.69:
  2350  	cmp	edx, 32
  2351  	jl	.LBB0_147
  2352  # %bb.70:
  2353  	mov	r8d, r14d
  2354  	add	r15, 96
  2355  	add	rbx, 88
  2356  	vmovdqa	ymm8, ymmword ptr [rip + .LCPI0_48] # ymm8 = [0,0,0,5,0,0,0,1]
  2357  	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_49] # ymm1 = [36028792732385279,36028792732385279,36028792732385279,36028792732385279]
  2358  	vmovdqa	xmm2, xmmword ptr [rip + .LCPI0_50] # xmm2 = <24,15,u,u>
  2359  	vmovdqa	xmm3, xmmword ptr [rip + .LCPI0_51] # xmm3 = <8,17,u,u>
  2360  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_52] # ymm4 = [0,0,6,0,0,0,2,0]
  2361  	vmovdqa	ymm5, ymmword ptr [rip + .LCPI0_53] # ymm5 = [0,7,0,0,0,3,0,0]
  2362  	vmovdqa	ymm6, ymmword ptr [rip + .LCPI0_54] # ymm6 = [8,0,0,0,4,0,0,9]
  2363  	.p2align	4, 0x90
  2364  .LBB0_71:                               # =>This Inner Loop Header: Depth=1
  2365  	mov	r9d, dword ptr [rbx - 68]
  2366  	mov	edx, dword ptr [rbx - 72]
  2367  	mov	r11d, r9d
  2368  	shld	r11d, edx, 22
  2369  	mov	edi, dword ptr [rbx - 76]
  2370  	shld	edx, edi, 13
  2371  	mov	esi, dword ptr [rbx - 80]
  2372  	shld	edi, esi, 4
  2373  	mov	r10d, dword ptr [rbx - 88]
  2374  	mov	ecx, dword ptr [rbx - 84]
  2375  	mov	eax, esi
  2376  	shld	eax, ecx, 18
  2377  	shld	ecx, r10d, 9
  2378  	vmovd	xmm7, r10d
  2379  	vmovd	xmm0, edi
  2380  	vpinsrd	xmm7, xmm7, ecx, 1
  2381  	vpinsrd	xmm0, xmm0, edx, 1
  2382  	vpinsrd	xmm7, xmm7, eax, 2
  2383  	vpinsrd	xmm0, xmm0, r11d, 2
  2384  	vpinsrd	xmm7, xmm7, esi, 3
  2385  	vpinsrd	xmm0, xmm0, r9d, 3
  2386  	vinserti128	ymm0, ymm7, xmm0, 1
  2387  	vpsrlvd	ymm0, ymm0, ymm8
  2388  	vpand	ymm0, ymm0, ymm1
  2389  	vmovdqu	ymmword ptr [r15 - 96], ymm0
  2390  	mov	eax, dword ptr [rbx - 48]
  2391  	mov	r9d, dword ptr [rbx - 44]
  2392  	shld	r9d, eax, 7
  2393  	mov	edx, dword ptr [rbx - 52]
  2394  	mov	esi, eax
  2395  	shld	esi, edx, 21
  2396  	mov	edi, dword ptr [rbx - 60]
  2397  	mov	ecx, dword ptr [rbx - 56]
  2398  	shld	edx, ecx, 12
  2399  	shld	ecx, edi, 3
  2400  	vmovq	xmm0, qword ptr [rbx - 68]      # xmm0 = mem[0],zero
  2401  	vpsrlvd	xmm7, xmm0, xmm2
  2402  	vpshufd	xmm0, xmm0, 229                 # xmm0 = xmm0[1,1,2,3]
  2403  	vpinsrd	xmm0, xmm0, edi, 1
  2404  	vpsllvd	xmm0, xmm0, xmm3
  2405  	vpor	xmm0, xmm7, xmm0
  2406  	vmovd	xmm7, edx
  2407  	vpinsrd	xmm7, xmm7, esi, 1
  2408  	vpinsrd	xmm7, xmm7, eax, 2
  2409  	vpinsrd	xmm7, xmm7, r9d, 3
  2410  	vpinsrd	xmm0, xmm0, edi, 2
  2411  	vpinsrd	xmm0, xmm0, ecx, 3
  2412  	vinserti128	ymm0, ymm0, xmm7, 1
  2413  	vpsrlvd	ymm0, ymm0, ymm4
  2414  	vpand	ymm0, ymm0, ymm1
  2415  	vmovdqu	ymmword ptr [r15 - 64], ymm0
  2416  	mov	r11d, dword ptr [rbx - 24]
  2417  	mov	r9d, dword ptr [rbx - 20]
  2418  	shld	r9d, r11d, 15
  2419  	mov	r10d, dword ptr [rbx - 28]
  2420  	shld	r11d, r10d, 6
  2421  	mov	esi, dword ptr [rbx - 32]
  2422  	mov	edi, r10d
  2423  	mov	ecx, dword ptr [rbx - 36]
  2424  	shld	edi, esi, 20
  2425  	mov	edx, dword ptr [rbx - 44]
  2426  	mov	eax, dword ptr [rbx - 40]
  2427  	shld	esi, ecx, 11
  2428  	shrd	edx, eax, 16
  2429  	shld	ecx, eax, 2
  2430  	vmovd	xmm0, edi
  2431  	vpinsrd	xmm0, xmm0, r10d, 1
  2432  	vmovd	xmm7, edx
  2433  	vpinsrd	xmm0, xmm0, r11d, 2
  2434  	vpinsrd	xmm7, xmm7, eax, 1
  2435  	vpinsrd	xmm0, xmm0, r9d, 3
  2436  	vpinsrd	xmm7, xmm7, ecx, 2
  2437  	vpinsrd	xmm7, xmm7, esi, 3
  2438  	vinserti128	ymm0, ymm7, xmm0, 1
  2439  	vpsrlvd	ymm0, ymm0, ymm5
  2440  	vpand	ymm0, ymm0, ymm1
  2441  	vmovdqu	ymmword ptr [r15 - 32], ymm0
  2442  	mov	r9d, dword ptr [rbx]
  2443  	mov	ecx, dword ptr [rbx - 4]
  2444  	mov	edx, r9d
  2445  	shld	edx, ecx, 14
  2446  	mov	esi, dword ptr [rbx - 8]
  2447  	shld	ecx, esi, 5
  2448  	mov	edi, dword ptr [rbx - 12]
  2449  	vmovd	xmm0, esi
  2450  	shld	esi, edi, 19
  2451  	mov	r10d, dword ptr [rbx - 20]
  2452  	mov	eax, dword ptr [rbx - 16]
  2453  	shld	edi, eax, 10
  2454  	shld	eax, r10d, 1
  2455  	vpinsrd	xmm0, xmm0, ecx, 1
  2456  	vmovd	xmm7, r10d
  2457  	vpinsrd	xmm0, xmm0, edx, 2
  2458  	vpinsrd	xmm7, xmm7, eax, 1
  2459  	vpinsrd	xmm0, xmm0, r9d, 3
  2460  	vpinsrd	xmm7, xmm7, edi, 2
  2461  	vpinsrd	xmm7, xmm7, esi, 3
  2462  	vinserti128	ymm0, ymm7, xmm0, 1
  2463  	vpsrlvd	ymm0, ymm0, ymm6
  2464  	vpand	ymm0, ymm0, ymm1
  2465  	vmovdqu	ymmword ptr [r15], ymm0
  2466  	sub	r15, -128
  2467  	add	rbx, 92
  2468  	add	r8, -1
  2469  	jne	.LBB0_71
  2470  	jmp	.LBB0_147
  2471  .LBB0_43:
        # Dispatch tail followed by the 15-bit unpack kernel.
        # ecx is compared against 14 and 15 (presumably the packed bit width; the
        # prologue that sets it is outside this excerpt -- TODO confirm). 14 branches to
        # .LBB0_123, any value other than 15 branches to .LBB0_147.
  2472  	cmp	ecx, 14
  2473  	je	.LBB0_123
  2474  # %bb.44:
  2475  	cmp	ecx, 15
  2476  	jne	.LBB0_147
  2477  # %bb.45:
        # Nothing to do when edx < 32 (signed compare); edx is presumably the value
        # count -- TODO confirm against the prologue.
  2478  	cmp	edx, 32
  2479  	jl	.LBB0_147
  2480  # %bb.46:
        # r8 = loop trip count taken from r14d (the 32-bit move zero-extends).
        # r15 (store base) and rbx (load base) are pre-biased by 96 and 56 so the loop
        # body addresses stores as [r15 - 96 .. r15] and loads as [rbx - 56 .. rbx].
        # ymm0 = per-dword mask 0x7fff; ymm1..ymm4 = per-lane right-shift counts.
  2481  	mov	r8d, r14d
  2482  	add	r15, 96
  2483  	add	rbx, 56
  2484  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_82] # ymm0 = [140733193420799,140733193420799,140733193420799,140733193420799]
  2485  	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_81] # ymm1 = [0,15,0,13,0,11,0,9]
  2486  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_83] # ymm2 = [0,7,0,5,0,3,0,1]
  2487  	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_84] # ymm3 = [16,0,14,0,12,0,10,0]
  2488  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_85] # ymm4 = [8,0,6,0,4,0,2,17]
  2489  	.p2align	4, 0x90
  2490  .LBB0_47:                               # =>This Inner Loop Header: Depth=1
        # Each iteration reads 15 input dwords (60 bytes) and stores 32 output dwords
        # (128 bytes), each masked to its low 15 bits. shld/shrd splice bits from two
        # neighbouring input dwords into one register; vpsrlvd then applies a per-lane
        # right shift and vpand applies the mask.
        #
        # Output dwords 0-7 -> [r15 - 96], built from input dwords at [rbx - 56 .. rbx - 44].
  2491  	mov	r9d, dword ptr [rbx - 44]
  2492  	mov	eax, dword ptr [rbx - 48]
  2493  	mov	esi, r9d
  2494  	shld	esi, eax, 6
  2495  	mov	r10d, dword ptr [rbx - 52]
  2496  	mov	edx, eax
  2497  	shld	edx, r10d, 4
  2498  	mov	ecx, dword ptr [rbx - 56]
  2499  	mov	edi, r10d
  2500  	shld	edi, ecx, 2
  2501  	vmovd	xmm5, edx
  2502  	vpinsrd	xmm5, xmm5, eax, 1
  2503  	vpinsrd	xmm5, xmm5, esi, 2
  2504  	vpinsrd	xmm5, xmm5, r9d, 3
  2505  	vmovd	xmm6, ecx
  2506  	vpinsrd	xmm6, xmm6, ecx, 1
  2507  	vpinsrd	xmm6, xmm6, edi, 2
  2508  	vpinsrd	xmm6, xmm6, r10d, 3
  2509  	vinserti128	ymm5, ymm6, xmm5, 1
  2510  	vpsrlvd	ymm5, ymm5, ymm1
  2511  	vpand	ymm5, ymm5, ymm0
  2512  	vmovdqu	ymmword ptr [r15 - 96], ymm5
        # Output dwords 8-15 -> [r15 - 64], built from input dwords at [rbx - 44 .. rbx - 28].
  2513  	mov	r9d, dword ptr [rbx - 28]
  2514  	mov	r11d, dword ptr [rbx - 32]
  2515  	mov	edx, r9d
  2516  	shld	edx, r11d, 14
  2517  	mov	r10d, dword ptr [rbx - 36]
  2518  	mov	edi, r11d
  2519  	shld	edi, r10d, 12
  2520  	mov	eax, dword ptr [rbx - 44]
  2521  	mov	esi, dword ptr [rbx - 40]
  2522  	mov	ecx, r10d
  2523  	shld	ecx, esi, 10
  2524  	shrd	eax, esi, 24
  2525  	vmovd	xmm5, edi
  2526  	vpinsrd	xmm5, xmm5, r11d, 1
  2527  	vpinsrd	xmm5, xmm5, edx, 2
  2528  	vpinsrd	xmm5, xmm5, r9d, 3
  2529  	vmovd	xmm6, eax
  2530  	vpinsrd	xmm6, xmm6, esi, 1
  2531  	vpinsrd	xmm6, xmm6, ecx, 2
  2532  	vpinsrd	xmm6, xmm6, r10d, 3
  2533  	vinserti128	ymm5, ymm6, xmm5, 1
  2534  	vpsrlvd	ymm5, ymm5, ymm2
  2535  	vpand	ymm5, ymm5, ymm0
  2536  	vmovdqu	ymmword ptr [r15 - 64], ymm5
        # Output dwords 16-23 -> [r15 - 32], built from input dwords at [rbx - 28 .. rbx - 12].
  2537  	mov	eax, dword ptr [rbx - 16]
  2538  	mov	r10d, dword ptr [rbx - 12]
  2539  	shld	r10d, eax, 7
  2540  	mov	edx, dword ptr [rbx - 20]
  2541  	mov	esi, eax
  2542  	shld	esi, edx, 5
  2543  	mov	r9d, dword ptr [rbx - 28]
  2544  	mov	ecx, dword ptr [rbx - 24]
  2545  	mov	edi, ecx
  2546  	shld	edi, r9d, 1
        # xmm5 lane 0 captures edx before the following shld overwrites it.
  2547  	vmovd	xmm5, edx
  2548  	shld	edx, ecx, 3
  2549  	vpinsrd	xmm5, xmm5, esi, 1
  2550  	vpinsrd	xmm5, xmm5, eax, 2
  2551  	vpinsrd	xmm5, xmm5, r10d, 3
  2552  	vmovd	xmm6, r9d
  2553  	vpinsrd	xmm6, xmm6, edi, 1
  2554  	vpinsrd	xmm6, xmm6, ecx, 2
  2555  	vpinsrd	xmm6, xmm6, edx, 3
  2556  	vinserti128	ymm5, ymm6, xmm5, 1
  2557  	vpsrlvd	ymm5, ymm5, ymm3
  2558  	vpand	ymm5, ymm5, ymm0
  2559  	vmovdqu	ymmword ptr [r15 - 32], ymm5
        # Output dwords 24-31 -> [r15], built from input dwords at [rbx - 12 .. rbx].
  2560  	mov	r9d, dword ptr [rbx]
  2561  	mov	ecx, dword ptr [rbx - 4]
  2562  	mov	edx, r9d
  2563  	shld	edx, ecx, 13
  2564  	mov	eax, dword ptr [rbx - 8]
        # xmm5 lane 0 captures ecx before the following shld overwrites it.
  2565  	vmovd	xmm5, ecx
  2566  	shld	ecx, eax, 11
  2567  	mov	edi, dword ptr [rbx - 12]
  2568  	mov	esi, eax
  2569  	shld	esi, edi, 9
  2570  	vmovd	xmm6, edi
  2571  	vpinsrd	xmm6, xmm6, esi, 1
  2572  	vpinsrd	xmm6, xmm6, eax, 2
  2573  	vpinsrd	xmm6, xmm6, ecx, 3
  2574  	vpinsrd	xmm5, xmm5, edx, 1
  2575  	vpinsrd	xmm5, xmm5, r9d, 2
  2576  	vpinsrd	xmm5, xmm5, r9d, 3
  2577  	vinserti128	ymm5, ymm6, xmm5, 1
  2578  	vpsrlvd	ymm5, ymm5, ymm4
  2579  	vpand	ymm5, ymm5, ymm0
  2580  	vmovdqu	ymmword ptr [r15], ymm5
        # Advance: output += 128 bytes (sub of -128), input += 60 bytes, one fewer trip.
  2581  	sub	r15, -128
  2582  	add	rbx, 60
  2583  	add	r8, -1
  2584  	jne	.LBB0_47
  2585  	jmp	.LBB0_147
  2586  .LBB0_96:
        # 31-bit unpack kernel: every output dword is masked with 0x7fffffff (ymm0).
        # Skipped entirely when edx < 32 (signed compare); edx is presumably the value
        # count -- TODO confirm against the prologue, which is outside this excerpt.
  2587  	cmp	edx, 32
  2588  	jl	.LBB0_147
  2589  # %bb.97:
        # r8 = loop trip count from r14d. Only the store base r15 is pre-biased (by 96);
        # loads use positive offsets from rbx. ymm4..ymm10 hold per-lane shift counts.
  2590  	mov	r8d, r14d
  2591  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_0] # ymm0 = [9223372034707292159,9223372034707292159,9223372034707292159,9223372034707292159]
  2592  	add	r15, 96
  2593  	vmovdqa	ymm8, ymmword ptr [rip + .LCPI0_1] # ymm8 = [24,23,22,21,20,19,18,17]
  2594  	vmovdqa	ymm9, ymmword ptr [rip + .LCPI0_2] # ymm9 = [8,9,10,11,12,13,14,15]
  2595  	vmovdqa	ymm10, ymmword ptr [rip + .LCPI0_3] # ymm10 = [16,15,14,13,12,11,10,9]
  2596  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_4] # ymm4 = [16,17,18,19,20,21,22,23]
  2597  	vmovdqa	xmm5, xmmword ptr [rip + .LCPI0_5] # xmm5 = [8,7,6,5]
  2598  	vmovdqa	xmm6, xmmword ptr [rip + .LCPI0_6] # xmm6 = [24,25,26,27]
  2599  	vmovdqa	ymm7, ymmword ptr [rip + .LCPI0_7] # ymm7 = [0,0,0,0,0,0,0,1]
  2600  	.p2align	4, 0x90
  2601  .LBB0_98:                               # =>This Inner Loop Header: Depth=1
        # Each iteration reads 31 input dwords (124 bytes) and stores 32 output dwords.
        #
        # Output dwords 0-7 -> [r15 - 96]: a scalar shld chain over input dwords 0-7
        # (shift counts 1..7) leaves each field at bit 0, so only the mask is applied.
  2602  	mov	r10d, dword ptr [rbx + 24]
  2603  	mov	r9d, dword ptr [rbx + 28]
  2604  	shld	r9d, r10d, 7
  2605  	mov	esi, dword ptr [rbx + 20]
  2606  	shld	r10d, esi, 6
  2607  	mov	edi, dword ptr [rbx + 16]
  2608  	shld	esi, edi, 5
  2609  	mov	eax, dword ptr [rbx + 12]
  2610  	shld	edi, eax, 4
  2611  	mov	edx, dword ptr [rbx + 8]
  2612  	shld	eax, edx, 3
  2613  	mov	ecx, dword ptr [rbx + 4]
  2614  	shld	edx, ecx, 2
  2615  	mov	r11d, dword ptr [rbx]
  2616  	shld	ecx, r11d, 1
  2617  	vmovd	xmm1, edi
  2618  	vpinsrd	xmm1, xmm1, esi, 1
  2619  	vpinsrd	xmm1, xmm1, r10d, 2
  2620  	vpinsrd	xmm1, xmm1, r9d, 3
  2621  	vmovd	xmm2, r11d
  2622  	vpinsrd	xmm2, xmm2, ecx, 1
  2623  	vpinsrd	xmm2, xmm2, edx, 2
  2624  	vpinsrd	xmm2, xmm2, eax, 3
  2625  	vinserti128	ymm1, ymm2, xmm1, 1
  2626  	vpand	ymm1, ymm1, ymm0
  2627  	vmovdqu	ymmword ptr [r15 - 96], ymm1
        # Output dwords 8-15 -> [r15 - 64]: fully vectorised. ymm1 = the 8 dwords at
        # [rbx + 28] shifted right by ymm8; ymm2 = the 8 dwords at [rbx + 32]
        # (assembled with vpalignr/vpshufd/vpinsrd) shifted left by ymm9. The paired
        # shift counts sum to 32 per lane; OR merges the halves, then the mask is applied.
  2628  	vmovdqu	ymm1, ymmword ptr [rbx + 28]
  2629  	vpsrlvd	ymm1, ymm1, ymm8
  2630  	vmovdqu	xmm2, xmmword ptr [rbx + 44]
  2631  	vpshufd	xmm3, xmm2, 249                 # xmm3 = xmm2[1,2,3,3]
  2632  	vpinsrd	xmm3, xmm3, dword ptr [rbx + 60], 3
  2633  	vpalignr	xmm2, xmm2, xmmword ptr [rbx + 28], 4 # xmm2 = mem[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
  2634  	vinserti128	ymm2, ymm2, xmm3, 1
  2635  	vpsllvd	ymm2, ymm2, ymm9
  2636  	vpor	ymm1, ymm1, ymm2
  2637  	vpand	ymm1, ymm1, ymm0
  2638  	vmovdqu	ymmword ptr [r15 - 64], ymm1
        # Output dwords 16-23 -> [r15 - 32]: same scheme on the dwords at [rbx + 60]
        # and [rbx + 64], with shift tables ymm10 (right) and ymm4 (left).
  2639  	vmovdqu	ymm1, ymmword ptr [rbx + 60]
  2640  	vmovdqu	xmm2, xmmword ptr [rbx + 76]
  2641  	vpshufd	xmm3, xmm2, 249                 # xmm3 = xmm2[1,2,3,3]
  2642  	vpinsrd	xmm3, xmm3, dword ptr [rbx + 92], 3
  2643  	vpsrlvd	ymm1, ymm1, ymm10
  2644  	vpalignr	xmm2, xmm2, xmmword ptr [rbx + 60], 4 # xmm2 = mem[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
  2645  	vinserti128	ymm2, ymm2, xmm3, 1
  2646  	vpsllvd	ymm2, ymm2, ymm4
  2647  	vpor	ymm1, ymm1, ymm2
  2648  	vpand	ymm1, ymm1, ymm0
  2649  	vmovdqu	ymmword ptr [r15 - 32], ymm1
        # Output dwords 24-31 -> [r15]: the low four lanes use the vector shift/OR
        # scheme (xmm5 right, xmm6 left); the high four lanes come from a scalar shld
        # chain over the dwords at [rbx + 108 .. rbx + 120]. The final vpsrlvd by ymm7
        # shifts only the last lane (by 1).
  2650  	mov	eax, dword ptr [rbx + 120]
  2651  	mov	ecx, dword ptr [rbx + 116]
  2652  	mov	edx, eax
  2653  	shld	edx, ecx, 30
  2654  	mov	esi, dword ptr [rbx + 112]
  2655  	shld	ecx, esi, 29
  2656  	mov	edi, dword ptr [rbx + 108]
  2657  	shld	esi, edi, 28
  2658  	vmovdqu	xmm1, xmmword ptr [rbx + 92]
  2659  	vpsrlvd	xmm2, xmm1, xmm5
  2660  	vpshufd	xmm1, xmm1, 249                 # xmm1 = xmm1[1,2,3,3]
  2661  	vpinsrd	xmm1, xmm1, edi, 3
  2662  	vpsllvd	xmm1, xmm1, xmm6
  2663  	vmovd	xmm3, esi
  2664  	vpinsrd	xmm3, xmm3, ecx, 1
  2665  	vpinsrd	xmm3, xmm3, edx, 2
  2666  	vpinsrd	xmm3, xmm3, eax, 3
  2667  	vpor	xmm1, xmm2, xmm1
  2668  	vinserti128	ymm1, ymm1, xmm3, 1
  2669  	vpsrlvd	ymm1, ymm1, ymm7
  2670  	vpand	ymm1, ymm1, ymm0
  2671  	vmovdqu	ymmword ptr [r15], ymm1
        # Advance: input += 124 bytes, output += 128 bytes (sub of -128), one fewer trip.
  2672  	add	rbx, 124
  2673  	sub	r15, -128
  2674  	add	r8, -1
  2675  	jne	.LBB0_98
  2676  	jmp	.LBB0_147
  2677  .LBB0_144:
        # Zero-fill path: writes zeros to the output and never reads the input
        # (presumably the bit-width-0 case -- TODO confirm against the dispatch code,
        # which is outside this excerpt). Skipped when edx < 32 (signed compare).
  2678  	cmp	edx, 32
  2679  	jl	.LBB0_147
  2680  # %bb.145:
        # rbx is reused as the trip counter (loaded from r14d, zero-extended).
  2681  	mov	ebx, r14d
  2682  	.p2align	4, 0x90
  2683  .LBB0_146:                              # =>This Inner Loop Header: Depth=1
        # memset(r15, 0, 128): rdi = destination, esi = fill byte, edx = length.
        # The loop state lives in rbx and r15, which are callee-saved under the
        # System V AMD64 convention (assumed ABI here -- TODO confirm).
  2684  	mov	edx, 128
  2685  	mov	rdi, r15
  2686  	xor	esi, esi
  2687  	call	clib·_memset(SB)
        # Advance output by 128 bytes (sub of -128) and count down.
  2688  	sub	r15, -128
  2689  	add	rbx, -1
  2690  	jne	.LBB0_146
  2691  	jmp	.LBB0_147
  2692  .LBB0_120:
        # 16-bit unpack kernel: each input dword yields two output dwords, its low
        # and its high 16-bit half, each zero-extended to 32 bits.
        # Skipped when edx < 32 (signed compare); edx is presumably the value count
        # -- TODO confirm against the prologue, which is outside this excerpt.
  2693  	cmp	edx, 32
  2694  	jl	.LBB0_147
  2695  # %bb.121:
        # rax = trip count from r14d; rcx = running input byte offset (output uses 2*rcx).
        # ymm0 = shift counts [0,16] repeated per dword pair (qword 0x1000000000).
        # ymm1 = zero, used by vpblendw to clear the upper 16 bits of every dword.
  2696  	mov	eax, r14d
  2697  	xor	ecx, ecx
  2698  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_80] # ymm0 = [68719476736,68719476736,68719476736,68719476736]
  2699  	vpxor	xmm1, xmm1, xmm1
  2700  	.p2align	4, 0x90
  2701  .LBB0_122:                              # =>This Inner Loop Header: Depth=1
        # Each iteration reads 64 bytes and stores 128 bytes, in four identical steps.
        # Per step: load 4 dwords; vpermq + vpshufd duplicate each dword into two
        # adjacent lanes; vpsrlvd shifts the second copy right by 16; vpblendw (mask
        # 170 = odd words from ymm1) zeroes the high half of every lane.
  2702  	vmovdqu	xmm2, xmmword ptr [rbx + rcx]
  2703  	vpermq	ymm2, ymm2, 216                 # ymm2 = ymm2[0,2,1,3]
  2704  	vpshufd	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1,4,4,5,5]
  2705  	vpsrlvd	ymm2, ymm2, ymm0
  2706  	vpblendw	ymm2, ymm2, ymm1, 170           # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
  2707  	vmovdqu	ymmword ptr [r15 + 2*rcx], ymm2
  2708  	vmovdqu	xmm2, xmmword ptr [rbx + rcx + 16]
  2709  	vpermq	ymm2, ymm2, 216                 # ymm2 = ymm2[0,2,1,3]
  2710  	vpshufd	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1,4,4,5,5]
  2711  	vpsrlvd	ymm2, ymm2, ymm0
  2712  	vpblendw	ymm2, ymm2, ymm1, 170           # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
  2713  	vmovdqu	ymmword ptr [r15 + 2*rcx + 32], ymm2
  2714  	vmovdqu	xmm2, xmmword ptr [rbx + rcx + 32]
  2715  	vpermq	ymm2, ymm2, 216                 # ymm2 = ymm2[0,2,1,3]
  2716  	vpshufd	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1,4,4,5,5]
  2717  	vpsrlvd	ymm2, ymm2, ymm0
  2718  	vpblendw	ymm2, ymm2, ymm1, 170           # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
  2719  	vmovdqu	ymmword ptr [r15 + 2*rcx + 64], ymm2
  2720  	vmovdqu	xmm2, xmmword ptr [rbx + rcx + 48]
  2721  	vpermq	ymm2, ymm2, 216                 # ymm2 = ymm2[0,2,1,3]
  2722  	vpshufd	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1,4,4,5,5]
  2723  	vpsrlvd	ymm2, ymm2, ymm0
  2724  	vpblendw	ymm2, ymm2, ymm1, 170           # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
  2725  	vmovdqu	ymmword ptr [r15 + 2*rcx + 96], ymm2
        # Advance the input offset by 64 bytes (output offset follows as 2*rcx).
  2726  	add	rcx, 64
  2727  	add	rax, -1
  2728  	jne	.LBB0_122
  2729  	jmp	.LBB0_147
  2730  .LBB0_132:
        # 8-bit unpack kernel: each input byte becomes one zero-extended output dword.
        # Skipped when edx < 32 (signed compare); edx is presumably the value count
        # -- TODO confirm against the prologue, which is outside this excerpt.
  2731  	cmp	edx, 32
  2732  	jl	.LBB0_147
  2733  # %bb.133:
        # rax = trip count from r14d; rcx = running input byte offset (output uses 4*rcx).
        # ymm0 = per-lane right-shift counts, ymm1 = per-dword mask 0xff.
  2734  	mov	eax, r14d
  2735  	xor	ecx, ecx
  2736  	vbroadcasti128	ymm0, xmmword ptr [rip + .LCPI0_109] # ymm0 = [0,8,16,24,0,8,16,24]
  2737                                          # ymm0 = mem[0,1,0,1]
  2738  	vpbroadcastd	ymm1, dword ptr [rip + .LCPI0_110] # ymm1 = [255,255,255,255,255,255,255,255]
  2739  	.p2align	4, 0x90
  2740  .LBB0_134:                              # =>This Inner Loop Header: Depth=1
        # Each iteration reads 32 bytes and stores 128 bytes, in four identical steps.
        # Per step: load 2 dwords; vpshufd + vpermq replicate the first dword across
        # lanes 0-3 and the second across lanes 4-7; vpsrlvd moves byte k of each
        # dword to bit 0 of lane k; vpand keeps the low 8 bits.
  2741  	vmovq	xmm2, qword ptr [rbx + rcx]     # xmm2 = mem[0],zero
  2742  	vpshufd	xmm2, xmm2, 80                  # xmm2 = xmm2[0,0,1,1]
  2743  	vpermq	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1]
  2744  	vpsrlvd	ymm2, ymm2, ymm0
  2745  	vpand	ymm2, ymm2, ymm1
  2746  	vmovdqu	ymmword ptr [r15 + 4*rcx], ymm2
  2747  	vmovq	xmm2, qword ptr [rbx + rcx + 8] # xmm2 = mem[0],zero
  2748  	vpshufd	xmm2, xmm2, 80                  # xmm2 = xmm2[0,0,1,1]
  2749  	vpermq	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1]
  2750  	vpsrlvd	ymm2, ymm2, ymm0
  2751  	vpand	ymm2, ymm2, ymm1
  2752  	vmovdqu	ymmword ptr [r15 + 4*rcx + 32], ymm2
  2753  	vmovq	xmm2, qword ptr [rbx + rcx + 16] # xmm2 = mem[0],zero
  2754  	vpshufd	xmm2, xmm2, 80                  # xmm2 = xmm2[0,0,1,1]
  2755  	vpermq	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1]
  2756  	vpsrlvd	ymm2, ymm2, ymm0
  2757  	vpand	ymm2, ymm2, ymm1
  2758  	vmovdqu	ymmword ptr [r15 + 4*rcx + 64], ymm2
  2759  	vmovq	xmm2, qword ptr [rbx + rcx + 24] # xmm2 = mem[0],zero
  2760  	vpshufd	xmm2, xmm2, 80                  # xmm2 = xmm2[0,0,1,1]
  2761  	vpermq	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1]
  2762  	vpsrlvd	ymm2, ymm2, ymm0
  2763  	vpand	ymm2, ymm2, ymm1
  2764  	vmovdqu	ymmword ptr [r15 + 4*rcx + 96], ymm2
        # Advance the input offset by 32 bytes (output offset follows as 4*rcx).
  2765  	add	rcx, 32
  2766  	add	rax, -1
  2767  	jne	.LBB0_134
  2768  	jmp	.LBB0_147
  2769  .LBB0_108:
        # 24-bit unpack kernel: every output dword is masked with 0xffffff (ymm1).
        # Skipped when edx < 32 (signed compare); edx is presumably the value count
        # -- TODO confirm against the prologue, which is outside this excerpt.
  2770  	cmp	edx, 32
  2771  	jl	.LBB0_147
  2772  # %bb.109:
        # r8 = loop trip count from r14d. r15 (store base) and rbx (load base) are
        # pre-biased by 96 and 92, so the loop body uses non-positive offsets.
        # ymm0 = per-lane right-shift counts, identical for all four output groups.
  2773  	mov	r8d, r14d
  2774  	add	r15, 96
  2775  	add	rbx, 92
  2776  	vbroadcasti128	ymm0, xmmword ptr [rip + .LCPI0_46] # ymm0 = [0,0,0,8,0,0,0,8]
  2777                                          # ymm0 = mem[0,1,0,1]
  2778  	vpbroadcastd	ymm1, dword ptr [rip + .LCPI0_47] # ymm1 = [16777215,16777215,16777215,16777215,16777215,16777215,16777215,16777215]
  2779  	.p2align	4, 0x90
  2780  .LBB0_110:                              # =>This Inner Loop Header: Depth=1
        # Each iteration reads 24 input dwords (96 bytes) and stores 32 output dwords
        # (128 bytes). The same pattern repeats four times, six input dwords per
        # group: shld by 8 or 16 splices neighbouring dwords, vpsrlvd shifts lanes 3
        # and 7 right by 8, and vpand applies the mask.
        #
        # Output dwords 0-7 -> [r15 - 96], from input dwords at [rbx - 92 .. rbx - 72].
  2781  	mov	r9d, dword ptr [rbx - 72]
  2782  	mov	edx, dword ptr [rbx - 76]
  2783  	mov	esi, r9d
  2784  	mov	edi, dword ptr [rbx - 80]
  2785  	mov	r10d, dword ptr [rbx - 84]
  2786  	shld	esi, edx, 16
  2787  	mov	r11d, dword ptr [rbx - 92]
  2788  	mov	eax, dword ptr [rbx - 88]
  2789  	shld	edx, edi, 8
  2790  	mov	ecx, r10d
  2791  	shld	ecx, eax, 16
  2792  	shld	eax, r11d, 8
  2793  	vmovd	xmm2, edi
  2794  	vmovd	xmm3, r11d
  2795  	vpinsrd	xmm2, xmm2, edx, 1
  2796  	vpinsrd	xmm3, xmm3, eax, 1
  2797  	vpinsrd	xmm2, xmm2, esi, 2
  2798  	vpinsrd	xmm3, xmm3, ecx, 2
  2799  	vpinsrd	xmm2, xmm2, r9d, 3
  2800  	vpinsrd	xmm3, xmm3, r10d, 3
  2801  	vinserti128	ymm2, ymm3, xmm2, 1
  2802  	vpsrlvd	ymm2, ymm2, ymm0
  2803  	vpand	ymm2, ymm2, ymm1
  2804  	vmovdqu	ymmword ptr [r15 - 96], ymm2
        # Output dwords 8-15 -> [r15 - 64], from input dwords at [rbx - 68 .. rbx - 48].
  2805  	mov	r9d, dword ptr [rbx - 48]
  2806  	mov	ecx, dword ptr [rbx - 52]
  2807  	mov	edx, r9d
  2808  	mov	esi, dword ptr [rbx - 56]
  2809  	mov	r10d, dword ptr [rbx - 60]
  2810  	shld	edx, ecx, 16
  2811  	mov	r11d, dword ptr [rbx - 68]
  2812  	mov	edi, dword ptr [rbx - 64]
  2813  	shld	ecx, esi, 8
  2814  	mov	eax, r10d
  2815  	shld	eax, edi, 16
  2816  	shld	edi, r11d, 8
  2817  	vmovd	xmm2, esi
  2818  	vmovd	xmm3, r11d
  2819  	vpinsrd	xmm2, xmm2, ecx, 1
  2820  	vpinsrd	xmm3, xmm3, edi, 1
  2821  	vpinsrd	xmm2, xmm2, edx, 2
  2822  	vpinsrd	xmm3, xmm3, eax, 2
  2823  	vpinsrd	xmm2, xmm2, r9d, 3
  2824  	vpinsrd	xmm3, xmm3, r10d, 3
  2825  	vinserti128	ymm2, ymm3, xmm2, 1
  2826  	vpsrlvd	ymm2, ymm2, ymm0
  2827  	vpand	ymm2, ymm2, ymm1
  2828  	vmovdqu	ymmword ptr [r15 - 64], ymm2
        # Output dwords 16-23 -> [r15 - 32], from input dwords at [rbx - 44 .. rbx - 24].
  2829  	mov	r9d, dword ptr [rbx - 24]
  2830  	mov	ecx, dword ptr [rbx - 28]
  2831  	mov	edx, r9d
  2832  	mov	esi, dword ptr [rbx - 32]
  2833  	mov	r10d, dword ptr [rbx - 36]
  2834  	shld	edx, ecx, 16
  2835  	mov	r11d, dword ptr [rbx - 44]
  2836  	mov	edi, dword ptr [rbx - 40]
  2837  	shld	ecx, esi, 8
  2838  	mov	eax, r10d
  2839  	shld	eax, edi, 16
  2840  	shld	edi, r11d, 8
  2841  	vmovd	xmm2, esi
  2842  	vmovd	xmm3, r11d
  2843  	vpinsrd	xmm2, xmm2, ecx, 1
  2844  	vpinsrd	xmm3, xmm3, edi, 1
  2845  	vpinsrd	xmm2, xmm2, edx, 2
  2846  	vpinsrd	xmm3, xmm3, eax, 2
  2847  	vpinsrd	xmm2, xmm2, r9d, 3
  2848  	vpinsrd	xmm3, xmm3, r10d, 3
  2849  	vinserti128	ymm2, ymm3, xmm2, 1
  2850  	vpsrlvd	ymm2, ymm2, ymm0
  2851  	vpand	ymm2, ymm2, ymm1
  2852  	vmovdqu	ymmword ptr [r15 - 32], ymm2
        # Output dwords 24-31 -> [r15], from input dwords at [rbx - 20 .. rbx].
  2853  	mov	r9d, dword ptr [rbx]
  2854  	mov	ecx, dword ptr [rbx - 4]
  2855  	mov	edx, r9d
  2856  	mov	esi, dword ptr [rbx - 8]
  2857  	mov	r10d, dword ptr [rbx - 12]
  2858  	shld	edx, ecx, 16
  2859  	mov	r11d, dword ptr [rbx - 20]
  2860  	mov	edi, dword ptr [rbx - 16]
  2861  	shld	ecx, esi, 8
  2862  	mov	eax, r10d
  2863  	shld	eax, edi, 16
  2864  	shld	edi, r11d, 8
  2865  	vmovd	xmm2, esi
  2866  	vpinsrd	xmm2, xmm2, ecx, 1
  2867  	vmovd	xmm3, r11d
  2868  	vpinsrd	xmm2, xmm2, edx, 2
  2869  	vpinsrd	xmm3, xmm3, edi, 1
  2870  	vpinsrd	xmm2, xmm2, r9d, 3
  2871  	vpinsrd	xmm3, xmm3, eax, 2
  2872  	vpinsrd	xmm3, xmm3, r10d, 3
  2873  	vinserti128	ymm2, ymm3, xmm2, 1
  2874  	vpsrlvd	ymm2, ymm2, ymm0
  2875  	vpand	ymm2, ymm2, ymm1
  2876  	vmovdqu	ymmword ptr [r15], ymm2
        # Advance: output += 128 bytes (sub of -128), input += 96 bytes, one fewer trip.
  2877  	sub	r15, -128
  2878  	add	rbx, 96
  2879  	add	r8, -1
  2880  	jne	.LBB0_110
  2881  	jmp	.LBB0_147
  2882  .LBB0_138:
        # 4-bit unpack kernel: each input dword expands to eight output dwords, one
        # per nibble, lowest nibble first.
        # Skipped when edx < 32 (signed compare); edx is presumably the value count
        # -- TODO confirm against the prologue, which is outside this excerpt.
  2883  	cmp	edx, 32
  2884  	jl	.LBB0_147
  2885  # %bb.139:
        # rax = trip count from r14d; rcx = running input byte offset (output uses 8*rcx).
        # ymm0 = per-lane right-shift counts; ymm1 = per-dword mask 0xf
        # (qword 64424509455 = 0x0000000f0000000f).
  2886  	mov	eax, r14d
  2887  	xor	ecx, ecx
  2888  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_124] # ymm0 = [0,4,8,12,16,20,24,28]
  2889  	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_125] # ymm1 = [64424509455,64424509455,64424509455,64424509455]
  2890  	.p2align	4, 0x90
  2891  .LBB0_140:                              # =>This Inner Loop Header: Depth=1
        # Each iteration reads 16 bytes and stores 128 bytes. Per step: broadcast one
        # input dword to all 8 lanes, shift lane k right by 4*k, keep the low 4 bits.
  2892  	vpbroadcastd	ymm2, dword ptr [rbx + rcx]
  2893  	vpsrlvd	ymm2, ymm2, ymm0
  2894  	vpand	ymm2, ymm2, ymm1
  2895  	vmovdqu	ymmword ptr [r15 + 8*rcx], ymm2
  2896  	vpbroadcastd	ymm2, dword ptr [rbx + rcx + 4]
  2897  	vpsrlvd	ymm2, ymm2, ymm0
  2898  	vpand	ymm2, ymm2, ymm1
  2899  	vmovdqu	ymmword ptr [r15 + 8*rcx + 32], ymm2
  2900  	vpbroadcastd	ymm2, dword ptr [rbx + rcx + 8]
  2901  	vpsrlvd	ymm2, ymm2, ymm0
  2902  	vpand	ymm2, ymm2, ymm1
  2903  	vmovdqu	ymmword ptr [r15 + 8*rcx + 64], ymm2
  2904  	vpbroadcastd	ymm2, dword ptr [rbx + rcx + 12]
  2905  	vpsrlvd	ymm2, ymm2, ymm0
  2906  	vpand	ymm2, ymm2, ymm1
  2907  	vmovdqu	ymmword ptr [r15 + 8*rcx + 96], ymm2
        # Advance the input offset by 16 bytes (output offset follows as 8*rcx).
  2908  	add	rcx, 16
  2909  	add	rax, -1
  2910  	jne	.LBB0_140
  2911  	jmp	.LBB0_147
  2912  .LBB0_114:
        # 20-bit unpack kernel: every output dword is masked with 0xfffff (ymm1).
        # Skipped when edx < 32 (signed compare); edx is presumably the value count
        # -- TODO confirm against the prologue, which is outside this excerpt.
  2913  	cmp	edx, 32
  2914  	jl	.LBB0_147
  2915  # %bb.115:
        # r8 = loop trip count from r14d. r15 (store base) and rbx (load base) are
        # pre-biased by 96 and 76, so the loop body uses non-positive offsets.
        # ymm0 = per-lane right-shift counts, identical for all four output groups.
  2916  	mov	r8d, r14d
  2917  	add	r15, 96
  2918  	add	rbx, 76
  2919  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_65] # ymm0 = [0,0,8,0,0,4,0,12]
  2920  	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_66] # ymm1 = [4503595333451775,4503595333451775,4503595333451775,4503595333451775]
  2921  	.p2align	4, 0x90
  2922  .LBB0_116:                              # =>This Inner Loop Header: Depth=1
        # Each iteration reads 20 input dwords (80 bytes) and stores 32 output dwords
        # (128 bytes). The same pattern repeats four times, five input dwords per
        # group: shld by 4/8/12/16 splices neighbouring dwords, vpsrlvd applies the
        # per-lane right shift, and vpand applies the mask.
        #
        # Output dwords 0-7 -> [r15 - 96], from input dwords at [rbx - 76 .. rbx - 60].
  2923  	mov	r9d, dword ptr [rbx - 60]
  2924  	mov	r11d, dword ptr [rbx - 64]
  2925  	mov	esi, r9d
  2926  	shld	esi, r11d, 8
  2927  	mov	edi, dword ptr [rbx - 68]
  2928  	mov	edx, r11d
  2929  	shld	edx, edi, 16
  2930  	mov	eax, dword ptr [rbx - 72]
  2931  	shld	edi, eax, 4
  2932  	mov	r10d, dword ptr [rbx - 76]
  2933  	mov	ecx, eax
  2934  	shld	ecx, r10d, 12
  2935  	vmovd	xmm2, edx
  2936  	vpinsrd	xmm2, xmm2, r11d, 1
  2937  	vpinsrd	xmm2, xmm2, esi, 2
  2938  	vpinsrd	xmm2, xmm2, r9d, 3
  2939  	vmovd	xmm3, r10d
  2940  	vpinsrd	xmm3, xmm3, ecx, 1
  2941  	vpinsrd	xmm3, xmm3, eax, 2
  2942  	vpinsrd	xmm3, xmm3, edi, 3
  2943  	vinserti128	ymm2, ymm3, xmm2, 1
  2944  	vpsrlvd	ymm2, ymm2, ymm0
  2945  	vpand	ymm2, ymm2, ymm1
  2946  	vmovdqu	ymmword ptr [r15 - 96], ymm2
        # Output dwords 8-15 -> [r15 - 64], from input dwords at [rbx - 56 .. rbx - 40].
  2947  	mov	r9d, dword ptr [rbx - 40]
  2948  	mov	r11d, dword ptr [rbx - 44]
  2949  	mov	edx, r9d
  2950  	shld	edx, r11d, 8
  2951  	mov	esi, dword ptr [rbx - 48]
  2952  	mov	edi, r11d
  2953  	shld	edi, esi, 16
  2954  	mov	r10d, dword ptr [rbx - 56]
  2955  	mov	ecx, dword ptr [rbx - 52]
  2956  	shld	esi, ecx, 4
  2957  	mov	eax, ecx
  2958  	shld	eax, r10d, 12
  2959  	vmovd	xmm2, edi
  2960  	vpinsrd	xmm2, xmm2, r11d, 1
  2961  	vpinsrd	xmm2, xmm2, edx, 2
  2962  	vpinsrd	xmm2, xmm2, r9d, 3
  2963  	vmovd	xmm3, r10d
  2964  	vpinsrd	xmm3, xmm3, eax, 1
  2965  	vpinsrd	xmm3, xmm3, ecx, 2
  2966  	vpinsrd	xmm3, xmm3, esi, 3
  2967  	vinserti128	ymm2, ymm3, xmm2, 1
  2968  	vpsrlvd	ymm2, ymm2, ymm0
  2969  	vpand	ymm2, ymm2, ymm1
  2970  	vmovdqu	ymmword ptr [r15 - 64], ymm2
        # Output dwords 16-23 -> [r15 - 32], from input dwords at [rbx - 36 .. rbx - 20].
  2971  	mov	r9d, dword ptr [rbx - 20]
  2972  	mov	r11d, dword ptr [rbx - 24]
  2973  	mov	edx, r9d
  2974  	shld	edx, r11d, 8
  2975  	mov	esi, dword ptr [rbx - 28]
  2976  	mov	edi, r11d
  2977  	shld	edi, esi, 16
  2978  	mov	ecx, dword ptr [rbx - 32]
  2979  	shld	esi, ecx, 4
  2980  	mov	r10d, dword ptr [rbx - 36]
  2981  	mov	eax, ecx
  2982  	shld	eax, r10d, 12
  2983  	vmovd	xmm2, edi
  2984  	vpinsrd	xmm2, xmm2, r11d, 1
  2985  	vpinsrd	xmm2, xmm2, edx, 2
  2986  	vpinsrd	xmm2, xmm2, r9d, 3
  2987  	vmovd	xmm3, r10d
  2988  	vpinsrd	xmm3, xmm3, eax, 1
  2989  	vpinsrd	xmm3, xmm3, ecx, 2
  2990  	vpinsrd	xmm3, xmm3, esi, 3
  2991  	vinserti128	ymm2, ymm3, xmm2, 1
  2992  	vpsrlvd	ymm2, ymm2, ymm0
  2993  	vpand	ymm2, ymm2, ymm1
  2994  	vmovdqu	ymmword ptr [r15 - 32], ymm2
        # Output dwords 24-31 -> [r15], from input dwords at [rbx - 16 .. rbx].
  2995  	mov	r9d, dword ptr [rbx]
  2996  	mov	r11d, dword ptr [rbx - 4]
  2997  	mov	edx, r9d
  2998  	shld	edx, r11d, 8
  2999  	mov	esi, dword ptr [rbx - 8]
  3000  	mov	edi, r11d
  3001  	shld	edi, esi, 16
  3002  	mov	r10d, dword ptr [rbx - 16]
  3003  	mov	ecx, dword ptr [rbx - 12]
  3004  	shld	esi, ecx, 4
  3005  	mov	eax, ecx
  3006  	shld	eax, r10d, 12
  3007  	vmovd	xmm2, edi
  3008  	vpinsrd	xmm2, xmm2, r11d, 1
  3009  	vpinsrd	xmm2, xmm2, edx, 2
  3010  	vpinsrd	xmm2, xmm2, r9d, 3
  3011  	vmovd	xmm3, r10d
  3012  	vpinsrd	xmm3, xmm3, eax, 1
  3013  	vpinsrd	xmm3, xmm3, ecx, 2
  3014  	vpinsrd	xmm3, xmm3, esi, 3
  3015  	vinserti128	ymm2, ymm3, xmm2, 1
  3016  	vpsrlvd	ymm2, ymm2, ymm0
  3017  	vpand	ymm2, ymm2, ymm1
  3018  	vmovdqu	ymmword ptr [r15], ymm2
        # Advance: output += 128 bytes (sub of -128), input += 80 bytes, one fewer trip.
  3019  	sub	r15, -128
  3020  	add	rbx, 80
  3021  	add	r8, -1
  3022  	jne	.LBB0_116
  3023  	jmp	.LBB0_147
  3024  .LBB0_126:
        # 12-bit unpack kernel: every output dword is masked with 0xfff
        # (ymm1 broadcasts qword 17587891081215 = 0x00000fff00000fff).
        # Skipped when edx < 32 (signed compare); edx is presumably the value count
        # -- TODO confirm against the prologue, which is outside this excerpt.
  3025  	cmp	edx, 32
  3026  	jl	.LBB0_147
  3027  # %bb.127:
        # r8 = loop trip count from r14d. r15 (store base) and rbx (load base) are
        # pre-biased by 96 and 44, so the loop body uses non-positive offsets.
        # ymm0 = per-lane right-shift counts, identical for all four output groups.
  3028  	mov	r8d, r14d
  3029  	add	r15, 96
  3030  	add	rbx, 44
  3031  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_94] # ymm0 = [0,12,0,4,16,0,8,20]
  3032  	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_95] # ymm1 = [17587891081215,17587891081215,17587891081215,17587891081215]
  3033  	.p2align	4, 0x90
  3034  .LBB0_128:                              # =>This Inner Loop Header: Depth=1
        # Each iteration reads 12 input dwords (48 bytes) and stores 32 output dwords
        # (128 bytes). The same pattern repeats four times, three input dwords per
        # group: two shld splices (by 8 and by 4) build the lanes that combine bits
        # of two dwords, the remaining lanes are plain copies of an input dword, then
        # vpsrlvd and vpand align and mask every lane.
        #
        # Output dwords 0-7 -> [r15 - 96], from input dwords at [rbx - 44 .. rbx - 36].
  3035  	mov	ecx, dword ptr [rbx - 36]
  3036  	mov	edx, dword ptr [rbx - 44]
  3037  	mov	esi, dword ptr [rbx - 40]
  3038  	mov	edi, ecx
  3039  	shld	edi, esi, 4
  3040  	mov	eax, esi
  3041  	shld	eax, edx, 8
  3042  	vmovd	xmm2, esi
  3043  	vpinsrd	xmm2, xmm2, edi, 1
  3044  	vpinsrd	xmm2, xmm2, ecx, 2
  3045  	vpinsrd	xmm2, xmm2, ecx, 3
  3046  	vmovd	xmm3, edx
  3047  	vpinsrd	xmm3, xmm3, edx, 1
  3048  	vpinsrd	xmm3, xmm3, eax, 2
  3049  	vpinsrd	xmm3, xmm3, esi, 3
  3050  	vinserti128	ymm2, ymm3, xmm2, 1
  3051  	vpsrlvd	ymm2, ymm2, ymm0
  3052  	vpand	ymm2, ymm2, ymm1
  3053  	vmovdqu	ymmword ptr [r15 - 96], ymm2
        # Output dwords 8-15 -> [r15 - 64], from input dwords at [rbx - 32 .. rbx - 24].
  3054  	mov	eax, dword ptr [rbx - 24]
  3055  	mov	ecx, dword ptr [rbx - 32]
  3056  	mov	edx, dword ptr [rbx - 28]
  3057  	mov	esi, eax
  3058  	shld	esi, edx, 4
  3059  	mov	edi, edx
  3060  	shld	edi, ecx, 8
  3061  	vmovd	xmm2, edx
  3062  	vpinsrd	xmm2, xmm2, esi, 1
  3063  	vpinsrd	xmm2, xmm2, eax, 2
  3064  	vpinsrd	xmm2, xmm2, eax, 3
  3065  	vmovd	xmm3, ecx
  3066  	vpinsrd	xmm3, xmm3, ecx, 1
  3067  	vpinsrd	xmm3, xmm3, edi, 2
  3068  	vpinsrd	xmm3, xmm3, edx, 3
  3069  	vinserti128	ymm2, ymm3, xmm2, 1
  3070  	vpsrlvd	ymm2, ymm2, ymm0
  3071  	vpand	ymm2, ymm2, ymm1
  3072  	vmovdqu	ymmword ptr [r15 - 64], ymm2
        # Output dwords 16-23 -> [r15 - 32], from input dwords at [rbx - 20 .. rbx - 12].
  3073  	mov	eax, dword ptr [rbx - 12]
  3074  	mov	ecx, dword ptr [rbx - 20]
  3075  	mov	edx, dword ptr [rbx - 16]
  3076  	mov	esi, eax
  3077  	shld	esi, edx, 4
  3078  	mov	edi, edx
  3079  	shld	edi, ecx, 8
  3080  	vmovd	xmm2, edx
  3081  	vpinsrd	xmm2, xmm2, esi, 1
  3082  	vpinsrd	xmm2, xmm2, eax, 2
  3083  	vpinsrd	xmm2, xmm2, eax, 3
  3084  	vmovd	xmm3, ecx
  3085  	vpinsrd	xmm3, xmm3, ecx, 1
  3086  	vpinsrd	xmm3, xmm3, edi, 2
  3087  	vpinsrd	xmm3, xmm3, edx, 3
  3088  	vinserti128	ymm2, ymm3, xmm2, 1
  3089  	vpsrlvd	ymm2, ymm2, ymm0
  3090  	vpand	ymm2, ymm2, ymm1
  3091  	vmovdqu	ymmword ptr [r15 - 32], ymm2
        # Output dwords 24-31 -> [r15], from input dwords at [rbx - 8 .. rbx].
  3092  	mov	eax, dword ptr [rbx]
  3093  	mov	ecx, dword ptr [rbx - 8]
  3094  	mov	edx, dword ptr [rbx - 4]
  3095  	mov	esi, eax
  3096  	shld	esi, edx, 4
  3097  	mov	edi, edx
  3098  	shld	edi, ecx, 8
  3099  	vmovd	xmm2, edx
  3100  	vpinsrd	xmm2, xmm2, esi, 1
  3101  	vpinsrd	xmm2, xmm2, eax, 2
  3102  	vpinsrd	xmm2, xmm2, eax, 3
  3103  	vmovd	xmm3, ecx
  3104  	vpinsrd	xmm3, xmm3, ecx, 1
  3105  	vpinsrd	xmm3, xmm3, edi, 2
  3106  	vpinsrd	xmm3, xmm3, edx, 3
  3107  	vinserti128	ymm2, ymm3, xmm2, 1
  3108  	vpsrlvd	ymm2, ymm2, ymm0
  3109  	vpand	ymm2, ymm2, ymm1
  3110  	vmovdqu	ymmword ptr [r15], ymm2
        # Advance: output += 128 bytes (sub of -128), input += 48 bytes, one fewer trip.
  3111  	sub	r15, -128
  3112  	add	rbx, 48
  3113  	add	r8, -1
  3114  	jne	.LBB0_128
  3115  	jmp	.LBB0_147
  3116  .LBB0_102:
        # 28-bit unpack kernel: every output dword is masked with 0xfffffff (ymm1).
        # Skipped when edx < 32 (signed compare); edx is presumably the value count
        # -- TODO confirm against the prologue, which is outside this excerpt.
  3117  	cmp	edx, 32
  3118  	jl	.LBB0_147
  3119  # %bb.103:
        # r8 = loop trip count from r14d. r15 (store base) and rbx (load base) are
        # pre-biased by 96 and 108, so the loop body uses non-positive offsets.
        # ymm0 = per-lane right-shift counts: only the last lane is shifted (by 4).
  3120  	mov	r8d, r14d
  3121  	add	r15, 96
  3122  	add	rbx, 108
  3123  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_22] # ymm0 = [0,0,0,0,0,0,0,4]
  3124  	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_23] # ymm1 = [1152921500580315135,1152921500580315135,1152921500580315135,1152921500580315135]
  3125  	.p2align	4, 0x90
  3126  .LBB0_104:                              # =>This Inner Loop Header: Depth=1
        # Each iteration reads 28 input dwords (112 bytes) and stores 32 output dwords
        # (128 bytes). The same pattern repeats four times, seven input dwords per
        # group: a shld chain with counts 4,8,...,24 splices each dword with its
        # predecessor, the last lane takes the raw final dword and is shifted right
        # by 4 via vpsrlvd, and vpand applies the mask.
        #
        # Output dwords 0-7 -> [r15 - 96], from input dwords at [rbx - 108 .. rbx - 84].
  3127  	mov	r9d, dword ptr [rbx - 84]
  3128  	mov	edx, dword ptr [rbx - 88]
  3129  	mov	r10d, r9d
  3130  	shld	r10d, edx, 24
  3131  	mov	edi, dword ptr [rbx - 92]
  3132  	shld	edx, edi, 20
  3133  	mov	eax, dword ptr [rbx - 96]
  3134  	shld	edi, eax, 16
  3135  	mov	ecx, dword ptr [rbx - 100]
  3136  	shld	eax, ecx, 12
  3137  	mov	r11d, dword ptr [rbx - 108]
  3138  	mov	esi, dword ptr [rbx - 104]
  3139  	shld	ecx, esi, 8
  3140  	shld	esi, r11d, 4
  3141  	vmovd	xmm2, r11d
  3142  	vmovd	xmm3, edi
  3143  	vpinsrd	xmm2, xmm2, esi, 1
  3144  	vpinsrd	xmm3, xmm3, edx, 1
  3145  	vpinsrd	xmm2, xmm2, ecx, 2
  3146  	vpinsrd	xmm3, xmm3, r10d, 2
  3147  	vpinsrd	xmm2, xmm2, eax, 3
  3148  	vpinsrd	xmm3, xmm3, r9d, 3
  3149  	vinserti128	ymm2, ymm2, xmm3, 1
  3150  	vpsrlvd	ymm2, ymm2, ymm0
  3151  	vpand	ymm2, ymm2, ymm1
  3152  	vmovdqu	ymmword ptr [r15 - 96], ymm2
        # Output dwords 8-15 -> [r15 - 64], from input dwords at [rbx - 80 .. rbx - 56].
  3153  	mov	r9d, dword ptr [rbx - 56]
  3154  	mov	ecx, dword ptr [rbx - 60]
  3155  	mov	r10d, r9d
  3156  	shld	r10d, ecx, 24
  3157  	mov	esi, dword ptr [rbx - 64]
  3158  	shld	ecx, esi, 20
  3159  	mov	edi, dword ptr [rbx - 68]
  3160  	shld	esi, edi, 16
  3161  	mov	eax, dword ptr [rbx - 72]
  3162  	shld	edi, eax, 12
  3163  	mov	r11d, dword ptr [rbx - 80]
  3164  	mov	edx, dword ptr [rbx - 76]
  3165  	shld	eax, edx, 8
  3166  	shld	edx, r11d, 4
  3167  	vmovd	xmm2, r11d
  3168  	vmovd	xmm3, esi
  3169  	vpinsrd	xmm2, xmm2, edx, 1
  3170  	vpinsrd	xmm3, xmm3, ecx, 1
  3171  	vpinsrd	xmm2, xmm2, eax, 2
  3172  	vpinsrd	xmm3, xmm3, r10d, 2
  3173  	vpinsrd	xmm2, xmm2, edi, 3
  3174  	vpinsrd	xmm3, xmm3, r9d, 3
  3175  	vinserti128	ymm2, ymm2, xmm3, 1
  3176  	vpsrlvd	ymm2, ymm2, ymm0
  3177  	vpand	ymm2, ymm2, ymm1
  3178  	vmovdqu	ymmword ptr [r15 - 64], ymm2
        # Output dwords 16-23 -> [r15 - 32], from input dwords at [rbx - 52 .. rbx - 28].
  3179  	mov	r9d, dword ptr [rbx - 28]
  3180  	mov	ecx, dword ptr [rbx - 32]
  3181  	mov	r10d, r9d
  3182  	shld	r10d, ecx, 24
  3183  	mov	esi, dword ptr [rbx - 36]
  3184  	shld	ecx, esi, 20
  3185  	mov	edi, dword ptr [rbx - 40]
  3186  	shld	esi, edi, 16
  3187  	mov	eax, dword ptr [rbx - 44]
  3188  	shld	edi, eax, 12
  3189  	mov	r11d, dword ptr [rbx - 52]
  3190  	mov	edx, dword ptr [rbx - 48]
  3191  	shld	eax, edx, 8
  3192  	shld	edx, r11d, 4
  3193  	vmovd	xmm2, r11d
  3194  	vmovd	xmm3, esi
  3195  	vpinsrd	xmm2, xmm2, edx, 1
  3196  	vpinsrd	xmm3, xmm3, ecx, 1
  3197  	vpinsrd	xmm2, xmm2, eax, 2
  3198  	vpinsrd	xmm3, xmm3, r10d, 2
  3199  	vpinsrd	xmm2, xmm2, edi, 3
  3200  	vpinsrd	xmm3, xmm3, r9d, 3
  3201  	vinserti128	ymm2, ymm2, xmm3, 1
  3202  	vpsrlvd	ymm2, ymm2, ymm0
  3203  	vpand	ymm2, ymm2, ymm1
  3204  	vmovdqu	ymmword ptr [r15 - 32], ymm2
        # Output dwords 24-31 -> [r15], from input dwords at [rbx - 24 .. rbx].
  3205  	mov	r9d, dword ptr [rbx]
  3206  	mov	ecx, dword ptr [rbx - 4]
  3207  	mov	r10d, r9d
  3208  	shld	r10d, ecx, 24
  3209  	mov	esi, dword ptr [rbx - 8]
  3210  	shld	ecx, esi, 20
  3211  	mov	edi, dword ptr [rbx - 12]
  3212  	shld	esi, edi, 16
  3213  	mov	eax, dword ptr [rbx - 16]
  3214  	shld	edi, eax, 12
  3215  	mov	r11d, dword ptr [rbx - 24]
  3216  	mov	edx, dword ptr [rbx - 20]
  3217  	shld	eax, edx, 8
  3218  	shld	edx, r11d, 4
  3219  	vmovd	xmm2, r11d
  3220  	vmovd	xmm3, esi
  3221  	vpinsrd	xmm2, xmm2, edx, 1
  3222  	vpinsrd	xmm3, xmm3, ecx, 1
  3223  	vpinsrd	xmm2, xmm2, eax, 2
  3224  	vpinsrd	xmm3, xmm3, r10d, 2
  3225  	vpinsrd	xmm2, xmm2, edi, 3
  3226  	vpinsrd	xmm3, xmm3, r9d, 3
  3227  	vinserti128	ymm2, ymm2, xmm3, 1
  3228  	vpsrlvd	ymm2, ymm2, ymm0
  3229  	vpand	ymm2, ymm2, ymm1
  3230  	vmovdqu	ymmword ptr [r15], ymm2
        # Advance: output += 128 bytes (sub of -128), input += 112 bytes, one fewer trip.
  3231  	sub	r15, -128
  3232  	add	rbx, 112
  3233  	add	r8, -1
  3234  	jne	.LBB0_104
  3235  	jmp	.LBB0_147
  3236  .LBB0_141:
        # 2-bit unpack kernel: each input dword expands to sixteen output dwords, one
        # per 2-bit field, lowest field first.
        # Skipped when edx < 32 (signed compare); edx is presumably the value count
        # -- TODO confirm against the prologue, which is outside this excerpt.
  3237  	cmp	edx, 32
  3238  	jl	.LBB0_147
  3239  # %bb.142:
        # rax = trip count from r14d; rcx counts iterations upward and indexes the
        # input in 8-byte steps ([rbx + 8*rcx]). r15 (store base) is pre-biased by 96.
        # ymm0 / ymm2 = right-shift counts for the low / high 16 bits of a dword;
        # ymm1 = per-dword mask 0x3 (qword 12884901891 = 0x0000000300000003).
  3240  	mov	eax, r14d
  3241  	add	r15, 96
  3242  	xor	ecx, ecx
  3243  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_131] # ymm0 = [0,2,4,6,8,10,12,14]
  3244  	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_132] # ymm1 = [12884901891,12884901891,12884901891,12884901891]
  3245  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_133] # ymm2 = [16,18,20,22,24,26,28,30]
  3246  	.p2align	4, 0x90
  3247  .LBB0_143:                              # =>This Inner Loop Header: Depth=1
        # Each iteration reads 2 input dwords (8 bytes) and stores 32 output dwords.
        # First input dword, fields 0-7 -> [r15 - 96].
  3248  	vpbroadcastd	ymm3, dword ptr [rbx + 8*rcx]
  3249  	vpsrlvd	ymm3, ymm3, ymm0
  3250  	vpand	ymm3, ymm3, ymm1
  3251  	vmovdqu	ymmword ptr [r15 - 96], ymm3
        # First input dword, fields 8-15 -> [r15 - 64].
  3252  	vpbroadcastd	ymm3, dword ptr [rbx + 8*rcx]
  3253  	vpsrlvd	ymm3, ymm3, ymm2
  3254  	vpand	ymm3, ymm3, ymm1
  3255  	vmovdqu	ymmword ptr [r15 - 64], ymm3
        # Second input dword, fields 0-7 -> [r15 - 32].
  3256  	vpbroadcastd	ymm3, dword ptr [rbx + 8*rcx + 4]
  3257  	vpsrlvd	ymm3, ymm3, ymm0
  3258  	vpand	ymm3, ymm3, ymm1
  3259  	vmovdqu	ymmword ptr [r15 - 32], ymm3
        # Second input dword, fields 8-15 -> [r15].
  3260  	vpbroadcastd	ymm3, dword ptr [rbx + 8*rcx + 4]
  3261  	vpsrlvd	ymm3, ymm3, ymm2
  3262  	vpand	ymm3, ymm3, ymm1
  3263  	vmovdqu	ymmword ptr [r15], ymm3
        # Next iteration: bump the index, advance output by 128 bytes (sub of -128),
        # and loop until the index reaches the trip count.
  3264  	add	rcx, 1
  3265  	sub	r15, -128
  3266  	cmp	rax, rcx
  3267  	jne	.LBB0_143
  3268  	jmp	.LBB0_147
  3269  .LBB0_117:                              # Arm whose per-dword mask is 0x3ffff: unpacks 18-bit fields, 32 output dwords per iteration
  3270  	cmp	edx, 32
  3271  	jl	.LBB0_147                       # edx < 32 (signed compare): skip the loop, go straight to the exit
  3272  # %bb.118:
  3273  	mov	r8d, r14d                       # r8 = trip count (r14d zero-extended); r14 is set outside this view
  3274  	add	r15, 96                         # bias r15 so the four 32-byte stores land at [r15-96], [r15-64], [r15-32], [r15]
  3275  	add	rbx, 68                         # bias rbx so the loads below address [rbx-68] .. [rbx]
  3276  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_72] # ymm0 = [0,0,4,0,8,0,12,0]
  3277  	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_73] # ymm1 = [1125895612137471,1125895612137471,1125895612137471,1125895612137471]; 0x3ffff in every dword lane
  3278  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_74] # ymm2 = [0,2,0,6,0,10,0,14]
  3279  	.p2align	4, 0x90
  3280  .LBB0_119:                              # =>This Inner Loop Header: Depth=1
  3281  	mov	ecx, dword ptr [rbx - 56]
  3282  	mov	r10d, dword ptr [rbx - 52]
  3283  	shld	r10d, ecx, 2                    # shld dst, src, n: dst = (dst << n) | (src >> (32-n)); stitches a field that straddles two adjacent input dwords
  3284  	mov	esi, dword ptr [rbx - 60]
  3285  	mov	edi, ecx
  3286  	shld	edi, esi, 6
  3287  	mov	r9d, dword ptr [rbx - 68]
  3288  	mov	edx, dword ptr [rbx - 64]
  3289  	mov	eax, edx
  3290  	shld	eax, r9d, 14
  3291  	vmovd	xmm3, esi                       # captures esi before the shld below overwrites it
  3292  	shld	esi, edx, 10
  3293  	vpinsrd	xmm3, xmm3, edi, 1
  3294  	vpinsrd	xmm3, xmm3, ecx, 2
  3295  	vpinsrd	xmm3, xmm3, r10d, 3
  3296  	vmovd	xmm4, r9d
  3297  	vpinsrd	xmm4, xmm4, eax, 1
  3298  	vpinsrd	xmm4, xmm4, edx, 2
  3299  	vpinsrd	xmm4, xmm4, esi, 3
  3300  	vinserti128	ymm3, ymm4, xmm3, 1     # ymm3 = xmm4 in the low half, xmm3 in the high half
  3301  	vpsrlvd	ymm3, ymm3, ymm0                # per-lane shift moves each field down to bit 0
  3302  	vpand	ymm3, ymm3, ymm1                # keep the low 18 bits of each lane
  3303  	vmovdqu	ymmword ptr [r15 - 96], ymm3    # first 8 output dwords of this iteration
  3304  	mov	r9d, dword ptr [rbx - 36]
  3305  	mov	r11d, dword ptr [rbx - 40]
  3306  	mov	edx, r9d
  3307  	shld	edx, r11d, 4
  3308  	mov	r10d, dword ptr [rbx - 44]
  3309  	mov	edi, r11d
  3310  	shld	edi, r10d, 8
  3311  	mov	eax, dword ptr [rbx - 52]
  3312  	mov	esi, dword ptr [rbx - 48]
  3313  	mov	ecx, r10d
  3314  	shld	ecx, esi, 12
  3315  	shrd	eax, esi, 16                    # eax = (eax >> 16) | (esi << 16)
  3316  	vmovd	xmm3, edi
  3317  	vpinsrd	xmm3, xmm3, r11d, 1
  3318  	vpinsrd	xmm3, xmm3, edx, 2
  3319  	vpinsrd	xmm3, xmm3, r9d, 3
  3320  	vmovd	xmm4, eax
  3321  	vpinsrd	xmm4, xmm4, esi, 1
  3322  	vpinsrd	xmm4, xmm4, ecx, 2
  3323  	vpinsrd	xmm4, xmm4, r10d, 3
  3324  	vinserti128	ymm3, ymm4, xmm3, 1
  3325  	vpsrlvd	ymm3, ymm3, ymm2
  3326  	vpand	ymm3, ymm3, ymm1
  3327  	vmovdqu	ymmword ptr [r15 - 64], ymm3    # second 8 output dwords
  3328  	mov	eax, dword ptr [rbx - 20]       # the remaining two groups repeat the same pattern 36 bytes further into the input
  3329  	mov	r10d, dword ptr [rbx - 16]
  3330  	shld	r10d, eax, 2
  3331  	mov	edx, dword ptr [rbx - 24]
  3332  	mov	esi, eax
  3333  	shld	esi, edx, 6
  3334  	mov	r9d, dword ptr [rbx - 32]
  3335  	mov	ecx, dword ptr [rbx - 28]
  3336  	mov	edi, ecx
  3337  	shld	edi, r9d, 14
  3338  	vmovd	xmm3, edx
  3339  	shld	edx, ecx, 10
  3340  	vpinsrd	xmm3, xmm3, esi, 1
  3341  	vpinsrd	xmm3, xmm3, eax, 2
  3342  	vpinsrd	xmm3, xmm3, r10d, 3
  3343  	vmovd	xmm4, r9d
  3344  	vpinsrd	xmm4, xmm4, edi, 1
  3345  	vpinsrd	xmm4, xmm4, ecx, 2
  3346  	vpinsrd	xmm4, xmm4, edx, 3
  3347  	vinserti128	ymm3, ymm4, xmm3, 1
  3348  	vpsrlvd	ymm3, ymm3, ymm0
  3349  	vpand	ymm3, ymm3, ymm1
  3350  	vmovdqu	ymmword ptr [r15 - 32], ymm3    # third 8 output dwords
  3351  	mov	r9d, dword ptr [rbx]
  3352  	mov	r11d, dword ptr [rbx - 4]
  3353  	mov	edx, r9d
  3354  	shld	edx, r11d, 4
  3355  	mov	r10d, dword ptr [rbx - 8]
  3356  	mov	edi, r11d
  3357  	shld	edi, r10d, 8
  3358  	mov	eax, dword ptr [rbx - 16]
  3359  	mov	esi, dword ptr [rbx - 12]
  3360  	mov	ecx, r10d
  3361  	shld	ecx, esi, 12
  3362  	shrd	eax, esi, 16
  3363  	vmovd	xmm3, edi
  3364  	vpinsrd	xmm3, xmm3, r11d, 1
  3365  	vpinsrd	xmm3, xmm3, edx, 2
  3366  	vpinsrd	xmm3, xmm3, r9d, 3
  3367  	vmovd	xmm4, eax
  3368  	vpinsrd	xmm4, xmm4, esi, 1
  3369  	vpinsrd	xmm4, xmm4, ecx, 2
  3370  	vpinsrd	xmm4, xmm4, r10d, 3
  3371  	vinserti128	ymm3, ymm4, xmm3, 1
  3372  	vpsrlvd	ymm3, ymm3, ymm2
  3373  	vpand	ymm3, ymm3, ymm1
  3374  	vmovdqu	ymmword ptr [r15], ymm3         # last 8 output dwords
  3375  	sub	r15, -128                       # r15 += 128: the 32 dwords just written
  3376  	add	rbx, 72                         # rbx += 72: 32 fields * 18 bits = 72 input bytes
  3377  	add	r8, -1
  3378  	jne	.LBB0_119                       # loop until the trip count reaches zero
  3379  	jmp	.LBB0_147
  3380  .LBB0_129:                              # Arm whose per-dword mask is 0x3ff: unpacks 10-bit fields, 32 output dwords per iteration
  3381  	cmp	edx, 32
  3382  	jl	.LBB0_147                       # edx < 32 (signed compare): skip the loop, go straight to the exit
  3383  # %bb.130:
  3384  	mov	r8d, r14d                       # r8 = trip count (r14d zero-extended); r14 is set outside this view
  3385  	add	r15, 96                         # bias r15 so the four 32-byte stores land at [r15-96], [r15-64], [r15-32], [r15]
  3386  	add	rbx, 36                         # bias rbx so the loads below address [rbx-36] .. [rbx]
  3387  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_101] # ymm0 = [0,10,20,0,8,18,0,6]
  3388  	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_102] # ymm1 = [4393751544831,4393751544831,4393751544831,4393751544831]; 4393751544831 = 0x000003ff000003ff, i.e. 0x3ff in every dword lane
  3389  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_103] # ymm2 = [16,0,4,14,0,2,12,22]
  3390  	.p2align	4, 0x90
  3391  .LBB0_131:                              # =>This Inner Loop Header: Depth=1
  3392  	mov	ecx, dword ptr [rbx - 28]
  3393  	mov	edx, dword ptr [rbx - 36]
  3394  	mov	esi, dword ptr [rbx - 32]
  3395  	mov	edi, ecx
  3396  	shld	edi, esi, 4                     # shld dst, src, n: dst = (dst << n) | (src >> (32-n)); stitches a field that straddles two adjacent input dwords
  3397  	vmovd	xmm3, esi
  3398  	vpinsrd	xmm3, xmm3, esi, 1              # same dword placed in two lanes: the per-lane shifts pick different fields out of it
  3399  	shld	esi, edx, 2
  3400  	vpinsrd	xmm3, xmm3, edi, 2
  3401  	vpinsrd	xmm3, xmm3, ecx, 3
  3402  	vmovd	xmm4, edx
  3403  	vpinsrd	xmm4, xmm4, edx, 1
  3404  	vpinsrd	xmm4, xmm4, edx, 2
  3405  	vpinsrd	xmm4, xmm4, esi, 3
  3406  	vinserti128	ymm3, ymm4, xmm3, 1     # ymm3 = xmm4 in the low half, xmm3 in the high half
  3407  	vpsrlvd	ymm3, ymm3, ymm0                # per-lane shift moves each field down to bit 0
  3408  	vpand	ymm3, ymm3, ymm1                # keep the low 10 bits of each lane
  3409  	vmovdqu	ymmword ptr [r15 - 96], ymm3    # first 8 output dwords of this iteration
  3410  	mov	ecx, dword ptr [rbx - 20]
  3411  	mov	edx, dword ptr [rbx - 24]
  3412  	mov	esi, ecx
  3413  	shld	esi, edx, 8
  3414  	mov	edi, dword ptr [rbx - 28]
  3415  	mov	eax, edx
  3416  	shld	eax, edi, 6
  3417  	vmovd	xmm3, esi
  3418  	vpinsrd	xmm3, xmm3, ecx, 1
  3419  	vpinsrd	xmm3, xmm3, ecx, 2
  3420  	vpinsrd	xmm3, xmm3, ecx, 3
  3421  	vmovd	xmm4, edi
  3422  	vpinsrd	xmm4, xmm4, eax, 1
  3423  	vpinsrd	xmm4, xmm4, edx, 2
  3424  	vpinsrd	xmm4, xmm4, edx, 3
  3425  	vinserti128	ymm3, ymm4, xmm3, 1
  3426  	vpsrlvd	ymm3, ymm3, ymm2
  3427  	vpand	ymm3, ymm3, ymm1
  3428  	vmovdqu	ymmword ptr [r15 - 64], ymm3    # second 8 output dwords
  3429  	mov	eax, dword ptr [rbx - 8]        # the remaining two groups repeat the same pattern 20 bytes further into the input
  3430  	mov	ecx, dword ptr [rbx - 16]
  3431  	mov	edx, dword ptr [rbx - 12]
  3432  	mov	esi, eax
  3433  	shld	esi, edx, 4
  3434  	vmovd	xmm3, edx
  3435  	vpinsrd	xmm3, xmm3, edx, 1
  3436  	shld	edx, ecx, 2
  3437  	vpinsrd	xmm3, xmm3, esi, 2
  3438  	vpinsrd	xmm3, xmm3, eax, 3
  3439  	vmovd	xmm4, ecx
  3440  	vpinsrd	xmm4, xmm4, ecx, 1
  3441  	vpinsrd	xmm4, xmm4, ecx, 2
  3442  	vpinsrd	xmm4, xmm4, edx, 3
  3443  	vinserti128	ymm3, ymm4, xmm3, 1
  3444  	vpsrlvd	ymm3, ymm3, ymm0
  3445  	vpand	ymm3, ymm3, ymm1
  3446  	vmovdqu	ymmword ptr [r15 - 32], ymm3    # third 8 output dwords
  3447  	mov	eax, dword ptr [rbx]
  3448  	mov	ecx, dword ptr [rbx - 8]
  3449  	mov	edx, dword ptr [rbx - 4]
  3450  	mov	esi, eax
  3451  	shld	esi, edx, 8
  3452  	mov	edi, edx
  3453  	shld	edi, ecx, 6
  3454  	vmovd	xmm3, esi
  3455  	vpinsrd	xmm3, xmm3, eax, 1
  3456  	vpinsrd	xmm3, xmm3, eax, 2
  3457  	vpinsrd	xmm3, xmm3, eax, 3
  3458  	vmovd	xmm4, ecx
  3459  	vpinsrd	xmm4, xmm4, edi, 1
  3460  	vpinsrd	xmm4, xmm4, edx, 2
  3461  	vpinsrd	xmm4, xmm4, edx, 3
  3462  	vinserti128	ymm3, ymm4, xmm3, 1
  3463  	vpsrlvd	ymm3, ymm3, ymm2
  3464  	vpand	ymm3, ymm3, ymm1
  3465  	vmovdqu	ymmword ptr [r15], ymm3         # last 8 output dwords
  3466  	sub	r15, -128                       # r15 += 128: the 32 dwords just written
  3467  	add	rbx, 40                         # rbx += 40: 32 fields * 10 bits = 40 input bytes
  3468  	add	r8, -1
  3469  	jne	.LBB0_131                       # loop until the trip count reaches zero
  3470  	jmp	.LBB0_147
  3471  .LBB0_105:                              # Arm whose per-dword mask is 0x3ffffff: unpacks 26-bit fields, 32 output dwords per iteration
  3472  	cmp	edx, 32
  3473  	jl	.LBB0_147                       # edx < 32 (signed compare): skip the loop, go straight to the exit
  3474  # %bb.106:
  3475  	mov	r8d, r14d                       # r8 = trip count (r14d zero-extended); r14 is set outside this view
  3476  	add	r15, 96                         # bias r15 so the four 32-byte stores land at [r15-96], [r15-64], [r15-32], [r15]
  3477  	add	rbx, 100                        # bias rbx so the loads below address [rbx-100] .. [rbx]
  3478  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_34] # ymm0 = [288230371923853311,288230371923853311,288230371923853311,288230371923853311]; 0x3ffffff in every dword lane
  3479  	vpbroadcastq	xmm1, qword ptr [rip + .LCPI0_35] # xmm1 = [42949672976,42949672976]; as dwords: [16,10,16,10] (right-shift counts)
  3480  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_33] # ymm2 = [0,0,0,0,0,2,0,0]
  3481  	vpbroadcastq	xmm3, qword ptr [rip + .LCPI0_36] # xmm3 = [94489280528,94489280528]; as dwords: [16,22,16,22] (left-shift counts, 32 minus the xmm1 counts)
  3482  	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_37] # ymm4 = [0,0,4,0,0,0,0,6]
  3483  	.p2align	4, 0x90
  3484  .LBB0_107:                              # =>This Inner Loop Header: Depth=1
  3485  	mov	ecx, dword ptr [rbx - 80]
  3486  	mov	r9d, dword ptr [rbx - 76]
  3487  	shld	r9d, ecx, 10                    # shld dst, src, n: dst = (dst << n) | (src >> (32-n)); stitches a field that straddles two adjacent input dwords
  3488  	mov	r11d, dword ptr [rbx - 84]
  3489  	shld	ecx, r11d, 4
  3490  	mov	edi, dword ptr [rbx - 88]
  3491  	mov	esi, r11d
  3492  	shld	esi, edi, 24
  3493  	mov	edx, dword ptr [rbx - 92]
  3494  	shld	edi, edx, 18
  3495  	mov	r10d, dword ptr [rbx - 100]
  3496  	mov	eax, dword ptr [rbx - 96]
  3497  	shld	edx, eax, 12
  3498  	shld	eax, r10d, 6
  3499  	vmovd	xmm5, r10d
  3500  	vmovd	xmm6, esi
  3501  	vpinsrd	xmm5, xmm5, eax, 1
  3502  	vpinsrd	xmm6, xmm6, r11d, 1
  3503  	vpinsrd	xmm5, xmm5, edx, 2
  3504  	vpinsrd	xmm6, xmm6, ecx, 2
  3505  	vpinsrd	xmm5, xmm5, edi, 3
  3506  	vpinsrd	xmm6, xmm6, r9d, 3
  3507  	vinserti128	ymm5, ymm5, xmm6, 1     # ymm5 = xmm5 in the low half, xmm6 in the high half
  3508  	vpsrlvd	ymm5, ymm5, ymm2                # only lane 5 still needs a shift (by 2)
  3509  	vpand	ymm5, ymm5, ymm0                # keep the low 26 bits of each lane
  3510  	vmovdqu	ymmword ptr [r15 - 96], ymm5    # first 8 output dwords of this iteration
  3511  	mov	r9d, dword ptr [rbx - 52]
  3512  	mov	ecx, dword ptr [rbx - 56]
  3513  	mov	edx, r9d
  3514  	shld	edx, ecx, 20
  3515  	mov	esi, dword ptr [rbx - 60]
  3516  	shld	ecx, esi, 14
  3517  	mov	edi, dword ptr [rbx - 68]
  3518  	mov	eax, dword ptr [rbx - 64]
  3519  	shld	esi, eax, 8
  3520  	shld	eax, edi, 2
  3521  	vmovq	xmm5, qword ptr [rbx - 76]      # xmm5 = mem[0],zero
  3522  	vpsrlvd	xmm6, xmm5, xmm1                # low parts of two straddling fields: dwords >> [16,10]
  3523  	vpshufd	xmm5, xmm5, 229                 # xmm5 = xmm5[1,1,2,3]
  3524  	vpinsrd	xmm5, xmm5, edi, 1              # lanes 0,1 now hold the next-higher input dword of each pair
  3525  	vpsllvd	xmm5, xmm5, xmm3                # high parts: dwords << [16,22]
  3526  	vpor	xmm5, xmm6, xmm5                # vector form of the shld stitch for lanes 0 and 1
  3527  	vmovd	xmm6, esi
  3528  	vpinsrd	xmm6, xmm6, ecx, 1
  3529  	vpinsrd	xmm6, xmm6, edx, 2
  3530  	vpinsrd	xmm6, xmm6, r9d, 3
  3531  	vpinsrd	xmm5, xmm5, edi, 2
  3532  	vpinsrd	xmm5, xmm5, eax, 3
  3533  	vinserti128	ymm5, ymm5, xmm6, 1
  3534  	vpsrlvd	ymm5, ymm5, ymm4
  3535  	vpand	ymm5, ymm5, ymm0
  3536  	vmovdqu	ymmword ptr [r15 - 64], ymm5    # second 8 output dwords
  3537  	mov	eax, dword ptr [rbx - 28]       # the remaining two groups repeat the same pattern 52 bytes further into the input
  3538  	mov	r9d, dword ptr [rbx - 24]
  3539  	shld	r9d, eax, 10
  3540  	mov	r11d, dword ptr [rbx - 32]
  3541  	shld	eax, r11d, 4
  3542  	mov	esi, dword ptr [rbx - 36]
  3543  	mov	edi, r11d
  3544  	shld	edi, esi, 24
  3545  	mov	ecx, dword ptr [rbx - 40]
  3546  	shld	esi, ecx, 18
  3547  	mov	r10d, dword ptr [rbx - 48]
  3548  	mov	edx, dword ptr [rbx - 44]
  3549  	shld	ecx, edx, 12
  3550  	shld	edx, r10d, 6
  3551  	vmovd	xmm5, r10d
  3552  	vmovd	xmm6, edi
  3553  	vpinsrd	xmm5, xmm5, edx, 1
  3554  	vpinsrd	xmm6, xmm6, r11d, 1
  3555  	vpinsrd	xmm5, xmm5, ecx, 2
  3556  	vpinsrd	xmm6, xmm6, eax, 2
  3557  	vpinsrd	xmm5, xmm5, esi, 3
  3558  	vpinsrd	xmm6, xmm6, r9d, 3
  3559  	vinserti128	ymm5, ymm5, xmm6, 1
  3560  	vpsrlvd	ymm5, ymm5, ymm2
  3561  	vpand	ymm5, ymm5, ymm0
  3562  	vmovdqu	ymmword ptr [r15 - 32], ymm5    # third 8 output dwords
  3563  	mov	r9d, dword ptr [rbx]
  3564  	mov	ecx, dword ptr [rbx - 4]
  3565  	mov	edx, r9d
  3566  	shld	edx, ecx, 20
  3567  	mov	esi, dword ptr [rbx - 8]
  3568  	shld	ecx, esi, 14
  3569  	mov	edi, dword ptr [rbx - 16]
  3570  	mov	eax, dword ptr [rbx - 12]
  3571  	shld	esi, eax, 8
  3572  	shld	eax, edi, 2
  3573  	vmovq	xmm5, qword ptr [rbx - 24]      # xmm5 = mem[0],zero
  3574  	vpsrlvd	xmm6, xmm5, xmm1
  3575  	vpshufd	xmm5, xmm5, 229                 # xmm5 = xmm5[1,1,2,3]
  3576  	vpinsrd	xmm5, xmm5, edi, 1
  3577  	vpsllvd	xmm5, xmm5, xmm3
  3578  	vpor	xmm5, xmm6, xmm5
  3579  	vmovd	xmm6, esi
  3580  	vpinsrd	xmm6, xmm6, ecx, 1
  3581  	vpinsrd	xmm6, xmm6, edx, 2
  3582  	vpinsrd	xmm6, xmm6, r9d, 3
  3583  	vpinsrd	xmm5, xmm5, edi, 2
  3584  	vpinsrd	xmm5, xmm5, eax, 3
  3585  	vinserti128	ymm5, ymm5, xmm6, 1
  3586  	vpsrlvd	ymm5, ymm5, ymm4
  3587  	vpand	ymm5, ymm5, ymm0
  3588  	vmovdqu	ymmword ptr [r15], ymm5         # last 8 output dwords
  3589  	sub	r15, -128                       # r15 += 128: the 32 dwords just written
  3590  	add	rbx, 104                        # rbx += 104: 32 fields * 26 bits = 104 input bytes
  3591  	add	r8, -1
  3592  	jne	.LBB0_107                       # loop until the trip count reaches zero
  3593  	jmp	.LBB0_147
  3594  .LBB0_135:                              # Arm whose per-dword mask is 0x3f: unpacks 6-bit fields, 32 output dwords per iteration
  3595  	cmp	edx, 32
  3596  	jl	.LBB0_147                       # edx < 32 (signed compare): skip the loop, go straight to the exit
  3597  # %bb.136:
  3598  	mov	eax, r14d                       # rax = trip count (r14d zero-extended); r14 is set outside this view
  3599  	add	r15, 96                         # bias r15 so the four 32-byte stores land at [r15-96], [r15-64], [r15-32], [r15]
  3600  	add	rbx, 20                         # bias rbx so the loads below address [rbx-20] .. [rbx]
  3601  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_116] # ymm0 = [0,6,12,18,24,0,4,10]
  3602  	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_117] # ymm1 = [270582939711,270582939711,270582939711,270582939711]; 270582939711 = 0x0000003f0000003f, i.e. 0x3f in every dword lane
  3603  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_118] # ymm2 = [16,22,0,2,8,14,20,26]
  3604  	.p2align	4, 0x90
  3605  .LBB0_137:                              # =>This Inner Loop Header: Depth=1
  3606  	mov	ecx, dword ptr [rbx - 20]
  3607  	mov	edx, dword ptr [rbx - 16]
  3608  	mov	esi, edx
  3609  	shld	esi, ecx, 2                     # shld dst, src, n: dst = (dst << n) | (src >> (32-n)); stitches the field that straddles these two input dwords
  3610  	vmovd	xmm3, ecx
  3611  	vpbroadcastd	xmm4, xmm3              # first input dword in all four low lanes; the per-lane shifts pick different fields out of it
  3612  	vpinsrd	xmm3, xmm3, esi, 1
  3613  	vpinsrd	xmm3, xmm3, edx, 2
  3614  	vpinsrd	xmm3, xmm3, edx, 3
  3615  	vinserti128	ymm3, ymm4, xmm3, 1     # ymm3 = xmm4 in the low half, xmm3 in the high half
  3616  	vpsrlvd	ymm3, ymm3, ymm0                # per-lane shift moves each field down to bit 0
  3617  	vpand	ymm3, ymm3, ymm1                # keep the low 6 bits of each lane
  3618  	vmovdqu	ymmword ptr [r15 - 96], ymm3    # first 8 output dwords of this iteration
  3619  	mov	ecx, dword ptr [rbx - 16]
  3620  	mov	edx, dword ptr [rbx - 12]
  3621  	mov	esi, edx
  3622  	shld	esi, ecx, 4
  3623  	vmovd	xmm3, ecx
  3624  	vpinsrd	xmm3, xmm3, ecx, 1
  3625  	vpinsrd	xmm3, xmm3, esi, 2
  3626  	vpinsrd	xmm3, xmm3, edx, 3
  3627  	vmovd	xmm4, edx
  3628  	vpbroadcastd	xmm4, xmm4
  3629  	vinserti128	ymm3, ymm3, xmm4, 1     # here xmm3 is the low half and xmm4 the high half
  3630  	vpsrlvd	ymm3, ymm3, ymm2
  3631  	vpand	ymm3, ymm3, ymm1
  3632  	vmovdqu	ymmword ptr [r15 - 64], ymm3    # second 8 output dwords
  3633  	mov	ecx, dword ptr [rbx - 8]        # the remaining two groups repeat the same pattern 12 bytes further into the input
  3634  	mov	edx, dword ptr [rbx - 4]
  3635  	mov	esi, edx
  3636  	shld	esi, ecx, 2
  3637  	vmovd	xmm3, ecx
  3638  	vpinsrd	xmm4, xmm3, esi, 1
  3639  	vpinsrd	xmm4, xmm4, edx, 2
  3640  	vpbroadcastd	xmm3, xmm3
  3641  	vpinsrd	xmm4, xmm4, edx, 3
  3642  	vinserti128	ymm3, ymm3, xmm4, 1
  3643  	vpsrlvd	ymm3, ymm3, ymm0
  3644  	vpand	ymm3, ymm3, ymm1
  3645  	vmovdqu	ymmword ptr [r15 - 32], ymm3    # third 8 output dwords
  3646  	mov	ecx, dword ptr [rbx - 4]
  3647  	mov	edx, dword ptr [rbx]
  3648  	mov	esi, edx
  3649  	shld	esi, ecx, 4
  3650  	vmovd	xmm3, ecx
  3651  	vpinsrd	xmm3, xmm3, ecx, 1
  3652  	vpinsrd	xmm3, xmm3, esi, 2
  3653  	vpinsrd	xmm3, xmm3, edx, 3
  3654  	vmovd	xmm4, edx
  3655  	vpbroadcastd	xmm4, xmm4
  3656  	vinserti128	ymm3, ymm3, xmm4, 1
  3657  	vpsrlvd	ymm3, ymm3, ymm2
  3658  	vpand	ymm3, ymm3, ymm1
  3659  	vmovdqu	ymmword ptr [r15], ymm3         # last 8 output dwords
  3660  	sub	r15, -128                       # r15 += 128: the 32 dwords just written
  3661  	add	rbx, 24                         # rbx += 24: 32 fields * 6 bits = 24 input bytes
  3662  	add	rax, -1
  3663  	jne	.LBB0_137                       # loop until the trip count reaches zero
  3664  	jmp	.LBB0_147
  3665  .LBB0_111:                              # Arm whose per-dword mask is 0x3fffff: unpacks 22-bit fields, 32 output dwords per iteration
  3666  	cmp	edx, 32
  3667  	jl	.LBB0_147                       # edx < 32 (signed compare): skip the loop, go straight to the exit
  3668  # %bb.112:
  3669  	mov	r8d, r14d                       # r8 = trip count (r14d zero-extended); r14 is set outside this view
  3670  	add	r15, 96                         # bias r15 so the four 32-byte stores land at [r15-96], [r15-64], [r15-32], [r15]
  3671  	add	rbx, 84                         # bias rbx so the loads below address [rbx-84] .. [rbx]
  3672  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_55] # ymm0 = [0,0,0,2,0,0,4,0]
  3673  	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_56] # ymm1 = [18014394218708991,18014394218708991,18014394218708991,18014394218708991]; 0x3fffff in every dword lane
  3674  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_57] # ymm2 = [0,6,0,0,8,0,0,10]
  3675  	.p2align	4, 0x90
  3676  .LBB0_113:                              # =>This Inner Loop Header: Depth=1
  3677  	mov	r10d, dword ptr [rbx - 68]
  3678  	mov	r9d, dword ptr [rbx - 64]
  3679  	shld	r9d, r10d, 6                    # shld dst, src, n: dst = (dst << n) | (src >> (32-n)); stitches a field that straddles two adjacent input dwords
  3680  	mov	esi, dword ptr [rbx - 72]
  3681  	mov	edi, r10d
  3682  	shld	edi, esi, 18
  3683  	mov	edx, dword ptr [rbx - 76]
  3684  	shld	esi, edx, 8
  3685  	mov	r11d, dword ptr [rbx - 84]
  3686  	mov	ecx, dword ptr [rbx - 80]
  3687  	mov	eax, edx
  3688  	shld	eax, ecx, 20
  3689  	shld	ecx, r11d, 10
  3690  	vmovd	xmm3, r11d
  3691  	vmovd	xmm4, esi
  3692  	vpinsrd	xmm3, xmm3, ecx, 1
  3693  	vpinsrd	xmm4, xmm4, edi, 1
  3694  	vpinsrd	xmm3, xmm3, eax, 2
  3695  	vpinsrd	xmm4, xmm4, r10d, 2
  3696  	vpinsrd	xmm3, xmm3, edx, 3
  3697  	vpinsrd	xmm4, xmm4, r9d, 3
  3698  	vinserti128	ymm3, ymm3, xmm4, 1     # ymm3 = xmm3 in the low half, xmm4 in the high half
  3699  	vpsrlvd	ymm3, ymm3, ymm0                # per-lane shift moves each field down to bit 0
  3700  	vpand	ymm3, ymm3, ymm1                # keep the low 22 bits of each lane
  3701  	vmovdqu	ymmword ptr [r15 - 96], ymm3    # first 8 output dwords of this iteration
  3702  	mov	r9d, dword ptr [rbx - 44]
  3703  	mov	ecx, dword ptr [rbx - 48]
  3704  	mov	r10d, r9d
  3705  	shld	r10d, ecx, 12
  3706  	mov	esi, dword ptr [rbx - 52]
  3707  	shld	ecx, esi, 2
  3708  	mov	edi, dword ptr [rbx - 56]
  3709  	vmovd	xmm3, esi                       # captures esi before the shld below overwrites it
  3710  	shld	esi, edi, 14
  3711  	mov	eax, dword ptr [rbx - 64]
  3712  	mov	edx, dword ptr [rbx - 60]
  3713  	shld	edi, edx, 4
  3714  	shrd	eax, edx, 16                    # eax = (eax >> 16) | (edx << 16)
  3715  	vpinsrd	xmm3, xmm3, ecx, 1
  3716  	vmovd	xmm4, eax
  3717  	vpinsrd	xmm3, xmm3, r10d, 2
  3718  	vpinsrd	xmm4, xmm4, edx, 1
  3719  	vpinsrd	xmm3, xmm3, r9d, 3
  3720  	vpinsrd	xmm4, xmm4, edi, 2
  3721  	vpinsrd	xmm4, xmm4, esi, 3
  3722  	vinserti128	ymm3, ymm4, xmm3, 1     # here xmm4 is the low half and xmm3 the high half
  3723  	vpsrlvd	ymm3, ymm3, ymm2
  3724  	vpand	ymm3, ymm3, ymm1
  3725  	vmovdqu	ymmword ptr [r15 - 64], ymm3    # second 8 output dwords
  3726  	mov	r10d, dword ptr [rbx - 24]      # the remaining two groups repeat the same pattern 44 bytes further into the input
  3727  	mov	r9d, dword ptr [rbx - 20]
  3728  	shld	r9d, r10d, 6
  3729  	mov	edx, dword ptr [rbx - 28]
  3730  	mov	esi, r10d
  3731  	shld	esi, edx, 18
  3732  	mov	ecx, dword ptr [rbx - 32]
  3733  	shld	edx, ecx, 8
  3734  	mov	r11d, dword ptr [rbx - 40]
  3735  	mov	eax, dword ptr [rbx - 36]
  3736  	mov	edi, ecx
  3737  	shld	edi, eax, 20
  3738  	shld	eax, r11d, 10
  3739  	vmovd	xmm3, r11d
  3740  	vmovd	xmm4, edx
  3741  	vpinsrd	xmm3, xmm3, eax, 1
  3742  	vpinsrd	xmm4, xmm4, esi, 1
  3743  	vpinsrd	xmm3, xmm3, edi, 2
  3744  	vpinsrd	xmm4, xmm4, r10d, 2
  3745  	vpinsrd	xmm3, xmm3, ecx, 3
  3746  	vpinsrd	xmm4, xmm4, r9d, 3
  3747  	vinserti128	ymm3, ymm3, xmm4, 1
  3748  	vpsrlvd	ymm3, ymm3, ymm0
  3749  	vpand	ymm3, ymm3, ymm1
  3750  	vmovdqu	ymmword ptr [r15 - 32], ymm3    # third 8 output dwords
  3751  	mov	r9d, dword ptr [rbx]
  3752  	mov	ecx, dword ptr [rbx - 4]
  3753  	mov	r10d, r9d
  3754  	shld	r10d, ecx, 12
  3755  	mov	esi, dword ptr [rbx - 8]
  3756  	shld	ecx, esi, 2
  3757  	mov	edi, dword ptr [rbx - 12]
  3758  	vmovd	xmm3, esi
  3759  	shld	esi, edi, 14
  3760  	mov	eax, dword ptr [rbx - 20]
  3761  	mov	edx, dword ptr [rbx - 16]
  3762  	shld	edi, edx, 4
  3763  	shrd	eax, edx, 16
  3764  	vpinsrd	xmm3, xmm3, ecx, 1
  3765  	vmovd	xmm4, eax
  3766  	vpinsrd	xmm3, xmm3, r10d, 2
  3767  	vpinsrd	xmm4, xmm4, edx, 1
  3768  	vpinsrd	xmm3, xmm3, r9d, 3
  3769  	vpinsrd	xmm4, xmm4, edi, 2
  3770  	vpinsrd	xmm4, xmm4, esi, 3
  3771  	vinserti128	ymm3, ymm4, xmm3, 1
  3772  	vpsrlvd	ymm3, ymm3, ymm2
  3773  	vpand	ymm3, ymm3, ymm1
  3774  	vmovdqu	ymmword ptr [r15], ymm3         # last 8 output dwords
  3775  	sub	r15, -128                       # r15 += 128: the 32 dwords just written
  3776  	add	rbx, 88                         # rbx += 88: 32 fields * 22 bits = 88 input bytes
  3777  	add	r8, -1
  3778  	jne	.LBB0_113                       # loop until the trip count reaches zero
  3779  	jmp	.LBB0_147
  3780  .LBB0_123:                              # Arm whose per-dword mask is 0x3fff: unpacks 14-bit fields, 32 output dwords per iteration
  3781  	cmp	edx, 32
  3782  	jl	.LBB0_147                       # edx < 32 (signed compare): skip the loop, go straight to the exit
  3783  # %bb.124:
  3784  	mov	r8d, r14d                       # r8 = trip count (r14d zero-extended); r14 is set outside this view
  3785  	add	r15, 96                         # bias r15 so the four 32-byte stores land at [r15-96], [r15-64], [r15-32], [r15]
  3786  	add	rbx, 52                         # bias rbx so the loads below address [rbx-52] .. [rbx]
  3787  	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_86] # ymm0 = [0,14,0,10,0,6,0,2]
  3788  	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_87] # ymm1 = [70364449226751,70364449226751,70364449226751,70364449226751]; 0x3fff in every dword lane
  3789  	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_88] # ymm2 = [16,0,12,0,8,0,4,18]
  3790  	.p2align	4, 0x90
  3791  .LBB0_125:                              # =>This Inner Loop Header: Depth=1
  3792  	mov	r9d, dword ptr [rbx - 40]
  3793  	mov	ecx, dword ptr [rbx - 44]
  3794  	mov	esi, r9d
  3795  	shld	esi, ecx, 12                    # shld dst, src, n: dst = (dst << n) | (src >> (32-n)); stitches a field that straddles two adjacent input dwords
  3796  	mov	edi, dword ptr [rbx - 52]
  3797  	mov	r10d, dword ptr [rbx - 48]
  3798  	mov	edx, ecx
  3799  	shld	edx, r10d, 8
  3800  	mov	eax, r10d
  3801  	shld	eax, edi, 4
  3802  	vmovd	xmm3, edx
  3803  	vpinsrd	xmm3, xmm3, ecx, 1
  3804  	vpinsrd	xmm3, xmm3, esi, 2
  3805  	vpinsrd	xmm3, xmm3, r9d, 3
  3806  	vmovd	xmm4, edi
  3807  	vpinsrd	xmm4, xmm4, edi, 1              # same dword placed in two lanes: the per-lane shifts pick different fields out of it
  3808  	vpinsrd	xmm4, xmm4, eax, 2
  3809  	vpinsrd	xmm4, xmm4, r10d, 3
  3810  	vinserti128	ymm3, ymm4, xmm3, 1     # ymm3 = xmm4 in the low half, xmm3 in the high half
  3811  	vpsrlvd	ymm3, ymm3, ymm0                # per-lane shift moves each field down to bit 0
  3812  	vpand	ymm3, ymm3, ymm1                # keep the low 14 bits of each lane
  3813  	vmovdqu	ymmword ptr [r15 - 96], ymm3    # first 8 output dwords of this iteration
  3814  	mov	eax, dword ptr [rbx - 28]
  3815  	mov	ecx, dword ptr [rbx - 32]
  3816  	mov	edx, eax
  3817  	shld	edx, ecx, 10
  3818  	mov	r9d, dword ptr [rbx - 40]
  3819  	mov	esi, dword ptr [rbx - 36]
  3820  	vmovd	xmm3, ecx                       # captures ecx before the shld below overwrites it
  3821  	shld	ecx, esi, 6
  3822  	mov	edi, esi
  3823  	shld	edi, r9d, 2
  3824  	vmovd	xmm4, r9d
  3825  	vpinsrd	xmm4, xmm4, edi, 1
  3826  	vpinsrd	xmm4, xmm4, esi, 2
  3827  	vpinsrd	xmm4, xmm4, ecx, 3
  3828  	vpinsrd	xmm3, xmm3, edx, 1
  3829  	vpinsrd	xmm3, xmm3, eax, 2
  3830  	vpinsrd	xmm3, xmm3, eax, 3
  3831  	vinserti128	ymm3, ymm4, xmm3, 1
  3832  	vpsrlvd	ymm3, ymm3, ymm2
  3833  	vpand	ymm3, ymm3, ymm1
  3834  	vmovdqu	ymmword ptr [r15 - 64], ymm3    # second 8 output dwords
  3835  	mov	r9d, dword ptr [rbx - 12]       # the remaining two groups repeat the same pattern 28 bytes further into the input
  3836  	mov	eax, dword ptr [rbx - 16]
  3837  	mov	edx, r9d
  3838  	shld	edx, eax, 12
  3839  	mov	esi, dword ptr [rbx - 24]
  3840  	mov	r10d, dword ptr [rbx - 20]
  3841  	mov	ecx, eax
  3842  	shld	ecx, r10d, 8
  3843  	mov	edi, r10d
  3844  	shld	edi, esi, 4
  3845  	vmovd	xmm3, ecx
  3846  	vpinsrd	xmm3, xmm3, eax, 1
  3847  	vpinsrd	xmm3, xmm3, edx, 2
  3848  	vpinsrd	xmm3, xmm3, r9d, 3
  3849  	vmovd	xmm4, esi
  3850  	vpinsrd	xmm4, xmm4, esi, 1
  3851  	vpinsrd	xmm4, xmm4, edi, 2
  3852  	vpinsrd	xmm4, xmm4, r10d, 3
  3853  	vinserti128	ymm3, ymm4, xmm3, 1
  3854  	vpsrlvd	ymm3, ymm3, ymm0
  3855  	vpand	ymm3, ymm3, ymm1
  3856  	vmovdqu	ymmword ptr [r15 - 32], ymm3    # third 8 output dwords
  3857  	mov	r9d, dword ptr [rbx]
  3858  	mov	ecx, dword ptr [rbx - 4]
  3859  	mov	edx, r9d
  3860  	shld	edx, ecx, 10
  3861  	mov	eax, dword ptr [rbx - 8]
  3862  	vmovd	xmm3, ecx
  3863  	shld	ecx, eax, 6
  3864  	mov	edi, dword ptr [rbx - 12]
  3865  	mov	esi, eax
  3866  	shld	esi, edi, 2
  3867  	vmovd	xmm4, edi
  3868  	vpinsrd	xmm4, xmm4, esi, 1
  3869  	vpinsrd	xmm4, xmm4, eax, 2
  3870  	vpinsrd	xmm4, xmm4, ecx, 3
  3871  	vpinsrd	xmm3, xmm3, edx, 1
  3872  	vpinsrd	xmm3, xmm3, r9d, 2
  3873  	vpinsrd	xmm3, xmm3, r9d, 3
  3874  	vinserti128	ymm3, ymm4, xmm3, 1
  3875  	vpsrlvd	ymm3, ymm3, ymm2
  3876  	vpand	ymm3, ymm3, ymm1
  3877  	vmovdqu	ymmword ptr [r15], ymm3         # last 8 output dwords
  3878  	sub	r15, -128                       # r15 += 128: the 32 dwords just written
  3879  	add	rbx, 56                         # rbx += 56: 32 fields * 14 bits = 56 input bytes
  3880  	add	r8, -1
  3881  	jne	.LBB0_125                       # loop until the trip count reaches zero
  3882  	jmp	.LBB0_147
  3883  .LBB0_99:                               # Arm whose per-dword mask is 0x3fffffff: unpacks 30-bit fields, 32 output dwords per iteration
  3884  	cmp	edx, 32
  3885  	jl	.LBB0_147                       # edx < 32 (signed compare): skip the loop, go straight to the exit
  3886  # %bb.100:
  3887  	mov	r8d, r14d                       # r8 = trip count (r14d zero-extended); r14 is set outside this view
  3888  	add	r15, 96                         # bias r15 so the four 32-byte stores land at [r15-96], [r15-64], [r15-32], [r15]
  3889  	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_8] # ymm0 = [4611686015206162431,4611686015206162431,4611686015206162431,4611686015206162431]; 0x3fffffff in every dword lane
  3890  	add	rbx, 116                        # bias rbx so the loads below address [rbx-116] .. [rbx]
  3891  	vmovdqa	xmm1, xmmword ptr [rip + .LCPI0_9] # xmm1 = [16,14,12,10]
  3892  	vmovdqa	xmm2, xmmword ptr [rip + .LCPI0_10] # xmm2 = [16,18,20,22]
  3893  	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_11] # ymm3 = [0,0,0,0,0,0,0,2]
  3894  	.p2align	4, 0x90
  3895  .LBB0_101:                              # =>This Inner Loop Header: Depth=1
  3896  	mov	r11d, dword ptr [rbx - 92]
  3897  	mov	r9d, dword ptr [rbx - 88]
  3898  	shld	r9d, r11d, 14                   # shld dst, src, n: dst = (dst << n) | (src >> (32-n)); stitches a field that straddles two adjacent input dwords
  3899  	mov	esi, dword ptr [rbx - 96]
  3900  	shld	r11d, esi, 12
  3901  	mov	edi, dword ptr [rbx - 100]
  3902  	shld	esi, edi, 10
  3903  	mov	eax, dword ptr [rbx - 104]
  3904  	shld	edi, eax, 8
  3905  	mov	edx, dword ptr [rbx - 108]
  3906  	shld	eax, edx, 6
  3907  	mov	r10d, dword ptr [rbx - 116]
  3908  	mov	ecx, dword ptr [rbx - 112]
  3909  	shld	edx, ecx, 4
  3910  	shld	ecx, r10d, 2
  3911  	vmovd	xmm4, r10d
  3912  	vmovd	xmm5, edi
  3913  	vpinsrd	xmm4, xmm4, ecx, 1
  3914  	vpinsrd	xmm5, xmm5, esi, 1
  3915  	vpinsrd	xmm4, xmm4, edx, 2
  3916  	vpinsrd	xmm5, xmm5, r11d, 2
  3917  	vpinsrd	xmm4, xmm4, eax, 3
  3918  	vpinsrd	xmm5, xmm5, r9d, 3
  3919  	vinserti128	ymm4, ymm4, xmm5, 1     # ymm4 = xmm4 in the low half, xmm5 in the high half
  3920  	vpand	ymm4, ymm4, ymm0                # no per-lane shift needed for this group; keep the low 30 bits of each lane
  3921  	vmovdqu	ymmword ptr [r15 - 96], ymm4    # first 8 output dwords of this iteration
  3922  	mov	eax, dword ptr [rbx - 60]
  3923  	mov	ecx, dword ptr [rbx - 64]
  3924  	mov	edx, eax
  3925  	shld	edx, ecx, 28
  3926  	mov	esi, dword ptr [rbx - 68]
  3927  	mov	edi, dword ptr [rbx - 72]
  3928  	shld	ecx, esi, 26
  3929  	shld	esi, edi, 24
  3930  	vmovdqu	xmm4, xmmword ptr [rbx - 88]    # four consecutive input dwords
  3931  	vpsrlvd	xmm5, xmm4, xmm1                # low parts of four straddling fields: dwords >> [16,14,12,10]
  3932  	vpshufd	xmm4, xmm4, 249                 # xmm4 = xmm4[1,2,3,3]
  3933  	vpinsrd	xmm4, xmm4, edi, 3              # lanes now hold the next-higher input dword of each pair
  3934  	vmovd	xmm6, esi
  3935  	vpinsrd	xmm6, xmm6, ecx, 1
  3936  	vpinsrd	xmm6, xmm6, edx, 2
  3937  	vpsllvd	xmm4, xmm4, xmm2                # high parts: dwords << [16,18,20,22]
  3938  	vpinsrd	xmm6, xmm6, eax, 3
  3939  	vpor	xmm4, xmm5, xmm4                # vector form of the shld stitch for four lanes at once
  3940  	vinserti128	ymm4, ymm4, xmm6, 1
  3941  	vpsrlvd	ymm4, ymm4, ymm3                # only lane 7 still needs a shift (by 2)
  3942  	vpand	ymm4, ymm4, ymm0
  3943  	vmovdqu	ymmword ptr [r15 - 64], ymm4    # second 8 output dwords
  3944  	mov	r11d, dword ptr [rbx - 32]      # the remaining two groups repeat the same pattern 60 bytes further into the input
  3945  	mov	r9d, dword ptr [rbx - 28]
  3946  	shld	r9d, r11d, 14
  3947  	mov	edx, dword ptr [rbx - 36]
  3948  	shld	r11d, edx, 12
  3949  	mov	esi, dword ptr [rbx - 40]
  3950  	shld	edx, esi, 10
  3951  	mov	edi, dword ptr [rbx - 44]
  3952  	shld	esi, edi, 8
  3953  	mov	ecx, dword ptr [rbx - 48]
  3954  	shld	edi, ecx, 6
  3955  	mov	r10d, dword ptr [rbx - 56]
  3956  	mov	eax, dword ptr [rbx - 52]
  3957  	shld	ecx, eax, 4
  3958  	shld	eax, r10d, 2
  3959  	vmovd	xmm4, r10d
  3960  	vmovd	xmm5, esi
  3961  	vpinsrd	xmm4, xmm4, eax, 1
  3962  	vpinsrd	xmm5, xmm5, edx, 1
  3963  	vpinsrd	xmm4, xmm4, ecx, 2
  3964  	vpinsrd	xmm5, xmm5, r11d, 2
  3965  	vpinsrd	xmm4, xmm4, edi, 3
  3966  	vpinsrd	xmm5, xmm5, r9d, 3
  3967  	vinserti128	ymm4, ymm4, xmm5, 1
  3968  	vpand	ymm4, ymm4, ymm0
  3969  	vmovdqu	ymmword ptr [r15 - 32], ymm4    # third 8 output dwords
  3970  	mov	eax, dword ptr [rbx]
  3971  	mov	ecx, dword ptr [rbx - 4]
  3972  	mov	edx, eax
  3973  	shld	edx, ecx, 28
  3974  	mov	esi, dword ptr [rbx - 8]
  3975  	shld	ecx, esi, 26
  3976  	mov	edi, dword ptr [rbx - 12]
  3977  	vmovdqu	xmm4, xmmword ptr [rbx - 28]
  3978  	shld	esi, edi, 24
  3979  	vpsrlvd	xmm5, xmm4, xmm1
  3980  	vpshufd	xmm4, xmm4, 249                 # xmm4 = xmm4[1,2,3,3]
  3981  	vpinsrd	xmm4, xmm4, edi, 3
  3982  	vmovd	xmm6, esi
  3983  	vpinsrd	xmm6, xmm6, ecx, 1
  3984  	vpsllvd	xmm4, xmm4, xmm2
  3985  	vpinsrd	xmm6, xmm6, edx, 2
  3986  	vpinsrd	xmm6, xmm6, eax, 3
  3987  	vpor	xmm4, xmm5, xmm4
  3988  	vinserti128	ymm4, ymm4, xmm6, 1
  3989  	vpsrlvd	ymm4, ymm4, ymm3
  3990  	vpand	ymm4, ymm4, ymm0
  3991  	vmovdqu	ymmword ptr [r15], ymm4         # last 8 output dwords
  3992  	sub	r15, -128                       # r15 += 128: the 32 dwords just written
  3993  	add	rbx, 120                        # rbx += 120: 32 fields * 30 bits = 120 input bytes
  3994  	add	r8, -1
  3995  	jne	.LBB0_101                       # loop until the trip count reaches zero, then fall through to the exit label that follows
  3996  .LBB0_147:                              # Common exit: every arm in this view jumps or falls through to here
  3997  	shl	r14d, 5                         # r14d *= 32
  3998  	mov	eax, r14d                       # return value in eax = r14d * 32 (presumably the number of values unpacked; r14 is set outside this view)
  3999  	lea	rsp, [rbp - 32]                 # point rsp at the 32-byte save area just below rbp (four 8-byte registers; the matching pushes are outside this view)
  4000  	pop	rbx
  4001  	pop	r12
  4002  	pop	r14
  4003  	pop	r15
  4004  	pop	rbp
  4005  	vzeroupper                              # zero the upper halves of the ymm registers before returning
  4006  	ret
  4007  .Lfunc_end0:
  4008  	.size	unpack32_avx2, .Lfunc_end0-unpack32_avx2
  4009                                          # -- End function
  4010  	.ident	"Debian clang version 11.1.0-++20210428103820+1fdec59bffc1-1~exp1~20210428204437.162"
  4011  	.section	".note.GNU-stack","",@progbits
  4012  	.addrsig