github.com/apache/arrow/go/v16@v16.1.0/arrow/compute/internal/kernels/_lib/base_arithmetic_sse4_amd64.s (about)

     1  	.text
     2  	.intel_syntax noprefix
     3  	.file	"base_arithmetic.cc"
     4  	.section	.rodata.cst16,"aM",@progbits,16
     5  	.p2align	4                               # -- Begin function arithmetic_binary_sse4
     6  .LCPI0_0:                               # 16-byte constant: eight 16-bit words, each 0x00ff
     7  	.short	255                             # 0xff  (word 0)
     8  	.short	255                             # 0xff  (word 1)
     9  	.short	255                             # 0xff  (word 2)
    10  	.short	255                             # 0xff  (word 3)
    11  	.short	255                             # 0xff  (word 4)
    12  	.short	255                             # 0xff  (word 5)
    13  	.short	255                             # 0xff  (word 6)
    14  	.short	255                             # 0xff  (word 7); NOTE(review): no use of .LCPI0_0 is visible in this excerpt - presumably a low-byte mask for vector code further down, TODO confirm
    15  	.text
    16  	.globl	arithmetic_binary_sse4
    17  	.p2align	4, 0x90
    18  	.type	arithmetic_binary_sse4,@function
    19  arithmetic_binary_sse4:                 # @arithmetic_binary_sse4
    20  # %bb.0:
    21  	push	rbp
    22  	mov	rbp, rsp
    23  	and	rsp, -8
    24  	cmp	sil, 20
    25  	jg	.LBB0_11
    26  # %bb.1:
    27  	test	sil, sil
    28  	je	.LBB0_21
    29  # %bb.2:
    30  	cmp	sil, 1
    31  	je	.LBB0_367
    32  # %bb.3:
    33  	cmp	sil, 2
    34  	jne	.LBB0_1013
    35  # %bb.4:
    36  	cmp	edi, 6
    37  	jg	.LBB0_719
    38  # %bb.5:
    39  	cmp	edi, 3
    40  	jle	.LBB0_6
    41  # %bb.713:
    42  	cmp	edi, 4
    43  	je	.LBB0_760
    44  # %bb.714:
    45  	cmp	edi, 5
    46  	je	.LBB0_776
    47  # %bb.715:
    48  	cmp	edi, 6
    49  	jne	.LBB0_1013
    50  # %bb.716:
    51  	test	r9d, r9d
    52  	jle	.LBB0_1013
    53  # %bb.717:
    54  	mov	r10d, r9d
    55  	cmp	r9d, 8
    56  	jae	.LBB0_792
    57  # %bb.718:
    58  	xor	esi, esi
    59  .LBB0_801:
    60  	mov	r9, rsi
    61  	not	r9
    62  	add	r9, r10
    63  	mov	rdi, r10
    64  	and	rdi, 3
    65  	je	.LBB0_803
    66  .LBB0_802:                              # =>This Inner Loop Header: Depth=1
    67  	mov	eax, dword ptr [rcx + 4*rsi]
    68  	imul	eax, dword ptr [rdx + 4*rsi]
    69  	mov	dword ptr [r8 + 4*rsi], eax
    70  	add	rsi, 1
    71  	add	rdi, -1
    72  	jne	.LBB0_802
    73  .LBB0_803:
    74  	cmp	r9, 3
    75  	jb	.LBB0_1013
    76  .LBB0_804:                              # =>This Inner Loop Header: Depth=1
    77  	mov	eax, dword ptr [rcx + 4*rsi]
    78  	imul	eax, dword ptr [rdx + 4*rsi]
    79  	mov	dword ptr [r8 + 4*rsi], eax
    80  	mov	eax, dword ptr [rcx + 4*rsi + 4]
    81  	imul	eax, dword ptr [rdx + 4*rsi + 4]
    82  	mov	dword ptr [r8 + 4*rsi + 4], eax
    83  	mov	eax, dword ptr [rcx + 4*rsi + 8]
    84  	imul	eax, dword ptr [rdx + 4*rsi + 8]
    85  	mov	dword ptr [r8 + 4*rsi + 8], eax
    86  	mov	eax, dword ptr [rcx + 4*rsi + 12]
    87  	imul	eax, dword ptr [rdx + 4*rsi + 12]
    88  	mov	dword ptr [r8 + 4*rsi + 12], eax
    89  	add	rsi, 4
    90  	cmp	r10, rsi
    91  	jne	.LBB0_804
    92  	jmp	.LBB0_1013
    93  .LBB0_11:
    94  	cmp	sil, 21
    95  	je	.LBB0_194
    96  # %bb.12:
    97  	cmp	sil, 22
    98  	je	.LBB0_540
    99  # %bb.13:
   100  	cmp	sil, 23
   101  	jne	.LBB0_1013
   102  # %bb.14:
   103  	cmp	edi, 6
   104  	jg	.LBB0_869
   105  # %bb.15:
   106  	cmp	edi, 3
   107  	jle	.LBB0_16
   108  # %bb.863:
   109  	cmp	edi, 4
   110  	je	.LBB0_910
   111  # %bb.864:
   112  	cmp	edi, 5
   113  	je	.LBB0_926
   114  # %bb.865:
   115  	cmp	edi, 6
   116  	jne	.LBB0_1013
   117  # %bb.866:
   118  	test	r9d, r9d
   119  	jle	.LBB0_1013
   120  # %bb.867:
   121  	mov	r10d, r9d
   122  	cmp	r9d, 8
   123  	jae	.LBB0_942
   124  # %bb.868:
   125  	xor	esi, esi
   126  .LBB0_951:
   127  	mov	r9, rsi
   128  	not	r9
   129  	add	r9, r10
   130  	mov	rdi, r10
   131  	and	rdi, 3
   132  	je	.LBB0_953
   133  .LBB0_952:                              # =>This Inner Loop Header: Depth=1
   134  	mov	eax, dword ptr [rcx + 4*rsi]
   135  	imul	eax, dword ptr [rdx + 4*rsi]
   136  	mov	dword ptr [r8 + 4*rsi], eax
   137  	add	rsi, 1
   138  	add	rdi, -1
   139  	jne	.LBB0_952
   140  .LBB0_953:
   141  	cmp	r9, 3
   142  	jb	.LBB0_1013
   143  .LBB0_954:                              # =>This Inner Loop Header: Depth=1
   144  	mov	eax, dword ptr [rcx + 4*rsi]
   145  	imul	eax, dword ptr [rdx + 4*rsi]
   146  	mov	dword ptr [r8 + 4*rsi], eax
   147  	mov	eax, dword ptr [rcx + 4*rsi + 4]
   148  	imul	eax, dword ptr [rdx + 4*rsi + 4]
   149  	mov	dword ptr [r8 + 4*rsi + 4], eax
   150  	mov	eax, dword ptr [rcx + 4*rsi + 8]
   151  	imul	eax, dword ptr [rdx + 4*rsi + 8]
   152  	mov	dword ptr [r8 + 4*rsi + 8], eax
   153  	mov	eax, dword ptr [rcx + 4*rsi + 12]
   154  	imul	eax, dword ptr [rdx + 4*rsi + 12]
   155  	mov	dword ptr [r8 + 4*rsi + 12], eax
   156  	add	rsi, 4
   157  	cmp	r10, rsi
   158  	jne	.LBB0_954
   159  	jmp	.LBB0_1013
   160  .LBB0_21:
   161  	cmp	edi, 6
   162  	jg	.LBB0_34
   163  # %bb.22:
   164  	cmp	edi, 3
   165  	jle	.LBB0_23
   166  # %bb.28:
   167  	cmp	edi, 4
   168  	je	.LBB0_75
   169  # %bb.29:
   170  	cmp	edi, 5
   171  	je	.LBB0_91
   172  # %bb.30:
   173  	cmp	edi, 6
   174  	jne	.LBB0_1013
   175  # %bb.31:
   176  	test	r9d, r9d
   177  	jle	.LBB0_1013
   178  # %bb.32:
   179  	mov	r10d, r9d
   180  	cmp	r9d, 8
   181  	jae	.LBB0_107
   182  # %bb.33:
   183  	xor	esi, esi
   184  .LBB0_116:
   185  	mov	r9, rsi
   186  	not	r9
   187  	add	r9, r10
   188  	mov	rdi, r10
   189  	and	rdi, 3
   190  	je	.LBB0_118
   191  .LBB0_117:                              # =>This Inner Loop Header: Depth=1
   192  	mov	eax, dword ptr [rcx + 4*rsi]
   193  	add	eax, dword ptr [rdx + 4*rsi]
   194  	mov	dword ptr [r8 + 4*rsi], eax
   195  	add	rsi, 1
   196  	add	rdi, -1
   197  	jne	.LBB0_117
   198  .LBB0_118:
   199  	cmp	r9, 3
   200  	jb	.LBB0_1013
   201  .LBB0_119:                              # =>This Inner Loop Header: Depth=1
   202  	mov	eax, dword ptr [rcx + 4*rsi]
   203  	add	eax, dword ptr [rdx + 4*rsi]
   204  	mov	dword ptr [r8 + 4*rsi], eax
   205  	mov	eax, dword ptr [rcx + 4*rsi + 4]
   206  	add	eax, dword ptr [rdx + 4*rsi + 4]
   207  	mov	dword ptr [r8 + 4*rsi + 4], eax
   208  	mov	eax, dword ptr [rcx + 4*rsi + 8]
   209  	add	eax, dword ptr [rdx + 4*rsi + 8]
   210  	mov	dword ptr [r8 + 4*rsi + 8], eax
   211  	mov	eax, dword ptr [rcx + 4*rsi + 12]
   212  	add	eax, dword ptr [rdx + 4*rsi + 12]
   213  	mov	dword ptr [r8 + 4*rsi + 12], eax
   214  	add	rsi, 4
   215  	cmp	r10, rsi
   216  	jne	.LBB0_119
   217  	jmp	.LBB0_1013
   218  .LBB0_367:
   219  	cmp	edi, 6
   220  	jg	.LBB0_380
   221  # %bb.368:
   222  	cmp	edi, 3
   223  	jle	.LBB0_369
   224  # %bb.374:
   225  	cmp	edi, 4
   226  	je	.LBB0_421
   227  # %bb.375:
   228  	cmp	edi, 5
   229  	je	.LBB0_437
   230  # %bb.376:
   231  	cmp	edi, 6
   232  	jne	.LBB0_1013
   233  # %bb.377:
   234  	test	r9d, r9d
   235  	jle	.LBB0_1013
   236  # %bb.378:
   237  	mov	r10d, r9d
   238  	cmp	r9d, 8
   239  	jae	.LBB0_453
   240  # %bb.379:
   241  	xor	esi, esi
   242  .LBB0_462:
   243  	mov	r9, rsi
   244  	not	r9
   245  	add	r9, r10
   246  	mov	rdi, r10
   247  	and	rdi, 3
   248  	je	.LBB0_464
   249  .LBB0_463:                              # =>This Inner Loop Header: Depth=1
   250  	mov	eax, dword ptr [rdx + 4*rsi]
   251  	sub	eax, dword ptr [rcx + 4*rsi]
   252  	mov	dword ptr [r8 + 4*rsi], eax
   253  	add	rsi, 1
   254  	add	rdi, -1
   255  	jne	.LBB0_463
   256  .LBB0_464:
   257  	cmp	r9, 3
   258  	jb	.LBB0_1013
   259  .LBB0_465:                              # =>This Inner Loop Header: Depth=1
   260  	mov	eax, dword ptr [rdx + 4*rsi]
   261  	sub	eax, dword ptr [rcx + 4*rsi]
   262  	mov	dword ptr [r8 + 4*rsi], eax
   263  	mov	eax, dword ptr [rdx + 4*rsi + 4]
   264  	sub	eax, dword ptr [rcx + 4*rsi + 4]
   265  	mov	dword ptr [r8 + 4*rsi + 4], eax
   266  	mov	eax, dword ptr [rdx + 4*rsi + 8]
   267  	sub	eax, dword ptr [rcx + 4*rsi + 8]
   268  	mov	dword ptr [r8 + 4*rsi + 8], eax
   269  	mov	eax, dword ptr [rdx + 4*rsi + 12]
   270  	sub	eax, dword ptr [rcx + 4*rsi + 12]
   271  	mov	dword ptr [r8 + 4*rsi + 12], eax
   272  	add	rsi, 4
   273  	cmp	r10, rsi
   274  	jne	.LBB0_465
   275  	jmp	.LBB0_1013
   276  .LBB0_194:
   277  	cmp	edi, 6
   278  	jg	.LBB0_207
   279  # %bb.195:
   280  	cmp	edi, 3
   281  	jle	.LBB0_196
   282  # %bb.201:
   283  	cmp	edi, 4
   284  	je	.LBB0_248
   285  # %bb.202:
   286  	cmp	edi, 5
   287  	je	.LBB0_264
   288  # %bb.203:
   289  	cmp	edi, 6
   290  	jne	.LBB0_1013
   291  # %bb.204:
   292  	test	r9d, r9d
   293  	jle	.LBB0_1013
   294  # %bb.205:
   295  	mov	r10d, r9d
   296  	cmp	r9d, 8
   297  	jae	.LBB0_280
   298  # %bb.206:
   299  	xor	esi, esi
   300  .LBB0_289:
   301  	mov	r9, rsi
   302  	not	r9
   303  	add	r9, r10
   304  	mov	rdi, r10
   305  	and	rdi, 3
   306  	je	.LBB0_291
   307  .LBB0_290:                              # =>This Inner Loop Header: Depth=1
   308  	mov	eax, dword ptr [rcx + 4*rsi]
   309  	add	eax, dword ptr [rdx + 4*rsi]
   310  	mov	dword ptr [r8 + 4*rsi], eax
   311  	add	rsi, 1
   312  	add	rdi, -1
   313  	jne	.LBB0_290
   314  .LBB0_291:
   315  	cmp	r9, 3
   316  	jb	.LBB0_1013
   317  .LBB0_292:                              # =>This Inner Loop Header: Depth=1
   318  	mov	eax, dword ptr [rcx + 4*rsi]
   319  	add	eax, dword ptr [rdx + 4*rsi]
   320  	mov	dword ptr [r8 + 4*rsi], eax
   321  	mov	eax, dword ptr [rcx + 4*rsi + 4]
   322  	add	eax, dword ptr [rdx + 4*rsi + 4]
   323  	mov	dword ptr [r8 + 4*rsi + 4], eax
   324  	mov	eax, dword ptr [rcx + 4*rsi + 8]
   325  	add	eax, dword ptr [rdx + 4*rsi + 8]
   326  	mov	dword ptr [r8 + 4*rsi + 8], eax
   327  	mov	eax, dword ptr [rcx + 4*rsi + 12]
   328  	add	eax, dword ptr [rdx + 4*rsi + 12]
   329  	mov	dword ptr [r8 + 4*rsi + 12], eax
   330  	add	rsi, 4
   331  	cmp	r10, rsi
   332  	jne	.LBB0_292
   333  	jmp	.LBB0_1013
   334  .LBB0_540:
   335  	cmp	edi, 6
   336  	jg	.LBB0_553
   337  # %bb.541:
   338  	cmp	edi, 3
   339  	jle	.LBB0_542
   340  # %bb.547:
   341  	cmp	edi, 4
   342  	je	.LBB0_594
   343  # %bb.548:
   344  	cmp	edi, 5
   345  	je	.LBB0_610
   346  # %bb.549:
   347  	cmp	edi, 6
   348  	jne	.LBB0_1013
   349  # %bb.550:
   350  	test	r9d, r9d
   351  	jle	.LBB0_1013
   352  # %bb.551:
   353  	mov	r10d, r9d
   354  	cmp	r9d, 8
   355  	jae	.LBB0_626
   356  # %bb.552:
   357  	xor	esi, esi
   358  .LBB0_635:
   359  	mov	r9, rsi
   360  	not	r9
   361  	add	r9, r10
   362  	mov	rdi, r10
   363  	and	rdi, 3
   364  	je	.LBB0_637
   365  .LBB0_636:                              # =>This Inner Loop Header: Depth=1
   366  	mov	eax, dword ptr [rdx + 4*rsi]
   367  	sub	eax, dword ptr [rcx + 4*rsi]
   368  	mov	dword ptr [r8 + 4*rsi], eax
   369  	add	rsi, 1
   370  	add	rdi, -1
   371  	jne	.LBB0_636
   372  .LBB0_637:
   373  	cmp	r9, 3
   374  	jb	.LBB0_1013
   375  .LBB0_638:                              # =>This Inner Loop Header: Depth=1
   376  	mov	eax, dword ptr [rdx + 4*rsi]
   377  	sub	eax, dword ptr [rcx + 4*rsi]
   378  	mov	dword ptr [r8 + 4*rsi], eax
   379  	mov	eax, dword ptr [rdx + 4*rsi + 4]
   380  	sub	eax, dword ptr [rcx + 4*rsi + 4]
   381  	mov	dword ptr [r8 + 4*rsi + 4], eax
   382  	mov	eax, dword ptr [rdx + 4*rsi + 8]
   383  	sub	eax, dword ptr [rcx + 4*rsi + 8]
   384  	mov	dword ptr [r8 + 4*rsi + 8], eax
   385  	mov	eax, dword ptr [rdx + 4*rsi + 12]
   386  	sub	eax, dword ptr [rcx + 4*rsi + 12]
   387  	mov	dword ptr [r8 + 4*rsi + 12], eax
   388  	add	rsi, 4
   389  	cmp	r10, rsi
   390  	jne	.LBB0_638
   391  	jmp	.LBB0_1013
   392  .LBB0_719:
   393  	cmp	edi, 8
   394  	jle	.LBB0_720
   395  # %bb.725:
   396  	cmp	edi, 9
   397  	je	.LBB0_826
   398  # %bb.726:
   399  	cmp	edi, 11
   400  	je	.LBB0_834
   401  # %bb.727:
   402  	cmp	edi, 12
   403  	jne	.LBB0_1013
   404  # %bb.728:
   405  	test	r9d, r9d
   406  	jle	.LBB0_1013
   407  # %bb.729:
   408  	mov	r10d, r9d
   409  	cmp	r9d, 4
   410  	jae	.LBB0_850
   411  # %bb.730:
   412  	xor	esi, esi
   413  .LBB0_859:
   414  	mov	rax, rsi
   415  	not	rax
   416  	add	rax, r10
   417  	mov	rdi, r10
   418  	and	rdi, 3
   419  	je	.LBB0_861
   420  .LBB0_860:                              # =>This Inner Loop Header: Depth=1
   421  	movsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
   422  	mulsd	xmm0, qword ptr [rdx + 8*rsi]
   423  	movsd	qword ptr [r8 + 8*rsi], xmm0
   424  	add	rsi, 1
   425  	add	rdi, -1
   426  	jne	.LBB0_860
   427  .LBB0_861:
   428  	cmp	rax, 3
   429  	jb	.LBB0_1013
   430  .LBB0_862:                              # =>This Inner Loop Header: Depth=1
   431  	movsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
   432  	mulsd	xmm0, qword ptr [rdx + 8*rsi]
   433  	movsd	qword ptr [r8 + 8*rsi], xmm0
   434  	movsd	xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
   435  	mulsd	xmm0, qword ptr [rdx + 8*rsi + 8]
   436  	movsd	qword ptr [r8 + 8*rsi + 8], xmm0
   437  	movsd	xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
   438  	mulsd	xmm0, qword ptr [rdx + 8*rsi + 16]
   439  	movsd	qword ptr [r8 + 8*rsi + 16], xmm0
   440  	movsd	xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
   441  	mulsd	xmm0, qword ptr [rdx + 8*rsi + 24]
   442  	movsd	qword ptr [r8 + 8*rsi + 24], xmm0
   443  	add	rsi, 4
   444  	cmp	r10, rsi
   445  	jne	.LBB0_862
   446  	jmp	.LBB0_1013
   447  .LBB0_869:
   448  	cmp	edi, 8
   449  	jle	.LBB0_870
   450  # %bb.875:
   451  	cmp	edi, 9
   452  	je	.LBB0_976
   453  # %bb.876:
   454  	cmp	edi, 11
   455  	je	.LBB0_984
   456  # %bb.877:
   457  	cmp	edi, 12
   458  	jne	.LBB0_1013
   459  # %bb.878:
   460  	test	r9d, r9d
   461  	jle	.LBB0_1013
   462  # %bb.879:
   463  	mov	r10d, r9d
   464  	cmp	r9d, 4
   465  	jae	.LBB0_1000
   466  # %bb.880:
   467  	xor	esi, esi
   468  .LBB0_1009:
   469  	mov	rax, rsi
   470  	not	rax
   471  	add	rax, r10
   472  	mov	rdi, r10
   473  	and	rdi, 3
   474  	je	.LBB0_1011
   475  .LBB0_1010:                             # =>This Inner Loop Header: Depth=1
   476  	movsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
   477  	mulsd	xmm0, qword ptr [rdx + 8*rsi]
   478  	movsd	qword ptr [r8 + 8*rsi], xmm0
   479  	add	rsi, 1
   480  	add	rdi, -1
   481  	jne	.LBB0_1010
   482  .LBB0_1011:
   483  	cmp	rax, 3
   484  	jb	.LBB0_1013
   485  .LBB0_1012:                             # =>This Inner Loop Header: Depth=1
   486  	movsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
   487  	mulsd	xmm0, qword ptr [rdx + 8*rsi]
   488  	movsd	qword ptr [r8 + 8*rsi], xmm0
   489  	movsd	xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
   490  	mulsd	xmm0, qword ptr [rdx + 8*rsi + 8]
   491  	movsd	qword ptr [r8 + 8*rsi + 8], xmm0
   492  	movsd	xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
   493  	mulsd	xmm0, qword ptr [rdx + 8*rsi + 16]
   494  	movsd	qword ptr [r8 + 8*rsi + 16], xmm0
   495  	movsd	xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
   496  	mulsd	xmm0, qword ptr [rdx + 8*rsi + 24]
   497  	movsd	qword ptr [r8 + 8*rsi + 24], xmm0
   498  	add	rsi, 4
   499  	cmp	r10, rsi
   500  	jne	.LBB0_1012
   501  	jmp	.LBB0_1013
   502  .LBB0_34:
   503  	cmp	edi, 8
   504  	jle	.LBB0_35
   505  # %bb.40:
   506  	cmp	edi, 9
   507  	je	.LBB0_149
   508  # %bb.41:
   509  	cmp	edi, 11
   510  	je	.LBB0_165
   511  # %bb.42:
   512  	cmp	edi, 12
   513  	jne	.LBB0_1013
   514  # %bb.43:
   515  	test	r9d, r9d
   516  	jle	.LBB0_1013
   517  # %bb.44:
   518  	mov	r10d, r9d
   519  	cmp	r9d, 4
   520  	jae	.LBB0_181
   521  # %bb.45:
   522  	xor	esi, esi
   523  .LBB0_190:
   524  	mov	rax, rsi
   525  	not	rax
   526  	add	rax, r10
   527  	mov	rdi, r10
   528  	and	rdi, 3
   529  	je	.LBB0_192
   530  .LBB0_191:                              # =>This Inner Loop Header: Depth=1
   531  	movsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
   532  	addsd	xmm0, qword ptr [rdx + 8*rsi]
   533  	movsd	qword ptr [r8 + 8*rsi], xmm0
   534  	add	rsi, 1
   535  	add	rdi, -1
   536  	jne	.LBB0_191
   537  .LBB0_192:
   538  	cmp	rax, 3
   539  	jb	.LBB0_1013
   540  .LBB0_193:                              # =>This Inner Loop Header: Depth=1
   541  	movsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
   542  	addsd	xmm0, qword ptr [rdx + 8*rsi]
   543  	movsd	qword ptr [r8 + 8*rsi], xmm0
   544  	movsd	xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
   545  	addsd	xmm0, qword ptr [rdx + 8*rsi + 8]
   546  	movsd	qword ptr [r8 + 8*rsi + 8], xmm0
   547  	movsd	xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
   548  	addsd	xmm0, qword ptr [rdx + 8*rsi + 16]
   549  	movsd	qword ptr [r8 + 8*rsi + 16], xmm0
   550  	movsd	xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
   551  	addsd	xmm0, qword ptr [rdx + 8*rsi + 24]
   552  	movsd	qword ptr [r8 + 8*rsi + 24], xmm0
   553  	add	rsi, 4
   554  	cmp	r10, rsi
   555  	jne	.LBB0_193
   556  	jmp	.LBB0_1013
   557  .LBB0_380:
   558  	cmp	edi, 8
   559  	jle	.LBB0_381
   560  # %bb.386:
   561  	cmp	edi, 9
   562  	je	.LBB0_495
   563  # %bb.387:
   564  	cmp	edi, 11
   565  	je	.LBB0_511
   566  # %bb.388:
   567  	cmp	edi, 12
   568  	jne	.LBB0_1013
   569  # %bb.389:
   570  	test	r9d, r9d
   571  	jle	.LBB0_1013
   572  # %bb.390:
   573  	mov	r10d, r9d
   574  	cmp	r9d, 4
   575  	jae	.LBB0_527
   576  # %bb.391:
   577  	xor	esi, esi
   578  .LBB0_536:
   579  	mov	rax, rsi
   580  	not	rax
   581  	add	rax, r10
   582  	mov	rdi, r10
   583  	and	rdi, 3
   584  	je	.LBB0_538
   585  .LBB0_537:                              # =>This Inner Loop Header: Depth=1
   586  	movsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
   587  	subsd	xmm0, qword ptr [rcx + 8*rsi]
   588  	movsd	qword ptr [r8 + 8*rsi], xmm0
   589  	add	rsi, 1
   590  	add	rdi, -1
   591  	jne	.LBB0_537
   592  .LBB0_538:
   593  	cmp	rax, 3
   594  	jb	.LBB0_1013
   595  .LBB0_539:                              # =>This Inner Loop Header: Depth=1
   596  	movsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
   597  	subsd	xmm0, qword ptr [rcx + 8*rsi]
   598  	movsd	qword ptr [r8 + 8*rsi], xmm0
   599  	movsd	xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
   600  	subsd	xmm0, qword ptr [rcx + 8*rsi + 8]
   601  	movsd	qword ptr [r8 + 8*rsi + 8], xmm0
   602  	movsd	xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
   603  	subsd	xmm0, qword ptr [rcx + 8*rsi + 16]
   604  	movsd	qword ptr [r8 + 8*rsi + 16], xmm0
   605  	movsd	xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
   606  	subsd	xmm0, qword ptr [rcx + 8*rsi + 24]
   607  	movsd	qword ptr [r8 + 8*rsi + 24], xmm0
   608  	add	rsi, 4
   609  	cmp	r10, rsi
   610  	jne	.LBB0_539
   611  	jmp	.LBB0_1013
   612  .LBB0_207:
   613  	cmp	edi, 8
   614  	jle	.LBB0_208
   615  # %bb.213:
   616  	cmp	edi, 9
   617  	je	.LBB0_322
   618  # %bb.214:
   619  	cmp	edi, 11
   620  	je	.LBB0_338
   621  # %bb.215:
   622  	cmp	edi, 12
   623  	jne	.LBB0_1013
   624  # %bb.216:
   625  	test	r9d, r9d
   626  	jle	.LBB0_1013
   627  # %bb.217:
   628  	mov	r10d, r9d
   629  	cmp	r9d, 4
   630  	jae	.LBB0_354
   631  # %bb.218:
   632  	xor	esi, esi
   633  .LBB0_363:
   634  	mov	rax, rsi
   635  	not	rax
   636  	add	rax, r10
   637  	mov	rdi, r10
   638  	and	rdi, 3
   639  	je	.LBB0_365
   640  .LBB0_364:                              # =>This Inner Loop Header: Depth=1
   641  	movsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
   642  	addsd	xmm0, qword ptr [rdx + 8*rsi]
   643  	movsd	qword ptr [r8 + 8*rsi], xmm0
   644  	add	rsi, 1
   645  	add	rdi, -1
   646  	jne	.LBB0_364
   647  .LBB0_365:
   648  	cmp	rax, 3
   649  	jb	.LBB0_1013
   650  .LBB0_366:                              # =>This Inner Loop Header: Depth=1
   651  	movsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
   652  	addsd	xmm0, qword ptr [rdx + 8*rsi]
   653  	movsd	qword ptr [r8 + 8*rsi], xmm0
   654  	movsd	xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
   655  	addsd	xmm0, qword ptr [rdx + 8*rsi + 8]
   656  	movsd	qword ptr [r8 + 8*rsi + 8], xmm0
   657  	movsd	xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
   658  	addsd	xmm0, qword ptr [rdx + 8*rsi + 16]
   659  	movsd	qword ptr [r8 + 8*rsi + 16], xmm0
   660  	movsd	xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
   661  	addsd	xmm0, qword ptr [rdx + 8*rsi + 24]
   662  	movsd	qword ptr [r8 + 8*rsi + 24], xmm0
   663  	add	rsi, 4
   664  	cmp	r10, rsi
   665  	jne	.LBB0_366
   666  	jmp	.LBB0_1013
   667  .LBB0_553:
   668  	cmp	edi, 8
   669  	jle	.LBB0_554
   670  # %bb.559:
   671  	cmp	edi, 9
   672  	je	.LBB0_668
   673  # %bb.560:
   674  	cmp	edi, 11
   675  	je	.LBB0_684
   676  # %bb.561:
   677  	cmp	edi, 12
   678  	jne	.LBB0_1013
   679  # %bb.562:
   680  	test	r9d, r9d
   681  	jle	.LBB0_1013
   682  # %bb.563:
   683  	mov	r10d, r9d
   684  	cmp	r9d, 4
   685  	jae	.LBB0_700
   686  # %bb.564:
   687  	xor	esi, esi
   688  .LBB0_709:
   689  	mov	rax, rsi
   690  	not	rax
   691  	add	rax, r10
   692  	mov	rdi, r10
   693  	and	rdi, 3
   694  	je	.LBB0_711
   695  .LBB0_710:                              # =>This Inner Loop Header: Depth=1
   696  	movsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
   697  	subsd	xmm0, qword ptr [rcx + 8*rsi]
   698  	movsd	qword ptr [r8 + 8*rsi], xmm0
   699  	add	rsi, 1
   700  	add	rdi, -1
   701  	jne	.LBB0_710
   702  .LBB0_711:
   703  	cmp	rax, 3
   704  	jb	.LBB0_1013
   705  .LBB0_712:                              # =>This Inner Loop Header: Depth=1
   706  	movsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
   707  	subsd	xmm0, qword ptr [rcx + 8*rsi]
   708  	movsd	qword ptr [r8 + 8*rsi], xmm0
   709  	movsd	xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
   710  	subsd	xmm0, qword ptr [rcx + 8*rsi + 8]
   711  	movsd	qword ptr [r8 + 8*rsi + 8], xmm0
   712  	movsd	xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
   713  	subsd	xmm0, qword ptr [rcx + 8*rsi + 16]
   714  	movsd	qword ptr [r8 + 8*rsi + 16], xmm0
   715  	movsd	xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
   716  	subsd	xmm0, qword ptr [rcx + 8*rsi + 24]
   717  	movsd	qword ptr [r8 + 8*rsi + 24], xmm0
   718  	add	rsi, 4
   719  	cmp	r10, rsi
   720  	jne	.LBB0_712
   721  	jmp	.LBB0_1013
   722  .LBB0_6:
   723  	cmp	edi, 2
   724  	je	.LBB0_731
   725  # %bb.7:
   726  	cmp	edi, 3
   727  	jne	.LBB0_1013
   728  # %bb.8:
   729  	test	r9d, r9d
   730  	jle	.LBB0_1013
   731  # %bb.9:
   732  	mov	r10d, r9d
   733  	cmp	r9d, 32
   734  	jae	.LBB0_747
   735  # %bb.10:
   736  	xor	edi, edi
   737  .LBB0_756:
   738  	mov	r9, rdi
   739  	not	r9
   740  	add	r9, r10
   741  	mov	rsi, r10
   742  	and	rsi, 3
   743  	je	.LBB0_758
   744  .LBB0_757:                              # =>This Inner Loop Header: Depth=1
   745  	movzx	eax, byte ptr [rcx + rdi]
   746  	mul	byte ptr [rdx + rdi]
   747  	mov	byte ptr [r8 + rdi], al
   748  	add	rdi, 1
   749  	add	rsi, -1
   750  	jne	.LBB0_757
   751  .LBB0_758:
   752  	cmp	r9, 3
   753  	jb	.LBB0_1013
   754  .LBB0_759:                              # =>This Inner Loop Header: Depth=1
   755  	movzx	eax, byte ptr [rcx + rdi]
   756  	mul	byte ptr [rdx + rdi]
   757  	mov	byte ptr [r8 + rdi], al
   758  	movzx	eax, byte ptr [rcx + rdi + 1]
   759  	mul	byte ptr [rdx + rdi + 1]
   760  	mov	byte ptr [r8 + rdi + 1], al
   761  	movzx	eax, byte ptr [rcx + rdi + 2]
   762  	mul	byte ptr [rdx + rdi + 2]
   763  	mov	byte ptr [r8 + rdi + 2], al
   764  	movzx	eax, byte ptr [rcx + rdi + 3]
   765  	mul	byte ptr [rdx + rdi + 3]
   766  	mov	byte ptr [r8 + rdi + 3], al
   767  	add	rdi, 4
   768  	cmp	r10, rdi
   769  	jne	.LBB0_759
   770  	jmp	.LBB0_1013
   771  .LBB0_16:
   772  	cmp	edi, 2
   773  	je	.LBB0_881
   774  # %bb.17:
   775  	cmp	edi, 3
   776  	jne	.LBB0_1013
   777  # %bb.18:
   778  	test	r9d, r9d
   779  	jle	.LBB0_1013
   780  # %bb.19:
   781  	mov	r10d, r9d
   782  	cmp	r9d, 32
   783  	jae	.LBB0_897
   784  # %bb.20:
   785  	xor	edi, edi
   786  .LBB0_906:
   787  	mov	r9, rdi
   788  	not	r9
   789  	add	r9, r10
   790  	mov	rsi, r10
   791  	and	rsi, 3
   792  	je	.LBB0_908
   793  .LBB0_907:                              # =>This Inner Loop Header: Depth=1
   794  	movzx	eax, byte ptr [rcx + rdi]
   795  	mul	byte ptr [rdx + rdi]
   796  	mov	byte ptr [r8 + rdi], al
   797  	add	rdi, 1
   798  	add	rsi, -1
   799  	jne	.LBB0_907
   800  .LBB0_908:
   801  	cmp	r9, 3
   802  	jb	.LBB0_1013
   803  .LBB0_909:                              # =>This Inner Loop Header: Depth=1
   804  	movzx	eax, byte ptr [rcx + rdi]
   805  	mul	byte ptr [rdx + rdi]
   806  	mov	byte ptr [r8 + rdi], al
   807  	movzx	eax, byte ptr [rcx + rdi + 1]
   808  	mul	byte ptr [rdx + rdi + 1]
   809  	mov	byte ptr [r8 + rdi + 1], al
   810  	movzx	eax, byte ptr [rcx + rdi + 2]
   811  	mul	byte ptr [rdx + rdi + 2]
   812  	mov	byte ptr [r8 + rdi + 2], al
   813  	movzx	eax, byte ptr [rcx + rdi + 3]
   814  	mul	byte ptr [rdx + rdi + 3]
   815  	mov	byte ptr [r8 + rdi + 3], al
   816  	add	rdi, 4
   817  	cmp	r10, rdi
   818  	jne	.LBB0_909
   819  	jmp	.LBB0_1013
   820  .LBB0_23:
   821  	cmp	edi, 2
   822  	je	.LBB0_46
   823  # %bb.24:
   824  	cmp	edi, 3
   825  	jne	.LBB0_1013
   826  # %bb.25:
   827  	test	r9d, r9d
   828  	jle	.LBB0_1013
   829  # %bb.26:
   830  	mov	r10d, r9d
   831  	cmp	r9d, 32
   832  	jae	.LBB0_62
   833  # %bb.27:
   834  	xor	esi, esi
   835  .LBB0_71:
   836  	mov	r9, rsi
   837  	not	r9
   838  	add	r9, r10
   839  	mov	rdi, r10
   840  	and	rdi, 3
   841  	je	.LBB0_73
   842  .LBB0_72:                               # =>This Inner Loop Header: Depth=1
   843  	movzx	eax, byte ptr [rcx + rsi]
   844  	add	al, byte ptr [rdx + rsi]
   845  	mov	byte ptr [r8 + rsi], al
   846  	add	rsi, 1
   847  	add	rdi, -1
   848  	jne	.LBB0_72
   849  .LBB0_73:
   850  	cmp	r9, 3
   851  	jb	.LBB0_1013
   852  .LBB0_74:                               # =>This Inner Loop Header: Depth=1
   853  	movzx	eax, byte ptr [rcx + rsi]
   854  	add	al, byte ptr [rdx + rsi]
   855  	mov	byte ptr [r8 + rsi], al
   856  	movzx	eax, byte ptr [rcx + rsi + 1]
   857  	add	al, byte ptr [rdx + rsi + 1]
   858  	mov	byte ptr [r8 + rsi + 1], al
   859  	movzx	eax, byte ptr [rcx + rsi + 2]
   860  	add	al, byte ptr [rdx + rsi + 2]
   861  	mov	byte ptr [r8 + rsi + 2], al
   862  	movzx	eax, byte ptr [rcx + rsi + 3]
   863  	add	al, byte ptr [rdx + rsi + 3]
   864  	mov	byte ptr [r8 + rsi + 3], al
   865  	add	rsi, 4
   866  	cmp	r10, rsi
   867  	jne	.LBB0_74
   868  	jmp	.LBB0_1013
   869  .LBB0_369:
   870  	cmp	edi, 2
   871  	je	.LBB0_392
   872  # %bb.370:
   873  	cmp	edi, 3
   874  	jne	.LBB0_1013
   875  # %bb.371:
   876  	test	r9d, r9d
   877  	jle	.LBB0_1013
   878  # %bb.372:
   879  	mov	r10d, r9d
   880  	cmp	r9d, 32
   881  	jae	.LBB0_408
   882  # %bb.373:
   883  	xor	esi, esi
   884  .LBB0_417:
   885  	mov	r9, rsi
   886  	not	r9
   887  	add	r9, r10
   888  	mov	rdi, r10
   889  	and	rdi, 3
   890  	je	.LBB0_419
   891  .LBB0_418:                              # =>This Inner Loop Header: Depth=1
   892  	movzx	eax, byte ptr [rdx + rsi]
   893  	sub	al, byte ptr [rcx + rsi]
   894  	mov	byte ptr [r8 + rsi], al
   895  	add	rsi, 1
   896  	add	rdi, -1
   897  	jne	.LBB0_418
   898  .LBB0_419:
   899  	cmp	r9, 3
   900  	jb	.LBB0_1013
   901  .LBB0_420:                              # =>This Inner Loop Header: Depth=1
   902  	movzx	eax, byte ptr [rdx + rsi]
   903  	sub	al, byte ptr [rcx + rsi]
   904  	mov	byte ptr [r8 + rsi], al
   905  	movzx	eax, byte ptr [rdx + rsi + 1]
   906  	sub	al, byte ptr [rcx + rsi + 1]
   907  	mov	byte ptr [r8 + rsi + 1], al
   908  	movzx	eax, byte ptr [rdx + rsi + 2]
   909  	sub	al, byte ptr [rcx + rsi + 2]
   910  	mov	byte ptr [r8 + rsi + 2], al
   911  	movzx	eax, byte ptr [rdx + rsi + 3]
   912  	sub	al, byte ptr [rcx + rsi + 3]
   913  	mov	byte ptr [r8 + rsi + 3], al
   914  	add	rsi, 4
   915  	cmp	r10, rsi
   916  	jne	.LBB0_420
   917  	jmp	.LBB0_1013
   918  .LBB0_196:
   919  	cmp	edi, 2
   920  	je	.LBB0_219
   921  # %bb.197:
   922  	cmp	edi, 3
   923  	jne	.LBB0_1013
   924  # %bb.198:
   925  	test	r9d, r9d
   926  	jle	.LBB0_1013
   927  # %bb.199:
   928  	mov	r10d, r9d
   929  	cmp	r9d, 32
   930  	jae	.LBB0_235
   931  # %bb.200:
   932  	xor	esi, esi
   933  .LBB0_244:
   934  	mov	r9, rsi
   935  	not	r9
   936  	add	r9, r10
   937  	mov	rdi, r10
   938  	and	rdi, 3
   939  	je	.LBB0_246
   940  .LBB0_245:                              # =>This Inner Loop Header: Depth=1
   941  	movzx	eax, byte ptr [rcx + rsi]
   942  	add	al, byte ptr [rdx + rsi]
   943  	mov	byte ptr [r8 + rsi], al
   944  	add	rsi, 1
   945  	add	rdi, -1
   946  	jne	.LBB0_245
   947  .LBB0_246:
   948  	cmp	r9, 3
   949  	jb	.LBB0_1013
   950  .LBB0_247:                              # =>This Inner Loop Header: Depth=1
   951  	movzx	eax, byte ptr [rcx + rsi]
   952  	add	al, byte ptr [rdx + rsi]
   953  	mov	byte ptr [r8 + rsi], al
   954  	movzx	eax, byte ptr [rcx + rsi + 1]
   955  	add	al, byte ptr [rdx + rsi + 1]
   956  	mov	byte ptr [r8 + rsi + 1], al
   957  	movzx	eax, byte ptr [rcx + rsi + 2]
   958  	add	al, byte ptr [rdx + rsi + 2]
   959  	mov	byte ptr [r8 + rsi + 2], al
   960  	movzx	eax, byte ptr [rcx + rsi + 3]
   961  	add	al, byte ptr [rdx + rsi + 3]
   962  	mov	byte ptr [r8 + rsi + 3], al
   963  	add	rsi, 4
   964  	cmp	r10, rsi
   965  	jne	.LBB0_247
   966  	jmp	.LBB0_1013
   967  .LBB0_542:
   968  	cmp	edi, 2
   969  	je	.LBB0_565
   970  # %bb.543:
   971  	cmp	edi, 3
   972  	jne	.LBB0_1013
   973  # %bb.544:
   974  	test	r9d, r9d
   975  	jle	.LBB0_1013
   976  # %bb.545:
   977  	mov	r10d, r9d
   978  	cmp	r9d, 32
   979  	jae	.LBB0_581
   980  # %bb.546:
   981  	xor	esi, esi
   982  .LBB0_590:
   983  	mov	r9, rsi
   984  	not	r9
   985  	add	r9, r10
   986  	mov	rdi, r10
   987  	and	rdi, 3
   988  	je	.LBB0_592
   989  .LBB0_591:                              # =>This Inner Loop Header: Depth=1
   990  	movzx	eax, byte ptr [rdx + rsi]
   991  	sub	al, byte ptr [rcx + rsi]
   992  	mov	byte ptr [r8 + rsi], al
   993  	add	rsi, 1
   994  	add	rdi, -1
   995  	jne	.LBB0_591
   996  .LBB0_592:
   997  	cmp	r9, 3
   998  	jb	.LBB0_1013
   999  .LBB0_593:                              # =>This Inner Loop Header: Depth=1
  1000  	movzx	eax, byte ptr [rdx + rsi]
  1001  	sub	al, byte ptr [rcx + rsi]
  1002  	mov	byte ptr [r8 + rsi], al
  1003  	movzx	eax, byte ptr [rdx + rsi + 1]
  1004  	sub	al, byte ptr [rcx + rsi + 1]
  1005  	mov	byte ptr [r8 + rsi + 1], al
  1006  	movzx	eax, byte ptr [rdx + rsi + 2]
  1007  	sub	al, byte ptr [rcx + rsi + 2]
  1008  	mov	byte ptr [r8 + rsi + 2], al
  1009  	movzx	eax, byte ptr [rdx + rsi + 3]
  1010  	sub	al, byte ptr [rcx + rsi + 3]
  1011  	mov	byte ptr [r8 + rsi + 3], al
  1012  	add	rsi, 4
  1013  	cmp	r10, rsi
  1014  	jne	.LBB0_593
  1015  	jmp	.LBB0_1013
  1016  .LBB0_720:
  1017  	cmp	edi, 7
  1018  	je	.LBB0_805
  1019  # %bb.721:
  1020  	cmp	edi, 8
  1021  	jne	.LBB0_1013
  1022  # %bb.722:
  1023  	test	r9d, r9d
  1024  	jle	.LBB0_1013
  1025  # %bb.723:
  1026  	mov	esi, r9d
  1027  	lea	rdi, [rsi - 1]
  1028  	mov	r9d, esi
  1029  	and	r9d, 3
  1030  	cmp	rdi, 3
  1031  	jae	.LBB0_821
  1032  # %bb.724:
  1033  	xor	edi, edi
  1034  	jmp	.LBB0_823
  1035  .LBB0_870:
  1036  	cmp	edi, 7
  1037  	je	.LBB0_955
  1038  # %bb.871:
  1039  	cmp	edi, 8
  1040  	jne	.LBB0_1013
  1041  # %bb.872:
  1042  	test	r9d, r9d
  1043  	jle	.LBB0_1013
  1044  # %bb.873:
  1045  	mov	esi, r9d
  1046  	lea	rdi, [rsi - 1]
  1047  	mov	r9d, esi
  1048  	and	r9d, 3
  1049  	cmp	rdi, 3
  1050  	jae	.LBB0_971
  1051  # %bb.874:
  1052  	xor	edi, edi
  1053  	jmp	.LBB0_973
  1054  .LBB0_35:  # dispatch on edi: 7 -> .LBB0_120, 8 -> element-wise 64-bit add below, anything else -> .LBB0_1013
  1055  	cmp	edi, 7
  1056  	je	.LBB0_120
  1057  # %bb.36:
  1058  	cmp	edi, 8
  1059  	jne	.LBB0_1013
  1060  # %bb.37:
  1061  	test	r9d, r9d
  1062  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1063  # %bb.38:
  1064  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1065  	cmp	r9d, 4
  1066  	jae	.LBB0_136  # count >= 4: continue at .LBB0_136 (outside this view; presumably a vectorized path)
  1067  # %bb.39:
  1068  	xor	esi, esi  # rsi = element index, starting at 0
  1069  .LBB0_145:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 4 path with rsi != 0)
  1070  	mov	r9, rsi
  1071  	not	r9
  1072  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1073  	mov	rdi, r10
  1074  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1075  	je	.LBB0_147
  1076  .LBB0_146:                              # =>This Inner Loop Header: Depth=1
  1077  	mov	rax, qword ptr [rcx + 8*rsi]
  1078  	add	rax, qword ptr [rdx + 8*rsi]
  1079  	mov	qword ptr [r8 + 8*rsi], rax  # r8[rsi] = rcx[rsi] + rdx[rsi], one 64-bit element
  1080  	add	rsi, 1
  1081  	add	rdi, -1
  1082  	jne	.LBB0_146
  1083  .LBB0_147:
  1084  	cmp	r9, 3
  1085  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_145; the loop above is taken to have finished them
  1086  .LBB0_148:                              # =>This Inner Loop Header: Depth=1
  1087  	mov	rax, qword ptr [rcx + 8*rsi]
  1088  	add	rax, qword ptr [rdx + 8*rsi]
  1089  	mov	qword ptr [r8 + 8*rsi], rax  # same add, unrolled for elements rsi .. rsi+3
  1090  	mov	rax, qword ptr [rcx + 8*rsi + 8]
  1091  	add	rax, qword ptr [rdx + 8*rsi + 8]
  1092  	mov	qword ptr [r8 + 8*rsi + 8], rax
  1093  	mov	rax, qword ptr [rcx + 8*rsi + 16]
  1094  	add	rax, qword ptr [rdx + 8*rsi + 16]
  1095  	mov	qword ptr [r8 + 8*rsi + 16], rax
  1096  	mov	rax, qword ptr [rcx + 8*rsi + 24]
  1097  	add	rax, qword ptr [rdx + 8*rsi + 24]
  1098  	mov	qword ptr [r8 + 8*rsi + 24], rax
  1099  	add	rsi, 4  # four elements per iteration
  1100  	cmp	r10, rsi
  1101  	jne	.LBB0_148  # loop until the index equals the count
  1102  	jmp	.LBB0_1013
  1103  .LBB0_381:  # dispatch on edi: 7 -> .LBB0_466, 8 -> element-wise 64-bit subtract below, anything else -> .LBB0_1013
  1104  	cmp	edi, 7
  1105  	je	.LBB0_466
  1106  # %bb.382:
  1107  	cmp	edi, 8
  1108  	jne	.LBB0_1013
  1109  # %bb.383:
  1110  	test	r9d, r9d
  1111  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1112  # %bb.384:
  1113  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1114  	cmp	r9d, 4
  1115  	jae	.LBB0_482  # count >= 4: continue at .LBB0_482 (outside this view; presumably a vectorized path)
  1116  # %bb.385:
  1117  	xor	esi, esi  # rsi = element index, starting at 0
  1118  .LBB0_491:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 4 path with rsi != 0)
  1119  	mov	r9, rsi
  1120  	not	r9
  1121  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1122  	mov	rdi, r10
  1123  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1124  	je	.LBB0_493
  1125  .LBB0_492:                              # =>This Inner Loop Header: Depth=1
  1126  	mov	rax, qword ptr [rdx + 8*rsi]
  1127  	sub	rax, qword ptr [rcx + 8*rsi]
  1128  	mov	qword ptr [r8 + 8*rsi], rax  # r8[rsi] = rdx[rsi] - rcx[rsi], one 64-bit element
  1129  	add	rsi, 1
  1130  	add	rdi, -1
  1131  	jne	.LBB0_492
  1132  .LBB0_493:
  1133  	cmp	r9, 3
  1134  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_491; the loop above is taken to have finished them
  1135  .LBB0_494:                              # =>This Inner Loop Header: Depth=1
  1136  	mov	rax, qword ptr [rdx + 8*rsi]
  1137  	sub	rax, qword ptr [rcx + 8*rsi]
  1138  	mov	qword ptr [r8 + 8*rsi], rax  # same subtract, unrolled for elements rsi .. rsi+3
  1139  	mov	rax, qword ptr [rdx + 8*rsi + 8]
  1140  	sub	rax, qword ptr [rcx + 8*rsi + 8]
  1141  	mov	qword ptr [r8 + 8*rsi + 8], rax
  1142  	mov	rax, qword ptr [rdx + 8*rsi + 16]
  1143  	sub	rax, qword ptr [rcx + 8*rsi + 16]
  1144  	mov	qword ptr [r8 + 8*rsi + 16], rax
  1145  	mov	rax, qword ptr [rdx + 8*rsi + 24]
  1146  	sub	rax, qword ptr [rcx + 8*rsi + 24]
  1147  	mov	qword ptr [r8 + 8*rsi + 24], rax
  1148  	add	rsi, 4  # four elements per iteration
  1149  	cmp	r10, rsi
  1150  	jne	.LBB0_494  # loop until the index equals the count
  1151  	jmp	.LBB0_1013
  1152  .LBB0_208:  # dispatch on edi: 7 -> .LBB0_293, 8 -> element-wise 64-bit add below, anything else -> .LBB0_1013
  1153  	cmp	edi, 7
  1154  	je	.LBB0_293
  1155  # %bb.209:
  1156  	cmp	edi, 8
  1157  	jne	.LBB0_1013
  1158  # %bb.210:
  1159  	test	r9d, r9d
  1160  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1161  # %bb.211:
  1162  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1163  	cmp	r9d, 4
  1164  	jae	.LBB0_309  # count >= 4: continue at .LBB0_309 (outside this view; presumably a vectorized path)
  1165  # %bb.212:
  1166  	xor	esi, esi  # rsi = element index, starting at 0
  1167  .LBB0_318:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 4 path with rsi != 0)
  1168  	mov	r9, rsi
  1169  	not	r9
  1170  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1171  	mov	rdi, r10
  1172  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1173  	je	.LBB0_320
  1174  .LBB0_319:                              # =>This Inner Loop Header: Depth=1
  1175  	mov	rax, qword ptr [rcx + 8*rsi]
  1176  	add	rax, qword ptr [rdx + 8*rsi]
  1177  	mov	qword ptr [r8 + 8*rsi], rax  # r8[rsi] = rcx[rsi] + rdx[rsi], one 64-bit element
  1178  	add	rsi, 1
  1179  	add	rdi, -1
  1180  	jne	.LBB0_319
  1181  .LBB0_320:
  1182  	cmp	r9, 3
  1183  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_318; the loop above is taken to have finished them
  1184  .LBB0_321:                              # =>This Inner Loop Header: Depth=1
  1185  	mov	rax, qword ptr [rcx + 8*rsi]
  1186  	add	rax, qword ptr [rdx + 8*rsi]
  1187  	mov	qword ptr [r8 + 8*rsi], rax  # same add, unrolled for elements rsi .. rsi+3
  1188  	mov	rax, qword ptr [rcx + 8*rsi + 8]
  1189  	add	rax, qword ptr [rdx + 8*rsi + 8]
  1190  	mov	qword ptr [r8 + 8*rsi + 8], rax
  1191  	mov	rax, qword ptr [rcx + 8*rsi + 16]
  1192  	add	rax, qword ptr [rdx + 8*rsi + 16]
  1193  	mov	qword ptr [r8 + 8*rsi + 16], rax
  1194  	mov	rax, qword ptr [rcx + 8*rsi + 24]
  1195  	add	rax, qword ptr [rdx + 8*rsi + 24]
  1196  	mov	qword ptr [r8 + 8*rsi + 24], rax
  1197  	add	rsi, 4  # four elements per iteration
  1198  	cmp	r10, rsi
  1199  	jne	.LBB0_321  # loop until the index equals the count
  1200  	jmp	.LBB0_1013
  1201  .LBB0_554:  # dispatch on edi: 7 -> .LBB0_639, 8 -> element-wise 64-bit subtract below, anything else -> .LBB0_1013
  1202  	cmp	edi, 7
  1203  	je	.LBB0_639
  1204  # %bb.555:
  1205  	cmp	edi, 8
  1206  	jne	.LBB0_1013
  1207  # %bb.556:
  1208  	test	r9d, r9d
  1209  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1210  # %bb.557:
  1211  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1212  	cmp	r9d, 4
  1213  	jae	.LBB0_655  # count >= 4: continue at .LBB0_655 (outside this view; presumably a vectorized path)
  1214  # %bb.558:
  1215  	xor	esi, esi  # rsi = element index, starting at 0
  1216  .LBB0_664:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 4 path with rsi != 0)
  1217  	mov	r9, rsi
  1218  	not	r9
  1219  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1220  	mov	rdi, r10
  1221  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1222  	je	.LBB0_666
  1223  .LBB0_665:                              # =>This Inner Loop Header: Depth=1
  1224  	mov	rax, qword ptr [rdx + 8*rsi]
  1225  	sub	rax, qword ptr [rcx + 8*rsi]
  1226  	mov	qword ptr [r8 + 8*rsi], rax  # r8[rsi] = rdx[rsi] - rcx[rsi], one 64-bit element
  1227  	add	rsi, 1
  1228  	add	rdi, -1
  1229  	jne	.LBB0_665
  1230  .LBB0_666:
  1231  	cmp	r9, 3
  1232  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_664; the loop above is taken to have finished them
  1233  .LBB0_667:                              # =>This Inner Loop Header: Depth=1
  1234  	mov	rax, qword ptr [rdx + 8*rsi]
  1235  	sub	rax, qword ptr [rcx + 8*rsi]
  1236  	mov	qword ptr [r8 + 8*rsi], rax  # same subtract, unrolled for elements rsi .. rsi+3
  1237  	mov	rax, qword ptr [rdx + 8*rsi + 8]
  1238  	sub	rax, qword ptr [rcx + 8*rsi + 8]
  1239  	mov	qword ptr [r8 + 8*rsi + 8], rax
  1240  	mov	rax, qword ptr [rdx + 8*rsi + 16]
  1241  	sub	rax, qword ptr [rcx + 8*rsi + 16]
  1242  	mov	qword ptr [r8 + 8*rsi + 16], rax
  1243  	mov	rax, qword ptr [rdx + 8*rsi + 24]
  1244  	sub	rax, qword ptr [rcx + 8*rsi + 24]
  1245  	mov	qword ptr [r8 + 8*rsi + 24], rax
  1246  	add	rsi, 4  # four elements per iteration
  1247  	cmp	r10, rsi
  1248  	jne	.LBB0_667  # loop until the index equals the count
  1249  	jmp	.LBB0_1013
  1250  .LBB0_760:  # element-wise 16-bit multiply: r8[i] = low 16 bits of rcx[i] * rdx[i]
  1251  	test	r9d, r9d
  1252  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1253  # %bb.761:
  1254  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1255  	cmp	r9d, 16
  1256  	jae	.LBB0_763  # count >= 16: continue at .LBB0_763 (outside this view; presumably a vectorized path)
  1257  # %bb.762:
  1258  	xor	esi, esi  # rsi = element index, starting at 0
  1259  .LBB0_772:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 16 path with rsi != 0)
  1260  	mov	r9, rsi
  1261  	not	r9
  1262  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1263  	mov	rdi, r10
  1264  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1265  	je	.LBB0_774
  1266  .LBB0_773:                              # =>This Inner Loop Header: Depth=1
  1267  	movzx	eax, word ptr [rcx + 2*rsi]
  1268  	imul	ax, word ptr [rdx + 2*rsi]
  1269  	mov	word ptr [r8 + 2*rsi], ax  # r8[rsi] = rcx[rsi] * rdx[rsi], truncated to 16 bits
  1270  	add	rsi, 1
  1271  	add	rdi, -1
  1272  	jne	.LBB0_773
  1273  .LBB0_774:
  1274  	cmp	r9, 3
  1275  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_772; the loop above is taken to have finished them
  1276  .LBB0_775:                              # =>This Inner Loop Header: Depth=1
  1277  	movzx	eax, word ptr [rcx + 2*rsi]
  1278  	imul	ax, word ptr [rdx + 2*rsi]
  1279  	mov	word ptr [r8 + 2*rsi], ax  # same multiply, unrolled for elements rsi .. rsi+3
  1280  	movzx	eax, word ptr [rcx + 2*rsi + 2]
  1281  	imul	ax, word ptr [rdx + 2*rsi + 2]
  1282  	mov	word ptr [r8 + 2*rsi + 2], ax
  1283  	movzx	eax, word ptr [rcx + 2*rsi + 4]
  1284  	imul	ax, word ptr [rdx + 2*rsi + 4]
  1285  	mov	word ptr [r8 + 2*rsi + 4], ax
  1286  	movzx	eax, word ptr [rcx + 2*rsi + 6]
  1287  	imul	ax, word ptr [rdx + 2*rsi + 6]
  1288  	mov	word ptr [r8 + 2*rsi + 6], ax
  1289  	add	rsi, 4  # four elements per iteration
  1290  	cmp	r10, rsi
  1291  	jne	.LBB0_775  # loop until the index equals the count
  1292  	jmp	.LBB0_1013
  1293  .LBB0_776:  # element-wise 16-bit multiply: r8[i] = low 16 bits of rcx[i] * rdx[i]
  1294  	test	r9d, r9d
  1295  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1296  # %bb.777:
  1297  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1298  	cmp	r9d, 16
  1299  	jae	.LBB0_779  # count >= 16: continue at .LBB0_779 (outside this view; presumably a vectorized path)
  1300  # %bb.778:
  1301  	xor	esi, esi  # rsi = element index, starting at 0
  1302  .LBB0_788:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 16 path with rsi != 0)
  1303  	mov	r9, rsi
  1304  	not	r9
  1305  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1306  	mov	rdi, r10
  1307  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1308  	je	.LBB0_790
  1309  .LBB0_789:                              # =>This Inner Loop Header: Depth=1
  1310  	movzx	eax, word ptr [rcx + 2*rsi]
  1311  	imul	ax, word ptr [rdx + 2*rsi]
  1312  	mov	word ptr [r8 + 2*rsi], ax  # r8[rsi] = rcx[rsi] * rdx[rsi], truncated to 16 bits
  1313  	add	rsi, 1
  1314  	add	rdi, -1
  1315  	jne	.LBB0_789
  1316  .LBB0_790:
  1317  	cmp	r9, 3
  1318  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_788; the loop above is taken to have finished them
  1319  .LBB0_791:                              # =>This Inner Loop Header: Depth=1
  1320  	movzx	eax, word ptr [rcx + 2*rsi]
  1321  	imul	ax, word ptr [rdx + 2*rsi]
  1322  	mov	word ptr [r8 + 2*rsi], ax  # same multiply, unrolled for elements rsi .. rsi+3
  1323  	movzx	eax, word ptr [rcx + 2*rsi + 2]
  1324  	imul	ax, word ptr [rdx + 2*rsi + 2]
  1325  	mov	word ptr [r8 + 2*rsi + 2], ax
  1326  	movzx	eax, word ptr [rcx + 2*rsi + 4]
  1327  	imul	ax, word ptr [rdx + 2*rsi + 4]
  1328  	mov	word ptr [r8 + 2*rsi + 4], ax
  1329  	movzx	eax, word ptr [rcx + 2*rsi + 6]
  1330  	imul	ax, word ptr [rdx + 2*rsi + 6]
  1331  	mov	word ptr [r8 + 2*rsi + 6], ax
  1332  	add	rsi, 4  # four elements per iteration
  1333  	cmp	r10, rsi
  1334  	jne	.LBB0_791  # loop until the index equals the count
  1335  	jmp	.LBB0_1013
  1336  .LBB0_910:  # element-wise 16-bit multiply: r8[i] = low 16 bits of rcx[i] * rdx[i]
  1337  	test	r9d, r9d
  1338  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1339  # %bb.911:
  1340  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1341  	cmp	r9d, 16
  1342  	jae	.LBB0_913  # count >= 16: continue at .LBB0_913 (outside this view; presumably a vectorized path)
  1343  # %bb.912:
  1344  	xor	esi, esi  # rsi = element index, starting at 0
  1345  .LBB0_922:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 16 path with rsi != 0)
  1346  	mov	r9, rsi
  1347  	not	r9
  1348  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1349  	mov	rdi, r10
  1350  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1351  	je	.LBB0_924
  1352  .LBB0_923:                              # =>This Inner Loop Header: Depth=1
  1353  	movzx	eax, word ptr [rcx + 2*rsi]
  1354  	imul	ax, word ptr [rdx + 2*rsi]
  1355  	mov	word ptr [r8 + 2*rsi], ax  # r8[rsi] = rcx[rsi] * rdx[rsi], truncated to 16 bits
  1356  	add	rsi, 1
  1357  	add	rdi, -1
  1358  	jne	.LBB0_923
  1359  .LBB0_924:
  1360  	cmp	r9, 3
  1361  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_922; the loop above is taken to have finished them
  1362  .LBB0_925:                              # =>This Inner Loop Header: Depth=1
  1363  	movzx	eax, word ptr [rcx + 2*rsi]
  1364  	imul	ax, word ptr [rdx + 2*rsi]
  1365  	mov	word ptr [r8 + 2*rsi], ax  # same multiply, unrolled for elements rsi .. rsi+3
  1366  	movzx	eax, word ptr [rcx + 2*rsi + 2]
  1367  	imul	ax, word ptr [rdx + 2*rsi + 2]
  1368  	mov	word ptr [r8 + 2*rsi + 2], ax
  1369  	movzx	eax, word ptr [rcx + 2*rsi + 4]
  1370  	imul	ax, word ptr [rdx + 2*rsi + 4]
  1371  	mov	word ptr [r8 + 2*rsi + 4], ax
  1372  	movzx	eax, word ptr [rcx + 2*rsi + 6]
  1373  	imul	ax, word ptr [rdx + 2*rsi + 6]
  1374  	mov	word ptr [r8 + 2*rsi + 6], ax
  1375  	add	rsi, 4  # four elements per iteration
  1376  	cmp	r10, rsi
  1377  	jne	.LBB0_925  # loop until the index equals the count
  1378  	jmp	.LBB0_1013
  1379  .LBB0_926:  # element-wise 16-bit multiply: r8[i] = low 16 bits of rcx[i] * rdx[i]
  1380  	test	r9d, r9d
  1381  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1382  # %bb.927:
  1383  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1384  	cmp	r9d, 16
  1385  	jae	.LBB0_929  # count >= 16: continue at .LBB0_929 (outside this view; presumably a vectorized path)
  1386  # %bb.928:
  1387  	xor	esi, esi  # rsi = element index, starting at 0
  1388  .LBB0_938:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 16 path with rsi != 0)
  1389  	mov	r9, rsi
  1390  	not	r9
  1391  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1392  	mov	rdi, r10
  1393  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1394  	je	.LBB0_940
  1395  .LBB0_939:                              # =>This Inner Loop Header: Depth=1
  1396  	movzx	eax, word ptr [rcx + 2*rsi]
  1397  	imul	ax, word ptr [rdx + 2*rsi]
  1398  	mov	word ptr [r8 + 2*rsi], ax  # r8[rsi] = rcx[rsi] * rdx[rsi], truncated to 16 bits
  1399  	add	rsi, 1
  1400  	add	rdi, -1
  1401  	jne	.LBB0_939
  1402  .LBB0_940:
  1403  	cmp	r9, 3
  1404  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_938; the loop above is taken to have finished them
  1405  .LBB0_941:                              # =>This Inner Loop Header: Depth=1
  1406  	movzx	eax, word ptr [rcx + 2*rsi]
  1407  	imul	ax, word ptr [rdx + 2*rsi]
  1408  	mov	word ptr [r8 + 2*rsi], ax  # same multiply, unrolled for elements rsi .. rsi+3
  1409  	movzx	eax, word ptr [rcx + 2*rsi + 2]
  1410  	imul	ax, word ptr [rdx + 2*rsi + 2]
  1411  	mov	word ptr [r8 + 2*rsi + 2], ax
  1412  	movzx	eax, word ptr [rcx + 2*rsi + 4]
  1413  	imul	ax, word ptr [rdx + 2*rsi + 4]
  1414  	mov	word ptr [r8 + 2*rsi + 4], ax
  1415  	movzx	eax, word ptr [rcx + 2*rsi + 6]
  1416  	imul	ax, word ptr [rdx + 2*rsi + 6]
  1417  	mov	word ptr [r8 + 2*rsi + 6], ax
  1418  	add	rsi, 4  # four elements per iteration
  1419  	cmp	r10, rsi
  1420  	jne	.LBB0_941  # loop until the index equals the count
  1421  	jmp	.LBB0_1013
  1422  .LBB0_75:  # element-wise 16-bit add: r8[i] = rcx[i] + rdx[i], stored as 16 bits
  1423  	test	r9d, r9d
  1424  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1425  # %bb.76:
  1426  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1427  	cmp	r9d, 16
  1428  	jae	.LBB0_78  # count >= 16: continue at .LBB0_78 (outside this view; presumably a vectorized path)
  1429  # %bb.77:
  1430  	xor	esi, esi  # rsi = element index, starting at 0
  1431  .LBB0_87:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 16 path with rsi != 0)
  1432  	mov	r9, rsi
  1433  	not	r9
  1434  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1435  	mov	rdi, r10
  1436  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1437  	je	.LBB0_89
  1438  .LBB0_88:                               # =>This Inner Loop Header: Depth=1
  1439  	movzx	eax, word ptr [rcx + 2*rsi]
  1440  	add	ax, word ptr [rdx + 2*rsi]
  1441  	mov	word ptr [r8 + 2*rsi], ax  # r8[rsi] = rcx[rsi] + rdx[rsi], one 16-bit element
  1442  	add	rsi, 1
  1443  	add	rdi, -1
  1444  	jne	.LBB0_88
  1445  .LBB0_89:
  1446  	cmp	r9, 3
  1447  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_87; the loop above is taken to have finished them
  1448  .LBB0_90:                               # =>This Inner Loop Header: Depth=1
  1449  	movzx	eax, word ptr [rcx + 2*rsi]
  1450  	add	ax, word ptr [rdx + 2*rsi]
  1451  	mov	word ptr [r8 + 2*rsi], ax  # same add, unrolled for elements rsi .. rsi+3
  1452  	movzx	eax, word ptr [rcx + 2*rsi + 2]
  1453  	add	ax, word ptr [rdx + 2*rsi + 2]
  1454  	mov	word ptr [r8 + 2*rsi + 2], ax
  1455  	movzx	eax, word ptr [rcx + 2*rsi + 4]
  1456  	add	ax, word ptr [rdx + 2*rsi + 4]
  1457  	mov	word ptr [r8 + 2*rsi + 4], ax
  1458  	movzx	eax, word ptr [rcx + 2*rsi + 6]
  1459  	add	ax, word ptr [rdx + 2*rsi + 6]
  1460  	mov	word ptr [r8 + 2*rsi + 6], ax
  1461  	add	rsi, 4  # four elements per iteration
  1462  	cmp	r10, rsi
  1463  	jne	.LBB0_90  # loop until the index equals the count
  1464  	jmp	.LBB0_1013
  1465  .LBB0_91:  # element-wise 16-bit add: r8[i] = rcx[i] + rdx[i], stored as 16 bits
  1466  	test	r9d, r9d
  1467  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1468  # %bb.92:
  1469  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1470  	cmp	r9d, 16
  1471  	jae	.LBB0_94  # count >= 16: continue at .LBB0_94 (outside this view; presumably a vectorized path)
  1472  # %bb.93:
  1473  	xor	esi, esi  # rsi = element index, starting at 0
  1474  .LBB0_103:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 16 path with rsi != 0)
  1475  	mov	r9, rsi
  1476  	not	r9
  1477  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1478  	mov	rdi, r10
  1479  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1480  	je	.LBB0_105
  1481  .LBB0_104:                              # =>This Inner Loop Header: Depth=1
  1482  	movzx	eax, word ptr [rcx + 2*rsi]
  1483  	add	ax, word ptr [rdx + 2*rsi]
  1484  	mov	word ptr [r8 + 2*rsi], ax  # r8[rsi] = rcx[rsi] + rdx[rsi], one 16-bit element
  1485  	add	rsi, 1
  1486  	add	rdi, -1
  1487  	jne	.LBB0_104
  1488  .LBB0_105:
  1489  	cmp	r9, 3
  1490  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_103; the loop above is taken to have finished them
  1491  .LBB0_106:                              # =>This Inner Loop Header: Depth=1
  1492  	movzx	eax, word ptr [rcx + 2*rsi]
  1493  	add	ax, word ptr [rdx + 2*rsi]
  1494  	mov	word ptr [r8 + 2*rsi], ax  # same add, unrolled for elements rsi .. rsi+3
  1495  	movzx	eax, word ptr [rcx + 2*rsi + 2]
  1496  	add	ax, word ptr [rdx + 2*rsi + 2]
  1497  	mov	word ptr [r8 + 2*rsi + 2], ax
  1498  	movzx	eax, word ptr [rcx + 2*rsi + 4]
  1499  	add	ax, word ptr [rdx + 2*rsi + 4]
  1500  	mov	word ptr [r8 + 2*rsi + 4], ax
  1501  	movzx	eax, word ptr [rcx + 2*rsi + 6]
  1502  	add	ax, word ptr [rdx + 2*rsi + 6]
  1503  	mov	word ptr [r8 + 2*rsi + 6], ax
  1504  	add	rsi, 4  # four elements per iteration
  1505  	cmp	r10, rsi
  1506  	jne	.LBB0_106  # loop until the index equals the count
  1507  	jmp	.LBB0_1013
  1508  .LBB0_421:  # element-wise 16-bit subtract: r8[i] = rdx[i] - rcx[i], stored as 16 bits
  1509  	test	r9d, r9d
  1510  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1511  # %bb.422:
  1512  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1513  	cmp	r9d, 16
  1514  	jae	.LBB0_424  # count >= 16: continue at .LBB0_424 (outside this view; presumably a vectorized path)
  1515  # %bb.423:
  1516  	xor	esi, esi  # rsi = element index, starting at 0
  1517  .LBB0_433:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 16 path with rsi != 0)
  1518  	mov	r9, rsi
  1519  	not	r9
  1520  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1521  	mov	rdi, r10
  1522  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1523  	je	.LBB0_435
  1524  .LBB0_434:                              # =>This Inner Loop Header: Depth=1
  1525  	movzx	eax, word ptr [rdx + 2*rsi]
  1526  	sub	ax, word ptr [rcx + 2*rsi]
  1527  	mov	word ptr [r8 + 2*rsi], ax  # r8[rsi] = rdx[rsi] - rcx[rsi], one 16-bit element
  1528  	add	rsi, 1
  1529  	add	rdi, -1
  1530  	jne	.LBB0_434
  1531  .LBB0_435:
  1532  	cmp	r9, 3
  1533  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_433; the loop above is taken to have finished them
  1534  .LBB0_436:                              # =>This Inner Loop Header: Depth=1
  1535  	movzx	eax, word ptr [rdx + 2*rsi]
  1536  	sub	ax, word ptr [rcx + 2*rsi]
  1537  	mov	word ptr [r8 + 2*rsi], ax  # same subtract, unrolled for elements rsi .. rsi+3
  1538  	movzx	eax, word ptr [rdx + 2*rsi + 2]
  1539  	sub	ax, word ptr [rcx + 2*rsi + 2]
  1540  	mov	word ptr [r8 + 2*rsi + 2], ax
  1541  	movzx	eax, word ptr [rdx + 2*rsi + 4]
  1542  	sub	ax, word ptr [rcx + 2*rsi + 4]
  1543  	mov	word ptr [r8 + 2*rsi + 4], ax
  1544  	movzx	eax, word ptr [rdx + 2*rsi + 6]
  1545  	sub	ax, word ptr [rcx + 2*rsi + 6]
  1546  	mov	word ptr [r8 + 2*rsi + 6], ax
  1547  	add	rsi, 4  # four elements per iteration
  1548  	cmp	r10, rsi
  1549  	jne	.LBB0_436  # loop until the index equals the count
  1550  	jmp	.LBB0_1013
  1551  .LBB0_437:  # element-wise 16-bit subtract: r8[i] = rdx[i] - rcx[i], stored as 16 bits
  1552  	test	r9d, r9d
  1553  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1554  # %bb.438:
  1555  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1556  	cmp	r9d, 16
  1557  	jae	.LBB0_440  # count >= 16: continue at .LBB0_440 (outside this view; presumably a vectorized path)
  1558  # %bb.439:
  1559  	xor	esi, esi  # rsi = element index, starting at 0
  1560  .LBB0_449:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 16 path with rsi != 0)
  1561  	mov	r9, rsi
  1562  	not	r9
  1563  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1564  	mov	rdi, r10
  1565  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1566  	je	.LBB0_451
  1567  .LBB0_450:                              # =>This Inner Loop Header: Depth=1
  1568  	movzx	eax, word ptr [rdx + 2*rsi]
  1569  	sub	ax, word ptr [rcx + 2*rsi]
  1570  	mov	word ptr [r8 + 2*rsi], ax  # r8[rsi] = rdx[rsi] - rcx[rsi], one 16-bit element
  1571  	add	rsi, 1
  1572  	add	rdi, -1
  1573  	jne	.LBB0_450
  1574  .LBB0_451:
  1575  	cmp	r9, 3
  1576  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_449; the loop above is taken to have finished them
  1577  .LBB0_452:                              # =>This Inner Loop Header: Depth=1
  1578  	movzx	eax, word ptr [rdx + 2*rsi]
  1579  	sub	ax, word ptr [rcx + 2*rsi]
  1580  	mov	word ptr [r8 + 2*rsi], ax  # same subtract, unrolled for elements rsi .. rsi+3
  1581  	movzx	eax, word ptr [rdx + 2*rsi + 2]
  1582  	sub	ax, word ptr [rcx + 2*rsi + 2]
  1583  	mov	word ptr [r8 + 2*rsi + 2], ax
  1584  	movzx	eax, word ptr [rdx + 2*rsi + 4]
  1585  	sub	ax, word ptr [rcx + 2*rsi + 4]
  1586  	mov	word ptr [r8 + 2*rsi + 4], ax
  1587  	movzx	eax, word ptr [rdx + 2*rsi + 6]
  1588  	sub	ax, word ptr [rcx + 2*rsi + 6]
  1589  	mov	word ptr [r8 + 2*rsi + 6], ax
  1590  	add	rsi, 4  # four elements per iteration
  1591  	cmp	r10, rsi
  1592  	jne	.LBB0_452  # loop until the index equals the count
  1593  	jmp	.LBB0_1013
  1594  .LBB0_248:  # element-wise 16-bit add: r8[i] = rcx[i] + rdx[i], stored as 16 bits
  1595  	test	r9d, r9d
  1596  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1597  # %bb.249:
  1598  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1599  	cmp	r9d, 16
  1600  	jae	.LBB0_251  # count >= 16: continue at .LBB0_251 (outside this view; presumably a vectorized path)
  1601  # %bb.250:
  1602  	xor	esi, esi  # rsi = element index, starting at 0
  1603  .LBB0_260:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 16 path with rsi != 0)
  1604  	mov	r9, rsi
  1605  	not	r9
  1606  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1607  	mov	rdi, r10
  1608  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1609  	je	.LBB0_262
  1610  .LBB0_261:                              # =>This Inner Loop Header: Depth=1
  1611  	movzx	eax, word ptr [rcx + 2*rsi]
  1612  	add	ax, word ptr [rdx + 2*rsi]
  1613  	mov	word ptr [r8 + 2*rsi], ax  # r8[rsi] = rcx[rsi] + rdx[rsi], one 16-bit element
  1614  	add	rsi, 1
  1615  	add	rdi, -1
  1616  	jne	.LBB0_261
  1617  .LBB0_262:
  1618  	cmp	r9, 3
  1619  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_260; the loop above is taken to have finished them
  1620  .LBB0_263:                              # =>This Inner Loop Header: Depth=1
  1621  	movzx	eax, word ptr [rcx + 2*rsi]
  1622  	add	ax, word ptr [rdx + 2*rsi]
  1623  	mov	word ptr [r8 + 2*rsi], ax  # same add, unrolled for elements rsi .. rsi+3
  1624  	movzx	eax, word ptr [rcx + 2*rsi + 2]
  1625  	add	ax, word ptr [rdx + 2*rsi + 2]
  1626  	mov	word ptr [r8 + 2*rsi + 2], ax
  1627  	movzx	eax, word ptr [rcx + 2*rsi + 4]
  1628  	add	ax, word ptr [rdx + 2*rsi + 4]
  1629  	mov	word ptr [r8 + 2*rsi + 4], ax
  1630  	movzx	eax, word ptr [rcx + 2*rsi + 6]
  1631  	add	ax, word ptr [rdx + 2*rsi + 6]
  1632  	mov	word ptr [r8 + 2*rsi + 6], ax
  1633  	add	rsi, 4  # four elements per iteration
  1634  	cmp	r10, rsi
  1635  	jne	.LBB0_263  # loop until the index equals the count
  1636  	jmp	.LBB0_1013
  1637  .LBB0_264:  # element-wise 16-bit add: r8[i] = rcx[i] + rdx[i], stored as 16 bits
  1638  	test	r9d, r9d
  1639  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1640  # %bb.265:
  1641  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1642  	cmp	r9d, 16
  1643  	jae	.LBB0_267  # count >= 16: continue at .LBB0_267 (outside this view; presumably a vectorized path)
  1644  # %bb.266:
  1645  	xor	esi, esi  # rsi = element index, starting at 0
  1646  .LBB0_276:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 16 path with rsi != 0)
  1647  	mov	r9, rsi
  1648  	not	r9
  1649  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1650  	mov	rdi, r10
  1651  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1652  	je	.LBB0_278
  1653  .LBB0_277:                              # =>This Inner Loop Header: Depth=1
  1654  	movzx	eax, word ptr [rcx + 2*rsi]
  1655  	add	ax, word ptr [rdx + 2*rsi]
  1656  	mov	word ptr [r8 + 2*rsi], ax  # r8[rsi] = rcx[rsi] + rdx[rsi], one 16-bit element
  1657  	add	rsi, 1
  1658  	add	rdi, -1
  1659  	jne	.LBB0_277
  1660  .LBB0_278:
  1661  	cmp	r9, 3
  1662  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_276; the loop above is taken to have finished them
  1663  .LBB0_279:                              # =>This Inner Loop Header: Depth=1
  1664  	movzx	eax, word ptr [rcx + 2*rsi]
  1665  	add	ax, word ptr [rdx + 2*rsi]
  1666  	mov	word ptr [r8 + 2*rsi], ax  # same add, unrolled for elements rsi .. rsi+3
  1667  	movzx	eax, word ptr [rcx + 2*rsi + 2]
  1668  	add	ax, word ptr [rdx + 2*rsi + 2]
  1669  	mov	word ptr [r8 + 2*rsi + 2], ax
  1670  	movzx	eax, word ptr [rcx + 2*rsi + 4]
  1671  	add	ax, word ptr [rdx + 2*rsi + 4]
  1672  	mov	word ptr [r8 + 2*rsi + 4], ax
  1673  	movzx	eax, word ptr [rcx + 2*rsi + 6]
  1674  	add	ax, word ptr [rdx + 2*rsi + 6]
  1675  	mov	word ptr [r8 + 2*rsi + 6], ax
  1676  	add	rsi, 4  # four elements per iteration
  1677  	cmp	r10, rsi
  1678  	jne	.LBB0_279  # loop until the index equals the count
  1679  	jmp	.LBB0_1013
  1680  .LBB0_594:  # element-wise 16-bit subtract: r8[i] = rdx[i] - rcx[i], stored as 16 bits
  1681  	test	r9d, r9d
  1682  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1683  # %bb.595:
  1684  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1685  	cmp	r9d, 16
  1686  	jae	.LBB0_597  # count >= 16: continue at .LBB0_597 (outside this view; presumably a vectorized path)
  1687  # %bb.596:
  1688  	xor	esi, esi  # rsi = element index, starting at 0
  1689  .LBB0_606:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 16 path with rsi != 0)
  1690  	mov	r9, rsi
  1691  	not	r9
  1692  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1693  	mov	rdi, r10
  1694  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1695  	je	.LBB0_608
  1696  .LBB0_607:                              # =>This Inner Loop Header: Depth=1
  1697  	movzx	eax, word ptr [rdx + 2*rsi]
  1698  	sub	ax, word ptr [rcx + 2*rsi]
  1699  	mov	word ptr [r8 + 2*rsi], ax  # r8[rsi] = rdx[rsi] - rcx[rsi], one 16-bit element
  1700  	add	rsi, 1
  1701  	add	rdi, -1
  1702  	jne	.LBB0_607
  1703  .LBB0_608:
  1704  	cmp	r9, 3
  1705  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_606; the loop above is taken to have finished them
  1706  .LBB0_609:                              # =>This Inner Loop Header: Depth=1
  1707  	movzx	eax, word ptr [rdx + 2*rsi]
  1708  	sub	ax, word ptr [rcx + 2*rsi]
  1709  	mov	word ptr [r8 + 2*rsi], ax  # same subtract, unrolled for elements rsi .. rsi+3
  1710  	movzx	eax, word ptr [rdx + 2*rsi + 2]
  1711  	sub	ax, word ptr [rcx + 2*rsi + 2]
  1712  	mov	word ptr [r8 + 2*rsi + 2], ax
  1713  	movzx	eax, word ptr [rdx + 2*rsi + 4]
  1714  	sub	ax, word ptr [rcx + 2*rsi + 4]
  1715  	mov	word ptr [r8 + 2*rsi + 4], ax
  1716  	movzx	eax, word ptr [rdx + 2*rsi + 6]
  1717  	sub	ax, word ptr [rcx + 2*rsi + 6]
  1718  	mov	word ptr [r8 + 2*rsi + 6], ax
  1719  	add	rsi, 4  # four elements per iteration
  1720  	cmp	r10, rsi
  1721  	jne	.LBB0_609  # loop until the index equals the count
  1722  	jmp	.LBB0_1013
  1723  .LBB0_610:  # element-wise 16-bit subtract: r8[i] = rdx[i] - rcx[i], stored as 16 bits
  1724  	test	r9d, r9d
  1725  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1726  # %bb.611:
  1727  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1728  	cmp	r9d, 16
  1729  	jae	.LBB0_613  # count >= 16: continue at .LBB0_613 (outside this view; presumably a vectorized path)
  1730  # %bb.612:
  1731  	xor	esi, esi  # rsi = element index, starting at 0
  1732  .LBB0_622:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 16 path with rsi != 0)
  1733  	mov	r9, rsi
  1734  	not	r9
  1735  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1736  	mov	rdi, r10
  1737  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1738  	je	.LBB0_624
  1739  .LBB0_623:                              # =>This Inner Loop Header: Depth=1
  1740  	movzx	eax, word ptr [rdx + 2*rsi]
  1741  	sub	ax, word ptr [rcx + 2*rsi]
  1742  	mov	word ptr [r8 + 2*rsi], ax  # r8[rsi] = rdx[rsi] - rcx[rsi], one 16-bit element
  1743  	add	rsi, 1
  1744  	add	rdi, -1
  1745  	jne	.LBB0_623
  1746  .LBB0_624:
  1747  	cmp	r9, 3
  1748  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_622; the loop above is taken to have finished them
  1749  .LBB0_625:                              # =>This Inner Loop Header: Depth=1
  1750  	movzx	eax, word ptr [rdx + 2*rsi]
  1751  	sub	ax, word ptr [rcx + 2*rsi]
  1752  	mov	word ptr [r8 + 2*rsi], ax  # same subtract, unrolled for elements rsi .. rsi+3
  1753  	movzx	eax, word ptr [rdx + 2*rsi + 2]
  1754  	sub	ax, word ptr [rcx + 2*rsi + 2]
  1755  	mov	word ptr [r8 + 2*rsi + 2], ax
  1756  	movzx	eax, word ptr [rdx + 2*rsi + 4]
  1757  	sub	ax, word ptr [rcx + 2*rsi + 4]
  1758  	mov	word ptr [r8 + 2*rsi + 4], ax
  1759  	movzx	eax, word ptr [rdx + 2*rsi + 6]
  1760  	sub	ax, word ptr [rcx + 2*rsi + 6]
  1761  	mov	word ptr [r8 + 2*rsi + 6], ax
  1762  	add	rsi, 4  # four elements per iteration
  1763  	cmp	r10, rsi
  1764  	jne	.LBB0_625  # loop until the index equals the count
  1765  	jmp	.LBB0_1013
  1766  .LBB0_826:  # count setup for a loop whose body is outside this view (.LBB0_829 / .LBB0_831)
  1767  	test	r9d, r9d
  1768  	jle	.LBB0_1013  # r9d is used as a signed 32-bit count; <= 0 means no work
  1769  # %bb.827:
  1770  	mov	esi, r9d  # rsi = count, zero-extended to 64 bits
  1771  	lea	rdi, [rsi - 1]  # rdi = count - 1
  1772  	mov	r9d, esi
  1773  	and	r9d, 3  # r9 = count & 3 (leftover after groups of four)
  1774  	cmp	rdi, 3
  1775  	jae	.LBB0_829  # count >= 4: continue at .LBB0_829
  1776  # %bb.828:
  1777  	xor	edi, edi  # count < 4: rdi = 0 before continuing at .LBB0_831
  1778  	jmp	.LBB0_831
  1779  .LBB0_834:  # element-wise single-precision multiply: r8[i] = rcx[i] * rdx[i]
  1780  	test	r9d, r9d
  1781  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1782  # %bb.835:
  1783  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1784  	cmp	r9d, 8
  1785  	jae	.LBB0_837  # count >= 8: continue at .LBB0_837 (outside this view; presumably a vectorized path)
  1786  # %bb.836:
  1787  	xor	esi, esi  # rsi = element index, starting at 0
  1788  .LBB0_846:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 8 path with rsi != 0)
  1789  	mov	rax, rsi
  1790  	not	rax
  1791  	add	rax, r10  # rax = ~rsi + r10 = r10 - rsi - 1
  1792  	mov	rdi, r10
  1793  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1794  	je	.LBB0_848
  1795  .LBB0_847:                              # =>This Inner Loop Header: Depth=1
  1796  	movss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
  1797  	mulss	xmm0, dword ptr [rdx + 4*rsi]
  1798  	movss	dword ptr [r8 + 4*rsi], xmm0  # r8[rsi] = rcx[rsi] * rdx[rsi], one 32-bit float
  1799  	add	rsi, 1
  1800  	add	rdi, -1
  1801  	jne	.LBB0_847
  1802  .LBB0_848:
  1803  	cmp	rax, 3
  1804  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_846; the loop above is taken to have finished them
  1805  .LBB0_849:                              # =>This Inner Loop Header: Depth=1
  1806  	movss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
  1807  	mulss	xmm0, dword ptr [rdx + 4*rsi]
  1808  	movss	dword ptr [r8 + 4*rsi], xmm0  # same multiply, unrolled for elements rsi .. rsi+3
  1809  	movss	xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
  1810  	mulss	xmm0, dword ptr [rdx + 4*rsi + 4]
  1811  	movss	dword ptr [r8 + 4*rsi + 4], xmm0
  1812  	movss	xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
  1813  	mulss	xmm0, dword ptr [rdx + 4*rsi + 8]
  1814  	movss	dword ptr [r8 + 4*rsi + 8], xmm0
  1815  	movss	xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
  1816  	mulss	xmm0, dword ptr [rdx + 4*rsi + 12]
  1817  	movss	dword ptr [r8 + 4*rsi + 12], xmm0
  1818  	add	rsi, 4  # four elements per iteration
  1819  	cmp	r10, rsi
  1820  	jne	.LBB0_849  # loop until the index equals the count
  1821  	jmp	.LBB0_1013
  1822  .LBB0_976:  # count setup for a loop whose body is outside this view (.LBB0_979 / .LBB0_981)
  1823  	test	r9d, r9d
  1824  	jle	.LBB0_1013  # r9d is used as a signed 32-bit count; <= 0 means no work
  1825  # %bb.977:
  1826  	mov	esi, r9d  # rsi = count, zero-extended to 64 bits
  1827  	lea	rdi, [rsi - 1]  # rdi = count - 1
  1828  	mov	r9d, esi
  1829  	and	r9d, 3  # r9 = count & 3 (leftover after groups of four)
  1830  	cmp	rdi, 3
  1831  	jae	.LBB0_979  # count >= 4: continue at .LBB0_979
  1832  # %bb.978:
  1833  	xor	edi, edi  # count < 4: rdi = 0 before continuing at .LBB0_981
  1834  	jmp	.LBB0_981
  1835  .LBB0_984:  # element-wise single-precision multiply: r8[i] = rcx[i] * rdx[i]
  1836  	test	r9d, r9d
  1837  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1838  # %bb.985:
  1839  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1840  	cmp	r9d, 8
  1841  	jae	.LBB0_987  # count >= 8: continue at .LBB0_987 (outside this view; presumably a vectorized path)
  1842  # %bb.986:
  1843  	xor	esi, esi  # rsi = element index, starting at 0
  1844  .LBB0_996:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 8 path with rsi != 0)
  1845  	mov	rax, rsi
  1846  	not	rax
  1847  	add	rax, r10  # rax = ~rsi + r10 = r10 - rsi - 1
  1848  	mov	rdi, r10
  1849  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1850  	je	.LBB0_998
  1851  .LBB0_997:                              # =>This Inner Loop Header: Depth=1
  1852  	movss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
  1853  	mulss	xmm0, dword ptr [rdx + 4*rsi]
  1854  	movss	dword ptr [r8 + 4*rsi], xmm0  # r8[rsi] = rcx[rsi] * rdx[rsi], one 32-bit float
  1855  	add	rsi, 1
  1856  	add	rdi, -1
  1857  	jne	.LBB0_997
  1858  .LBB0_998:
  1859  	cmp	rax, 3
  1860  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_996; the loop above is taken to have finished them
  1861  .LBB0_999:                              # =>This Inner Loop Header: Depth=1
  1862  	movss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
  1863  	mulss	xmm0, dword ptr [rdx + 4*rsi]
  1864  	movss	dword ptr [r8 + 4*rsi], xmm0  # same multiply, unrolled for elements rsi .. rsi+3
  1865  	movss	xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
  1866  	mulss	xmm0, dword ptr [rdx + 4*rsi + 4]
  1867  	movss	dword ptr [r8 + 4*rsi + 4], xmm0
  1868  	movss	xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
  1869  	mulss	xmm0, dword ptr [rdx + 4*rsi + 8]
  1870  	movss	dword ptr [r8 + 4*rsi + 8], xmm0
  1871  	movss	xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
  1872  	mulss	xmm0, dword ptr [rdx + 4*rsi + 12]
  1873  	movss	dword ptr [r8 + 4*rsi + 12], xmm0
  1874  	add	rsi, 4  # four elements per iteration
  1875  	cmp	r10, rsi
  1876  	jne	.LBB0_999  # loop until the index equals the count
  1877  	jmp	.LBB0_1013
  1878  .LBB0_149:  # element-wise 64-bit add: r8[i] = rcx[i] + rdx[i]
  1879  	test	r9d, r9d
  1880  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1881  # %bb.150:
  1882  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1883  	cmp	r9d, 4
  1884  	jae	.LBB0_152  # count >= 4: continue at .LBB0_152 (outside this view; presumably a vectorized path)
  1885  # %bb.151:
  1886  	xor	esi, esi  # rsi = element index, starting at 0
  1887  .LBB0_161:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 4 path with rsi != 0)
  1888  	mov	r9, rsi
  1889  	not	r9
  1890  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1891  	mov	rdi, r10
  1892  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1893  	je	.LBB0_163
  1894  .LBB0_162:                              # =>This Inner Loop Header: Depth=1
  1895  	mov	rax, qword ptr [rcx + 8*rsi]
  1896  	add	rax, qword ptr [rdx + 8*rsi]
  1897  	mov	qword ptr [r8 + 8*rsi], rax  # r8[rsi] = rcx[rsi] + rdx[rsi], one 64-bit element
  1898  	add	rsi, 1
  1899  	add	rdi, -1
  1900  	jne	.LBB0_162
  1901  .LBB0_163:
  1902  	cmp	r9, 3
  1903  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_161; the loop above is taken to have finished them
  1904  .LBB0_164:                              # =>This Inner Loop Header: Depth=1
  1905  	mov	rax, qword ptr [rcx + 8*rsi]
  1906  	add	rax, qword ptr [rdx + 8*rsi]
  1907  	mov	qword ptr [r8 + 8*rsi], rax  # same add, unrolled for elements rsi .. rsi+3
  1908  	mov	rax, qword ptr [rcx + 8*rsi + 8]
  1909  	add	rax, qword ptr [rdx + 8*rsi + 8]
  1910  	mov	qword ptr [r8 + 8*rsi + 8], rax
  1911  	mov	rax, qword ptr [rcx + 8*rsi + 16]
  1912  	add	rax, qword ptr [rdx + 8*rsi + 16]
  1913  	mov	qword ptr [r8 + 8*rsi + 16], rax
  1914  	mov	rax, qword ptr [rcx + 8*rsi + 24]
  1915  	add	rax, qword ptr [rdx + 8*rsi + 24]
  1916  	mov	qword ptr [r8 + 8*rsi + 24], rax
  1917  	add	rsi, 4  # four elements per iteration
  1918  	cmp	r10, rsi
  1919  	jne	.LBB0_164  # loop until the index equals the count
  1920  	jmp	.LBB0_1013
  1921  .LBB0_165:  # element-wise single-precision add: r8[i] = rcx[i] + rdx[i]
  1922  	test	r9d, r9d
  1923  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1924  # %bb.166:
  1925  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1926  	cmp	r9d, 8
  1927  	jae	.LBB0_168  # count >= 8: continue at .LBB0_168 (outside this view; presumably a vectorized path)
  1928  # %bb.167:
  1929  	xor	esi, esi  # rsi = element index, starting at 0
  1930  .LBB0_177:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 8 path with rsi != 0)
  1931  	mov	rax, rsi
  1932  	not	rax
  1933  	add	rax, r10  # rax = ~rsi + r10 = r10 - rsi - 1
  1934  	mov	rdi, r10
  1935  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1936  	je	.LBB0_179
  1937  .LBB0_178:                              # =>This Inner Loop Header: Depth=1
  1938  	movss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
  1939  	addss	xmm0, dword ptr [rdx + 4*rsi]
  1940  	movss	dword ptr [r8 + 4*rsi], xmm0  # r8[rsi] = rcx[rsi] + rdx[rsi], one 32-bit float
  1941  	add	rsi, 1
  1942  	add	rdi, -1
  1943  	jne	.LBB0_178
  1944  .LBB0_179:
  1945  	cmp	rax, 3
  1946  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_177; the loop above is taken to have finished them
  1947  .LBB0_180:                              # =>This Inner Loop Header: Depth=1
  1948  	movss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
  1949  	addss	xmm0, dword ptr [rdx + 4*rsi]
  1950  	movss	dword ptr [r8 + 4*rsi], xmm0  # same add, unrolled for elements rsi .. rsi+3
  1951  	movss	xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
  1952  	addss	xmm0, dword ptr [rdx + 4*rsi + 4]
  1953  	movss	dword ptr [r8 + 4*rsi + 4], xmm0
  1954  	movss	xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
  1955  	addss	xmm0, dword ptr [rdx + 4*rsi + 8]
  1956  	movss	dword ptr [r8 + 4*rsi + 8], xmm0
  1957  	movss	xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
  1958  	addss	xmm0, dword ptr [rdx + 4*rsi + 12]
  1959  	movss	dword ptr [r8 + 4*rsi + 12], xmm0
  1960  	add	rsi, 4  # four elements per iteration
  1961  	cmp	r10, rsi
  1962  	jne	.LBB0_180  # loop until the index equals the count
  1963  	jmp	.LBB0_1013
  1964  .LBB0_495:  # element-wise 64-bit subtract: r8[i] = rdx[i] - rcx[i]
  1965  	test	r9d, r9d
  1966  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  1967  # %bb.496:
  1968  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  1969  	cmp	r9d, 4
  1970  	jae	.LBB0_498  # count >= 4: continue at .LBB0_498 (outside this view; presumably a vectorized path)
  1971  # %bb.497:
  1972  	xor	esi, esi  # rsi = element index, starting at 0
  1973  .LBB0_507:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 4 path with rsi != 0)
  1974  	mov	r9, rsi
  1975  	not	r9
  1976  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  1977  	mov	rdi, r10
  1978  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  1979  	je	.LBB0_509
  1980  .LBB0_508:                              # =>This Inner Loop Header: Depth=1
  1981  	mov	rax, qword ptr [rdx + 8*rsi]
  1982  	sub	rax, qword ptr [rcx + 8*rsi]
  1983  	mov	qword ptr [r8 + 8*rsi], rax  # r8[rsi] = rdx[rsi] - rcx[rsi], one 64-bit element
  1984  	add	rsi, 1
  1985  	add	rdi, -1
  1986  	jne	.LBB0_508
  1987  .LBB0_509:
  1988  	cmp	r9, 3
  1989  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_507; the loop above is taken to have finished them
  1990  .LBB0_510:                              # =>This Inner Loop Header: Depth=1
  1991  	mov	rax, qword ptr [rdx + 8*rsi]
  1992  	sub	rax, qword ptr [rcx + 8*rsi]
  1993  	mov	qword ptr [r8 + 8*rsi], rax  # same subtract, unrolled for elements rsi .. rsi+3
  1994  	mov	rax, qword ptr [rdx + 8*rsi + 8]
  1995  	sub	rax, qword ptr [rcx + 8*rsi + 8]
  1996  	mov	qword ptr [r8 + 8*rsi + 8], rax
  1997  	mov	rax, qword ptr [rdx + 8*rsi + 16]
  1998  	sub	rax, qword ptr [rcx + 8*rsi + 16]
  1999  	mov	qword ptr [r8 + 8*rsi + 16], rax
  2000  	mov	rax, qword ptr [rdx + 8*rsi + 24]
  2001  	sub	rax, qword ptr [rcx + 8*rsi + 24]
  2002  	mov	qword ptr [r8 + 8*rsi + 24], rax
  2003  	add	rsi, 4  # four elements per iteration
  2004  	cmp	r10, rsi
  2005  	jne	.LBB0_510  # loop until the index equals the count
  2006  	jmp	.LBB0_1013
  2007  .LBB0_511:  # element-wise single-precision subtract: r8[i] = rdx[i] - rcx[i]
  2008  	test	r9d, r9d
  2009  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  2010  # %bb.512:
  2011  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  2012  	cmp	r9d, 8
  2013  	jae	.LBB0_514  # count >= 8: continue at .LBB0_514 (outside this view; presumably a vectorized path)
  2014  # %bb.513:
  2015  	xor	esi, esi  # rsi = element index, starting at 0
  2016  .LBB0_523:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 8 path with rsi != 0)
  2017  	mov	rax, rsi
  2018  	not	rax
  2019  	add	rax, r10  # rax = ~rsi + r10 = r10 - rsi - 1
  2020  	mov	rdi, r10
  2021  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  2022  	je	.LBB0_525
  2023  .LBB0_524:                              # =>This Inner Loop Header: Depth=1
  2024  	movss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
  2025  	subss	xmm0, dword ptr [rcx + 4*rsi]
  2026  	movss	dword ptr [r8 + 4*rsi], xmm0  # r8[rsi] = rdx[rsi] - rcx[rsi], one 32-bit float
  2027  	add	rsi, 1
  2028  	add	rdi, -1
  2029  	jne	.LBB0_524
  2030  .LBB0_525:
  2031  	cmp	rax, 3
  2032  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_523; the loop above is taken to have finished them
  2033  .LBB0_526:                              # =>This Inner Loop Header: Depth=1
  2034  	movss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
  2035  	subss	xmm0, dword ptr [rcx + 4*rsi]
  2036  	movss	dword ptr [r8 + 4*rsi], xmm0  # same subtract, unrolled for elements rsi .. rsi+3
  2037  	movss	xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
  2038  	subss	xmm0, dword ptr [rcx + 4*rsi + 4]
  2039  	movss	dword ptr [r8 + 4*rsi + 4], xmm0
  2040  	movss	xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
  2041  	subss	xmm0, dword ptr [rcx + 4*rsi + 8]
  2042  	movss	dword ptr [r8 + 4*rsi + 8], xmm0
  2043  	movss	xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
  2044  	subss	xmm0, dword ptr [rcx + 4*rsi + 12]
  2045  	movss	dword ptr [r8 + 4*rsi + 12], xmm0
  2046  	add	rsi, 4  # four elements per iteration
  2047  	cmp	r10, rsi
  2048  	jne	.LBB0_526  # loop until the index equals the count
  2049  	jmp	.LBB0_1013
  2050  .LBB0_322:  # element-wise 64-bit add: r8[i] = rcx[i] + rdx[i]
  2051  	test	r9d, r9d
  2052  	jle	.LBB0_1013  # r9d is used as a signed element count; <= 0 means no work
  2053  # %bb.323:
  2054  	mov	r10d, r9d  # r10 = count, zero-extended to 64 bits
  2055  	cmp	r9d, 4
  2056  	jae	.LBB0_325  # count >= 4: continue at .LBB0_325 (outside this view; presumably a vectorized path)
  2057  # %bb.324:
  2058  	xor	esi, esi  # rsi = element index, starting at 0
  2059  .LBB0_334:  # scalar path from index rsi up to r10 (presumably also entered from the count >= 4 path with rsi != 0)
  2060  	mov	r9, rsi
  2061  	not	r9
  2062  	add	r9, r10  # r9 = ~rsi + r10 = r10 - rsi - 1
  2063  	mov	rdi, r10
  2064  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
  2065  	je	.LBB0_336
  2066  .LBB0_335:                              # =>This Inner Loop Header: Depth=1
  2067  	mov	rax, qword ptr [rcx + 8*rsi]
  2068  	add	rax, qword ptr [rdx + 8*rsi]
  2069  	mov	qword ptr [r8 + 8*rsi], rax  # r8[rsi] = rcx[rsi] + rdx[rsi], one 64-bit element
  2070  	add	rsi, 1
  2071  	add	rdi, -1
  2072  	jne	.LBB0_335
  2073  .LBB0_336:
  2074  	cmp	r9, 3
  2075  	jb	.LBB0_1013  # at most 3 elements were pending at .LBB0_334; the loop above is taken to have finished them
  2076  .LBB0_337:                              # =>This Inner Loop Header: Depth=1
  2077  	mov	rax, qword ptr [rcx + 8*rsi]
  2078  	add	rax, qword ptr [rdx + 8*rsi]
  2079  	mov	qword ptr [r8 + 8*rsi], rax  # same add, unrolled for elements rsi .. rsi+3
  2080  	mov	rax, qword ptr [rcx + 8*rsi + 8]
  2081  	add	rax, qword ptr [rdx + 8*rsi + 8]
  2082  	mov	qword ptr [r8 + 8*rsi + 8], rax
  2083  	mov	rax, qword ptr [rcx + 8*rsi + 16]
  2084  	add	rax, qword ptr [rdx + 8*rsi + 16]
  2085  	mov	qword ptr [r8 + 8*rsi + 16], rax
  2086  	mov	rax, qword ptr [rcx + 8*rsi + 24]
  2087  	add	rax, qword ptr [rdx + 8*rsi + 24]
  2088  	mov	qword ptr [r8 + 8*rsi + 24], rax
  2089  	add	rsi, 4  # four elements per iteration
  2090  	cmp	r10, rsi
  2091  	jne	.LBB0_337  # loop until the index equals the count
  2092  	jmp	.LBB0_1013
  2093  .LBB0_338:                              # float32 add, scalar path: r8[i] = rcx[i] + rdx[i]
  2094  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2095  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2096  # %bb.339:
  2097  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2098  	cmp	r9d, 8
  2099  	jae	.LBB0_341                       # count >= 8: handled at .LBB0_341 (not shown; presumably the vectorized loop)
  2100  # %bb.340:
  2101  	xor	esi, esi                        # rsi = element index, starting at 0
  2102  .LBB0_350:                              # rsi = index of the first element still to process
  2103  	mov	rax, rsi
  2104  	not	rax
  2105  	add	rax, r10                        # rax = r10 - rsi - 1 (elements left, minus one)
  2106  	mov	rdi, r10
  2107  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2108  	je	.LBB0_352
  2109  .LBB0_351:                              # =>This Inner Loop Header: Depth=1
  2110  	movss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
  2111  	addss	xmm0, dword ptr [rdx + 4*rsi]
  2112  	movss	dword ptr [r8 + 4*rsi], xmm0    # r8[rsi] = rcx[rsi] + rdx[rsi]
  2113  	add	rsi, 1
  2114  	add	rdi, -1
  2115  	jne	.LBB0_351
  2116  .LBB0_352:
  2117  	cmp	rax, 3
  2118  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_350: skip the unrolled loop
  2119  .LBB0_353:                              # =>This Inner Loop Header: Depth=1
  2120  	movss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
  2121  	addss	xmm0, dword ptr [rdx + 4*rsi]
  2122  	movss	dword ptr [r8 + 4*rsi], xmm0
  2123  	movss	xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
  2124  	addss	xmm0, dword ptr [rdx + 4*rsi + 4]
  2125  	movss	dword ptr [r8 + 4*rsi + 4], xmm0
  2126  	movss	xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
  2127  	addss	xmm0, dword ptr [rdx + 4*rsi + 8]
  2128  	movss	dword ptr [r8 + 4*rsi + 8], xmm0
  2129  	movss	xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
  2130  	addss	xmm0, dword ptr [rdx + 4*rsi + 12]
  2131  	movss	dword ptr [r8 + 4*rsi + 12], xmm0
  2132  	add	rsi, 4                          # four elements per iteration
  2133  	cmp	r10, rsi
  2134  	jne	.LBB0_353                       # loop until rsi == count
  2135  	jmp	.LBB0_1013
  2136  .LBB0_668:                              # 64-bit integer subtract, scalar path: r8[i] = rdx[i] - rcx[i]
  2137  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2138  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2139  # %bb.669:
  2140  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2141  	cmp	r9d, 4
  2142  	jae	.LBB0_671                       # count >= 4: handled at .LBB0_671 (not shown; presumably the vectorized loop)
  2143  # %bb.670:
  2144  	xor	esi, esi                        # rsi = element index, starting at 0
  2145  .LBB0_680:                              # rsi = index of the first element still to process
  2146  	mov	r9, rsi
  2147  	not	r9
  2148  	add	r9, r10                         # r9 = r10 - rsi - 1 (elements left, minus one)
  2149  	mov	rdi, r10
  2150  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2151  	je	.LBB0_682
  2152  .LBB0_681:                              # =>This Inner Loop Header: Depth=1
  2153  	mov	rax, qword ptr [rdx + 8*rsi]
  2154  	sub	rax, qword ptr [rcx + 8*rsi]
  2155  	mov	qword ptr [r8 + 8*rsi], rax     # r8[rsi] = rdx[rsi] - rcx[rsi]
  2156  	add	rsi, 1
  2157  	add	rdi, -1
  2158  	jne	.LBB0_681
  2159  .LBB0_682:
  2160  	cmp	r9, 3
  2161  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_680: skip the unrolled loop
  2162  .LBB0_683:                              # =>This Inner Loop Header: Depth=1
  2163  	mov	rax, qword ptr [rdx + 8*rsi]
  2164  	sub	rax, qword ptr [rcx + 8*rsi]
  2165  	mov	qword ptr [r8 + 8*rsi], rax
  2166  	mov	rax, qword ptr [rdx + 8*rsi + 8]
  2167  	sub	rax, qword ptr [rcx + 8*rsi + 8]
  2168  	mov	qword ptr [r8 + 8*rsi + 8], rax
  2169  	mov	rax, qword ptr [rdx + 8*rsi + 16]
  2170  	sub	rax, qword ptr [rcx + 8*rsi + 16]
  2171  	mov	qword ptr [r8 + 8*rsi + 16], rax
  2172  	mov	rax, qword ptr [rdx + 8*rsi + 24]
  2173  	sub	rax, qword ptr [rcx + 8*rsi + 24]
  2174  	mov	qword ptr [r8 + 8*rsi + 24], rax
  2175  	add	rsi, 4                          # four elements per iteration
  2176  	cmp	r10, rsi
  2177  	jne	.LBB0_683                       # loop until rsi == count
  2178  	jmp	.LBB0_1013
  2179  .LBB0_684:                              # float32 subtract, scalar path: r8[i] = rdx[i] - rcx[i]
  2180  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2181  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2182  # %bb.685:
  2183  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2184  	cmp	r9d, 8
  2185  	jae	.LBB0_687                       # count >= 8: handled at .LBB0_687 (not shown; presumably the vectorized loop)
  2186  # %bb.686:
  2187  	xor	esi, esi                        # rsi = element index, starting at 0
  2188  .LBB0_696:                              # rsi = index of the first element still to process
  2189  	mov	rax, rsi
  2190  	not	rax
  2191  	add	rax, r10                        # rax = r10 - rsi - 1 (elements left, minus one)
  2192  	mov	rdi, r10
  2193  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2194  	je	.LBB0_698
  2195  .LBB0_697:                              # =>This Inner Loop Header: Depth=1
  2196  	movss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
  2197  	subss	xmm0, dword ptr [rcx + 4*rsi]
  2198  	movss	dword ptr [r8 + 4*rsi], xmm0    # r8[rsi] = rdx[rsi] - rcx[rsi]
  2199  	add	rsi, 1
  2200  	add	rdi, -1
  2201  	jne	.LBB0_697
  2202  .LBB0_698:
  2203  	cmp	rax, 3
  2204  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_696: skip the unrolled loop
  2205  .LBB0_699:                              # =>This Inner Loop Header: Depth=1
  2206  	movss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
  2207  	subss	xmm0, dword ptr [rcx + 4*rsi]
  2208  	movss	dword ptr [r8 + 4*rsi], xmm0
  2209  	movss	xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
  2210  	subss	xmm0, dword ptr [rcx + 4*rsi + 4]
  2211  	movss	dword ptr [r8 + 4*rsi + 4], xmm0
  2212  	movss	xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
  2213  	subss	xmm0, dword ptr [rcx + 4*rsi + 8]
  2214  	movss	dword ptr [r8 + 4*rsi + 8], xmm0
  2215  	movss	xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
  2216  	subss	xmm0, dword ptr [rcx + 4*rsi + 12]
  2217  	movss	dword ptr [r8 + 4*rsi + 12], xmm0
  2218  	add	rsi, 4                          # four elements per iteration
  2219  	cmp	r10, rsi
  2220  	jne	.LBB0_699                       # loop until rsi == count
  2221  	jmp	.LBB0_1013
  2222  .LBB0_731:                              # 8-bit multiply, scalar path: r8[i] = low byte of rcx[i] * rdx[i]
  2223  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2224  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2225  # %bb.732:
  2226  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2227  	cmp	r9d, 32
  2228  	jae	.LBB0_734                       # count >= 32: handled at .LBB0_734 (not shown; presumably the vectorized loop)
  2229  # %bb.733:
  2230  	xor	edi, edi                        # rdi = element index here (rsi is the peel counter below)
  2231  .LBB0_743:                              # rdi = index of the first element still to process
  2232  	mov	r9, rdi
  2233  	not	r9
  2234  	add	r9, r10                         # r9 = r10 - rdi - 1 (elements left, minus one)
  2235  	mov	rsi, r10
  2236  	and	rsi, 3                          # rsi = count & 3 = trip count of the one-element loop
  2237  	je	.LBB0_745
  2238  .LBB0_744:                              # =>This Inner Loop Header: Depth=1
  2239  	movzx	eax, byte ptr [rcx + rdi]
  2240  	mul	byte ptr [rdx + rdi]            # ax = al * byte operand (unsigned 8x8 -> 16)
  2241  	mov	byte ptr [r8 + rdi], al         # only the low byte of the product is stored
  2242  	add	rdi, 1
  2243  	add	rsi, -1
  2244  	jne	.LBB0_744
  2245  .LBB0_745:
  2246  	cmp	r9, 3
  2247  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_743: skip the unrolled loop
  2248  .LBB0_746:                              # =>This Inner Loop Header: Depth=1
  2249  	movzx	eax, byte ptr [rcx + rdi]
  2250  	mul	byte ptr [rdx + rdi]
  2251  	mov	byte ptr [r8 + rdi], al
  2252  	movzx	eax, byte ptr [rcx + rdi + 1]
  2253  	mul	byte ptr [rdx + rdi + 1]
  2254  	mov	byte ptr [r8 + rdi + 1], al
  2255  	movzx	eax, byte ptr [rcx + rdi + 2]
  2256  	mul	byte ptr [rdx + rdi + 2]
  2257  	mov	byte ptr [r8 + rdi + 2], al
  2258  	movzx	eax, byte ptr [rcx + rdi + 3]
  2259  	mul	byte ptr [rdx + rdi + 3]
  2260  	mov	byte ptr [r8 + rdi + 3], al
  2261  	add	rdi, 4                          # four elements per iteration
  2262  	cmp	r10, rdi
  2263  	jne	.LBB0_746                       # loop until rdi == count
  2264  	jmp	.LBB0_1013
  2265  .LBB0_881:                              # 8-bit multiply, scalar path: r8[i] = low byte of rcx[i] * rdx[i]
  2266  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2267  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2268  # %bb.882:
  2269  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2270  	cmp	r9d, 32
  2271  	jae	.LBB0_884                       # count >= 32: handled at .LBB0_884 (not shown; presumably the vectorized loop)
  2272  # %bb.883:
  2273  	xor	edi, edi                        # rdi = element index here (rsi is the peel counter below)
  2274  .LBB0_893:                              # rdi = index of the first element still to process
  2275  	mov	r9, rdi
  2276  	not	r9
  2277  	add	r9, r10                         # r9 = r10 - rdi - 1 (elements left, minus one)
  2278  	mov	rsi, r10
  2279  	and	rsi, 3                          # rsi = count & 3 = trip count of the one-element loop
  2280  	je	.LBB0_895
  2281  .LBB0_894:                              # =>This Inner Loop Header: Depth=1
  2282  	movzx	eax, byte ptr [rcx + rdi]
  2283  	mul	byte ptr [rdx + rdi]            # ax = al * byte operand (unsigned 8x8 -> 16)
  2284  	mov	byte ptr [r8 + rdi], al         # only the low byte of the product is stored
  2285  	add	rdi, 1
  2286  	add	rsi, -1
  2287  	jne	.LBB0_894
  2288  .LBB0_895:
  2289  	cmp	r9, 3
  2290  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_893: skip the unrolled loop
  2291  .LBB0_896:                              # =>This Inner Loop Header: Depth=1
  2292  	movzx	eax, byte ptr [rcx + rdi]
  2293  	mul	byte ptr [rdx + rdi]
  2294  	mov	byte ptr [r8 + rdi], al
  2295  	movzx	eax, byte ptr [rcx + rdi + 1]
  2296  	mul	byte ptr [rdx + rdi + 1]
  2297  	mov	byte ptr [r8 + rdi + 1], al
  2298  	movzx	eax, byte ptr [rcx + rdi + 2]
  2299  	mul	byte ptr [rdx + rdi + 2]
  2300  	mov	byte ptr [r8 + rdi + 2], al
  2301  	movzx	eax, byte ptr [rcx + rdi + 3]
  2302  	mul	byte ptr [rdx + rdi + 3]
  2303  	mov	byte ptr [r8 + rdi + 3], al
  2304  	add	rdi, 4                          # four elements per iteration
  2305  	cmp	r10, rdi
  2306  	jne	.LBB0_896                       # loop until rdi == count
  2307  	jmp	.LBB0_1013
  2308  .LBB0_46:                               # 8-bit add, scalar path: r8[i] = rcx[i] + rdx[i]
  2309  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2310  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2311  # %bb.47:
  2312  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2313  	cmp	r9d, 32
  2314  	jae	.LBB0_49                        # count >= 32: handled at .LBB0_49 (not shown; presumably the vectorized loop)
  2315  # %bb.48:
  2316  	xor	esi, esi                        # rsi = element index, starting at 0
  2317  .LBB0_58:                               # rsi = index of the first element still to process
  2318  	mov	r9, rsi
  2319  	not	r9
  2320  	add	r9, r10                         # r9 = r10 - rsi - 1 (elements left, minus one)
  2321  	mov	rdi, r10
  2322  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2323  	je	.LBB0_60
  2324  .LBB0_59:                               # =>This Inner Loop Header: Depth=1
  2325  	movzx	eax, byte ptr [rcx + rsi]
  2326  	add	al, byte ptr [rdx + rsi]
  2327  	mov	byte ptr [r8 + rsi], al         # r8[rsi] = rcx[rsi] + rdx[rsi] (8-bit, wraps)
  2328  	add	rsi, 1
  2329  	add	rdi, -1
  2330  	jne	.LBB0_59
  2331  .LBB0_60:
  2332  	cmp	r9, 3
  2333  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_58: skip the unrolled loop
  2334  .LBB0_61:                               # =>This Inner Loop Header: Depth=1
  2335  	movzx	eax, byte ptr [rcx + rsi]
  2336  	add	al, byte ptr [rdx + rsi]
  2337  	mov	byte ptr [r8 + rsi], al
  2338  	movzx	eax, byte ptr [rcx + rsi + 1]
  2339  	add	al, byte ptr [rdx + rsi + 1]
  2340  	mov	byte ptr [r8 + rsi + 1], al
  2341  	movzx	eax, byte ptr [rcx + rsi + 2]
  2342  	add	al, byte ptr [rdx + rsi + 2]
  2343  	mov	byte ptr [r8 + rsi + 2], al
  2344  	movzx	eax, byte ptr [rcx + rsi + 3]
  2345  	add	al, byte ptr [rdx + rsi + 3]
  2346  	mov	byte ptr [r8 + rsi + 3], al
  2347  	add	rsi, 4                          # four elements per iteration
  2348  	cmp	r10, rsi
  2349  	jne	.LBB0_61                        # loop until rsi == count
  2350  	jmp	.LBB0_1013
  2351  .LBB0_392:                              # 8-bit subtract, scalar path: r8[i] = rdx[i] - rcx[i]
  2352  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2353  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2354  # %bb.393:
  2355  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2356  	cmp	r9d, 32
  2357  	jae	.LBB0_395                       # count >= 32: handled at .LBB0_395 (not shown; presumably the vectorized loop)
  2358  # %bb.394:
  2359  	xor	esi, esi                        # rsi = element index, starting at 0
  2360  .LBB0_404:                              # rsi = index of the first element still to process
  2361  	mov	r9, rsi
  2362  	not	r9
  2363  	add	r9, r10                         # r9 = r10 - rsi - 1 (elements left, minus one)
  2364  	mov	rdi, r10
  2365  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2366  	je	.LBB0_406
  2367  .LBB0_405:                              # =>This Inner Loop Header: Depth=1
  2368  	movzx	eax, byte ptr [rdx + rsi]
  2369  	sub	al, byte ptr [rcx + rsi]
  2370  	mov	byte ptr [r8 + rsi], al         # r8[rsi] = rdx[rsi] - rcx[rsi] (8-bit, wraps)
  2371  	add	rsi, 1
  2372  	add	rdi, -1
  2373  	jne	.LBB0_405
  2374  .LBB0_406:
  2375  	cmp	r9, 3
  2376  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_404: skip the unrolled loop
  2377  .LBB0_407:                              # =>This Inner Loop Header: Depth=1
  2378  	movzx	eax, byte ptr [rdx + rsi]
  2379  	sub	al, byte ptr [rcx + rsi]
  2380  	mov	byte ptr [r8 + rsi], al
  2381  	movzx	eax, byte ptr [rdx + rsi + 1]
  2382  	sub	al, byte ptr [rcx + rsi + 1]
  2383  	mov	byte ptr [r8 + rsi + 1], al
  2384  	movzx	eax, byte ptr [rdx + rsi + 2]
  2385  	sub	al, byte ptr [rcx + rsi + 2]
  2386  	mov	byte ptr [r8 + rsi + 2], al
  2387  	movzx	eax, byte ptr [rdx + rsi + 3]
  2388  	sub	al, byte ptr [rcx + rsi + 3]
  2389  	mov	byte ptr [r8 + rsi + 3], al
  2390  	add	rsi, 4                          # four elements per iteration
  2391  	cmp	r10, rsi
  2392  	jne	.LBB0_407                       # loop until rsi == count
  2393  	jmp	.LBB0_1013
  2394  .LBB0_219:                              # 8-bit add, scalar path: r8[i] = rcx[i] + rdx[i]
  2395  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2396  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2397  # %bb.220:
  2398  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2399  	cmp	r9d, 32
  2400  	jae	.LBB0_222                       # count >= 32: handled at .LBB0_222 (not shown; presumably the vectorized loop)
  2401  # %bb.221:
  2402  	xor	esi, esi                        # rsi = element index, starting at 0
  2403  .LBB0_231:                              # rsi = index of the first element still to process
  2404  	mov	r9, rsi
  2405  	not	r9
  2406  	add	r9, r10                         # r9 = r10 - rsi - 1 (elements left, minus one)
  2407  	mov	rdi, r10
  2408  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2409  	je	.LBB0_233
  2410  .LBB0_232:                              # =>This Inner Loop Header: Depth=1
  2411  	movzx	eax, byte ptr [rcx + rsi]
  2412  	add	al, byte ptr [rdx + rsi]
  2413  	mov	byte ptr [r8 + rsi], al         # r8[rsi] = rcx[rsi] + rdx[rsi] (8-bit, wraps)
  2414  	add	rsi, 1
  2415  	add	rdi, -1
  2416  	jne	.LBB0_232
  2417  .LBB0_233:
  2418  	cmp	r9, 3
  2419  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_231: skip the unrolled loop
  2420  .LBB0_234:                              # =>This Inner Loop Header: Depth=1
  2421  	movzx	eax, byte ptr [rcx + rsi]
  2422  	add	al, byte ptr [rdx + rsi]
  2423  	mov	byte ptr [r8 + rsi], al
  2424  	movzx	eax, byte ptr [rcx + rsi + 1]
  2425  	add	al, byte ptr [rdx + rsi + 1]
  2426  	mov	byte ptr [r8 + rsi + 1], al
  2427  	movzx	eax, byte ptr [rcx + rsi + 2]
  2428  	add	al, byte ptr [rdx + rsi + 2]
  2429  	mov	byte ptr [r8 + rsi + 2], al
  2430  	movzx	eax, byte ptr [rcx + rsi + 3]
  2431  	add	al, byte ptr [rdx + rsi + 3]
  2432  	mov	byte ptr [r8 + rsi + 3], al
  2433  	add	rsi, 4                          # four elements per iteration
  2434  	cmp	r10, rsi
  2435  	jne	.LBB0_234                       # loop until rsi == count
  2436  	jmp	.LBB0_1013
  2437  .LBB0_565:                              # 8-bit subtract, scalar path: r8[i] = rdx[i] - rcx[i]
  2438  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2439  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2440  # %bb.566:
  2441  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2442  	cmp	r9d, 32
  2443  	jae	.LBB0_568                       # count >= 32: handled at .LBB0_568 (not shown; presumably the vectorized loop)
  2444  # %bb.567:
  2445  	xor	esi, esi                        # rsi = element index, starting at 0
  2446  .LBB0_577:                              # rsi = index of the first element still to process
  2447  	mov	r9, rsi
  2448  	not	r9
  2449  	add	r9, r10                         # r9 = r10 - rsi - 1 (elements left, minus one)
  2450  	mov	rdi, r10
  2451  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2452  	je	.LBB0_579
  2453  .LBB0_578:                              # =>This Inner Loop Header: Depth=1
  2454  	movzx	eax, byte ptr [rdx + rsi]
  2455  	sub	al, byte ptr [rcx + rsi]
  2456  	mov	byte ptr [r8 + rsi], al         # r8[rsi] = rdx[rsi] - rcx[rsi] (8-bit, wraps)
  2457  	add	rsi, 1
  2458  	add	rdi, -1
  2459  	jne	.LBB0_578
  2460  .LBB0_579:
  2461  	cmp	r9, 3
  2462  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_577: skip the unrolled loop
  2463  .LBB0_580:                              # =>This Inner Loop Header: Depth=1
  2464  	movzx	eax, byte ptr [rdx + rsi]
  2465  	sub	al, byte ptr [rcx + rsi]
  2466  	mov	byte ptr [r8 + rsi], al
  2467  	movzx	eax, byte ptr [rdx + rsi + 1]
  2468  	sub	al, byte ptr [rcx + rsi + 1]
  2469  	mov	byte ptr [r8 + rsi + 1], al
  2470  	movzx	eax, byte ptr [rdx + rsi + 2]
  2471  	sub	al, byte ptr [rcx + rsi + 2]
  2472  	mov	byte ptr [r8 + rsi + 2], al
  2473  	movzx	eax, byte ptr [rdx + rsi + 3]
  2474  	sub	al, byte ptr [rcx + rsi + 3]
  2475  	mov	byte ptr [r8 + rsi + 3], al
  2476  	add	rsi, 4                          # four elements per iteration
  2477  	cmp	r10, rsi
  2478  	jne	.LBB0_580                       # loop until rsi == count
  2479  	jmp	.LBB0_1013
  2480  .LBB0_805:                              # 32-bit integer multiply, scalar path: r8[i] = rcx[i] * rdx[i]
  2481  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2482  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2483  # %bb.806:
  2484  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2485  	cmp	r9d, 8
  2486  	jae	.LBB0_808                       # count >= 8: handled at .LBB0_808 (not shown; presumably the vectorized loop)
  2487  # %bb.807:
  2488  	xor	esi, esi                        # rsi = element index, starting at 0
  2489  .LBB0_817:                              # rsi = index of the first element still to process
  2490  	mov	r9, rsi
  2491  	not	r9
  2492  	add	r9, r10                         # r9 = r10 - rsi - 1 (elements left, minus one)
  2493  	mov	rdi, r10
  2494  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2495  	je	.LBB0_819
  2496  .LBB0_818:                              # =>This Inner Loop Header: Depth=1
  2497  	mov	eax, dword ptr [rcx + 4*rsi]
  2498  	imul	eax, dword ptr [rdx + 4*rsi]    # low 32 bits of the product
  2499  	mov	dword ptr [r8 + 4*rsi], eax     # r8[rsi] = rcx[rsi] * rdx[rsi]
  2500  	add	rsi, 1
  2501  	add	rdi, -1
  2502  	jne	.LBB0_818
  2503  .LBB0_819:
  2504  	cmp	r9, 3
  2505  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_817: skip the unrolled loop
  2506  .LBB0_820:                              # =>This Inner Loop Header: Depth=1
  2507  	mov	eax, dword ptr [rcx + 4*rsi]
  2508  	imul	eax, dword ptr [rdx + 4*rsi]
  2509  	mov	dword ptr [r8 + 4*rsi], eax
  2510  	mov	eax, dword ptr [rcx + 4*rsi + 4]
  2511  	imul	eax, dword ptr [rdx + 4*rsi + 4]
  2512  	mov	dword ptr [r8 + 4*rsi + 4], eax
  2513  	mov	eax, dword ptr [rcx + 4*rsi + 8]
  2514  	imul	eax, dword ptr [rdx + 4*rsi + 8]
  2515  	mov	dword ptr [r8 + 4*rsi + 8], eax
  2516  	mov	eax, dword ptr [rcx + 4*rsi + 12]
  2517  	imul	eax, dword ptr [rdx + 4*rsi + 12]
  2518  	mov	dword ptr [r8 + 4*rsi + 12], eax
  2519  	add	rsi, 4                          # four elements per iteration
  2520  	cmp	r10, rsi
  2521  	jne	.LBB0_820                       # loop until rsi == count
  2522  	jmp	.LBB0_1013
  2523  .LBB0_955:                              # 32-bit integer multiply, scalar path: r8[i] = rcx[i] * rdx[i]
  2524  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2525  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2526  # %bb.956:
  2527  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2528  	cmp	r9d, 8
  2529  	jae	.LBB0_958                       # count >= 8: handled at .LBB0_958 (not shown; presumably the vectorized loop)
  2530  # %bb.957:
  2531  	xor	esi, esi                        # rsi = element index, starting at 0
  2532  .LBB0_967:                              # rsi = index of the first element still to process
  2533  	mov	r9, rsi
  2534  	not	r9
  2535  	add	r9, r10                         # r9 = r10 - rsi - 1 (elements left, minus one)
  2536  	mov	rdi, r10
  2537  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2538  	je	.LBB0_969
  2539  .LBB0_968:                              # =>This Inner Loop Header: Depth=1
  2540  	mov	eax, dword ptr [rcx + 4*rsi]
  2541  	imul	eax, dword ptr [rdx + 4*rsi]    # low 32 bits of the product
  2542  	mov	dword ptr [r8 + 4*rsi], eax     # r8[rsi] = rcx[rsi] * rdx[rsi]
  2543  	add	rsi, 1
  2544  	add	rdi, -1
  2545  	jne	.LBB0_968
  2546  .LBB0_969:
  2547  	cmp	r9, 3
  2548  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_967: skip the unrolled loop
  2549  .LBB0_970:                              # =>This Inner Loop Header: Depth=1
  2550  	mov	eax, dword ptr [rcx + 4*rsi]
  2551  	imul	eax, dword ptr [rdx + 4*rsi]
  2552  	mov	dword ptr [r8 + 4*rsi], eax
  2553  	mov	eax, dword ptr [rcx + 4*rsi + 4]
  2554  	imul	eax, dword ptr [rdx + 4*rsi + 4]
  2555  	mov	dword ptr [r8 + 4*rsi + 4], eax
  2556  	mov	eax, dword ptr [rcx + 4*rsi + 8]
  2557  	imul	eax, dword ptr [rdx + 4*rsi + 8]
  2558  	mov	dword ptr [r8 + 4*rsi + 8], eax
  2559  	mov	eax, dword ptr [rcx + 4*rsi + 12]
  2560  	imul	eax, dword ptr [rdx + 4*rsi + 12]
  2561  	mov	dword ptr [r8 + 4*rsi + 12], eax
  2562  	add	rsi, 4                          # four elements per iteration
  2563  	cmp	r10, rsi
  2564  	jne	.LBB0_970                       # loop until rsi == count
  2565  	jmp	.LBB0_1013
  2566  .LBB0_120:                              # 32-bit integer add, scalar path: r8[i] = rcx[i] + rdx[i]
  2567  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2568  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2569  # %bb.121:
  2570  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2571  	cmp	r9d, 8
  2572  	jae	.LBB0_123                       # count >= 8: handled at .LBB0_123 (not shown; presumably the vectorized loop)
  2573  # %bb.122:
  2574  	xor	esi, esi                        # rsi = element index, starting at 0
  2575  .LBB0_132:                              # rsi = index of the first element still to process
  2576  	mov	r9, rsi
  2577  	not	r9
  2578  	add	r9, r10                         # r9 = r10 - rsi - 1 (elements left, minus one)
  2579  	mov	rdi, r10
  2580  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2581  	je	.LBB0_134
  2582  .LBB0_133:                              # =>This Inner Loop Header: Depth=1
  2583  	mov	eax, dword ptr [rcx + 4*rsi]
  2584  	add	eax, dword ptr [rdx + 4*rsi]
  2585  	mov	dword ptr [r8 + 4*rsi], eax     # r8[rsi] = rcx[rsi] + rdx[rsi]
  2586  	add	rsi, 1
  2587  	add	rdi, -1
  2588  	jne	.LBB0_133
  2589  .LBB0_134:
  2590  	cmp	r9, 3
  2591  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_132: skip the unrolled loop
  2592  .LBB0_135:                              # =>This Inner Loop Header: Depth=1
  2593  	mov	eax, dword ptr [rcx + 4*rsi]
  2594  	add	eax, dword ptr [rdx + 4*rsi]
  2595  	mov	dword ptr [r8 + 4*rsi], eax
  2596  	mov	eax, dword ptr [rcx + 4*rsi + 4]
  2597  	add	eax, dword ptr [rdx + 4*rsi + 4]
  2598  	mov	dword ptr [r8 + 4*rsi + 4], eax
  2599  	mov	eax, dword ptr [rcx + 4*rsi + 8]
  2600  	add	eax, dword ptr [rdx + 4*rsi + 8]
  2601  	mov	dword ptr [r8 + 4*rsi + 8], eax
  2602  	mov	eax, dword ptr [rcx + 4*rsi + 12]
  2603  	add	eax, dword ptr [rdx + 4*rsi + 12]
  2604  	mov	dword ptr [r8 + 4*rsi + 12], eax
  2605  	add	rsi, 4                          # four elements per iteration
  2606  	cmp	r10, rsi
  2607  	jne	.LBB0_135                       # loop until rsi == count
  2608  	jmp	.LBB0_1013
  2609  .LBB0_466:                              # 32-bit integer subtract, scalar path: r8[i] = rdx[i] - rcx[i]
  2610  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2611  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2612  # %bb.467:
  2613  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2614  	cmp	r9d, 8
  2615  	jae	.LBB0_469                       # count >= 8: handled at .LBB0_469 (not shown; presumably the vectorized loop)
  2616  # %bb.468:
  2617  	xor	esi, esi                        # rsi = element index, starting at 0
  2618  .LBB0_478:                              # rsi = index of the first element still to process
  2619  	mov	r9, rsi
  2620  	not	r9
  2621  	add	r9, r10                         # r9 = r10 - rsi - 1 (elements left, minus one)
  2622  	mov	rdi, r10
  2623  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2624  	je	.LBB0_480
  2625  .LBB0_479:                              # =>This Inner Loop Header: Depth=1
  2626  	mov	eax, dword ptr [rdx + 4*rsi]
  2627  	sub	eax, dword ptr [rcx + 4*rsi]
  2628  	mov	dword ptr [r8 + 4*rsi], eax     # r8[rsi] = rdx[rsi] - rcx[rsi]
  2629  	add	rsi, 1
  2630  	add	rdi, -1
  2631  	jne	.LBB0_479
  2632  .LBB0_480:
  2633  	cmp	r9, 3
  2634  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_478: skip the unrolled loop
  2635  .LBB0_481:                              # =>This Inner Loop Header: Depth=1
  2636  	mov	eax, dword ptr [rdx + 4*rsi]
  2637  	sub	eax, dword ptr [rcx + 4*rsi]
  2638  	mov	dword ptr [r8 + 4*rsi], eax
  2639  	mov	eax, dword ptr [rdx + 4*rsi + 4]
  2640  	sub	eax, dword ptr [rcx + 4*rsi + 4]
  2641  	mov	dword ptr [r8 + 4*rsi + 4], eax
  2642  	mov	eax, dword ptr [rdx + 4*rsi + 8]
  2643  	sub	eax, dword ptr [rcx + 4*rsi + 8]
  2644  	mov	dword ptr [r8 + 4*rsi + 8], eax
  2645  	mov	eax, dword ptr [rdx + 4*rsi + 12]
  2646  	sub	eax, dword ptr [rcx + 4*rsi + 12]
  2647  	mov	dword ptr [r8 + 4*rsi + 12], eax
  2648  	add	rsi, 4                          # four elements per iteration
  2649  	cmp	r10, rsi
  2650  	jne	.LBB0_481                       # loop until rsi == count
  2651  	jmp	.LBB0_1013
  2652  .LBB0_293:                              # 32-bit integer add, scalar path: r8[i] = rcx[i] + rdx[i]
  2653  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2654  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2655  # %bb.294:
  2656  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2657  	cmp	r9d, 8
  2658  	jae	.LBB0_296                       # count >= 8: handled at .LBB0_296 (not shown; presumably the vectorized loop)
  2659  # %bb.295:
  2660  	xor	esi, esi                        # rsi = element index, starting at 0
  2661  .LBB0_305:                              # rsi = index of the first element still to process
  2662  	mov	r9, rsi
  2663  	not	r9
  2664  	add	r9, r10                         # r9 = r10 - rsi - 1 (elements left, minus one)
  2665  	mov	rdi, r10
  2666  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2667  	je	.LBB0_307
  2668  .LBB0_306:                              # =>This Inner Loop Header: Depth=1
  2669  	mov	eax, dword ptr [rcx + 4*rsi]
  2670  	add	eax, dword ptr [rdx + 4*rsi]
  2671  	mov	dword ptr [r8 + 4*rsi], eax     # r8[rsi] = rcx[rsi] + rdx[rsi]
  2672  	add	rsi, 1
  2673  	add	rdi, -1
  2674  	jne	.LBB0_306
  2675  .LBB0_307:
  2676  	cmp	r9, 3
  2677  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_305: skip the unrolled loop
  2678  .LBB0_308:                              # =>This Inner Loop Header: Depth=1
  2679  	mov	eax, dword ptr [rcx + 4*rsi]
  2680  	add	eax, dword ptr [rdx + 4*rsi]
  2681  	mov	dword ptr [r8 + 4*rsi], eax
  2682  	mov	eax, dword ptr [rcx + 4*rsi + 4]
  2683  	add	eax, dword ptr [rdx + 4*rsi + 4]
  2684  	mov	dword ptr [r8 + 4*rsi + 4], eax
  2685  	mov	eax, dword ptr [rcx + 4*rsi + 8]
  2686  	add	eax, dword ptr [rdx + 4*rsi + 8]
  2687  	mov	dword ptr [r8 + 4*rsi + 8], eax
  2688  	mov	eax, dword ptr [rcx + 4*rsi + 12]
  2689  	add	eax, dword ptr [rdx + 4*rsi + 12]
  2690  	mov	dword ptr [r8 + 4*rsi + 12], eax
  2691  	add	rsi, 4                          # four elements per iteration
  2692  	cmp	r10, rsi
  2693  	jne	.LBB0_308                       # loop until rsi == count
  2694  	jmp	.LBB0_1013
  2695  .LBB0_639:                              # 32-bit integer subtract, scalar path: r8[i] = rdx[i] - rcx[i]
  2696  	test	r9d, r9d                        # r9d = element count (signed 32-bit)
  2697  	jle	.LBB0_1013                      # count <= 0: nothing to do (.LBB0_1013 is outside this view; presumably the common exit)
  2698  # %bb.640:
  2699  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
  2700  	cmp	r9d, 8
  2701  	jae	.LBB0_642                       # count >= 8: handled at .LBB0_642 (not shown; presumably the vectorized loop)
  2702  # %bb.641:
  2703  	xor	esi, esi                        # rsi = element index, starting at 0
  2704  .LBB0_651:                              # rsi = index of the first element still to process
  2705  	mov	r9, rsi
  2706  	not	r9
  2707  	add	r9, r10                         # r9 = r10 - rsi - 1 (elements left, minus one)
  2708  	mov	rdi, r10
  2709  	and	rdi, 3                          # rdi = count & 3 = trip count of the one-element loop
  2710  	je	.LBB0_653
  2711  .LBB0_652:                              # =>This Inner Loop Header: Depth=1
  2712  	mov	eax, dword ptr [rdx + 4*rsi]
  2713  	sub	eax, dword ptr [rcx + 4*rsi]
  2714  	mov	dword ptr [r8 + 4*rsi], eax     # r8[rsi] = rdx[rsi] - rcx[rsi]
  2715  	add	rsi, 1
  2716  	add	rdi, -1
  2717  	jne	.LBB0_652
  2718  .LBB0_653:
  2719  	cmp	r9, 3
  2720  	jb	.LBB0_1013                      # fewer than 4 elements were left at .LBB0_651: skip the unrolled loop
  2721  .LBB0_654:                              # =>This Inner Loop Header: Depth=1
  2722  	mov	eax, dword ptr [rdx + 4*rsi]
  2723  	sub	eax, dword ptr [rcx + 4*rsi]
  2724  	mov	dword ptr [r8 + 4*rsi], eax
  2725  	mov	eax, dword ptr [rdx + 4*rsi + 4]
  2726  	sub	eax, dword ptr [rcx + 4*rsi + 4]
  2727  	mov	dword ptr [r8 + 4*rsi + 4], eax
  2728  	mov	eax, dword ptr [rdx + 4*rsi + 8]
  2729  	sub	eax, dword ptr [rcx + 4*rsi + 8]
  2730  	mov	dword ptr [r8 + 4*rsi + 8], eax
  2731  	mov	eax, dword ptr [rdx + 4*rsi + 12]
  2732  	sub	eax, dword ptr [rcx + 4*rsi + 12]
  2733  	mov	dword ptr [r8 + 4*rsi + 12], eax
  2734  	add	rsi, 4                          # four elements per iteration
  2735  	cmp	r10, rsi
  2736  	jne	.LBB0_654                       # loop until rsi == count
  2737  	jmp	.LBB0_1013
  2738  .LBB0_792:                              # 32-bit integer multiply, vector path (pmulld): r8[i] = rcx[i] * rdx[i]; r10 = element count
  2739  	lea	rsi, [r8 + 4*r10]               # rsi = end of the output range
  2740  	lea	rax, [rdx + 4*r10]              # rax = end of the rdx input range
  2741  	cmp	rax, r8
  2742  	seta	r9b                             # r9b = rdx range ends above the output start
  2743  	lea	rax, [rcx + 4*r10]              # rax = end of the rcx input range
  2744  	cmp	rsi, rdx
  2745  	seta	r11b                            # r11b = output range ends above the rdx start
  2746  	cmp	rax, r8
  2747  	seta	al                              # al = rcx range ends above the output start
  2748  	cmp	rsi, rcx
  2749  	seta	dil                             # dil = output range ends above the rcx start
  2750  	xor	esi, esi                        # rsi = 0: start index if we fall back
  2751  	test	r9b, r11b
  2752  	jne	.LBB0_801                       # output overlaps the rdx input: use the scalar loop at .LBB0_801
  2753  # %bb.793:
  2754  	and	al, dil
  2755  	jne	.LBB0_801                       # output overlaps the rcx input: same fallback
  2756  # %bb.794:
  2757  	mov	esi, r10d
  2758  	and	esi, -8                         # rsi = count rounded down to a multiple of 8
  2759  	lea	rax, [rsi - 8]
  2760  	mov	r9, rax
  2761  	shr	r9, 3
  2762  	add	r9, 1                           # r9 = rsi / 8 = number of 8-element groups
  2763  	test	rax, rax
  2764  	je	.LBB0_795                       # exactly one group: skip the two-group loop (.LBB0_795 is outside this view)
  2765  # %bb.796:
  2766  	mov	rax, r9
  2767  	and	rax, -2
  2768  	neg	rax                             # rax = -(r9 rounded down to even); counts up to zero
  2769  	xor	edi, edi                        # rdi = element index
  2770  .LBB0_797:                              # =>This Inner Loop Header: Depth=1
  2771  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  2772  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  2773  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  2774  	pmulld	xmm2, xmm0                      # four 32-bit products, low 32 bits of each kept
  2775  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  2776  	pmulld	xmm0, xmm1
  2777  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2
  2778  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  2779  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]
  2780  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  2781  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  2782  	pmulld	xmm2, xmm0
  2783  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 48]
  2784  	pmulld	xmm0, xmm1
  2785  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm2
  2786  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm0
  2787  	add	rdi, 16                         # 16 elements (two groups of 8) per iteration
  2788  	add	rax, 2
  2789  	jne	.LBB0_797
  2790  	jmp	.LBB0_798                       # continuation is outside this view; presumably handles an odd leftover group and the scalar tail
  2791  .LBB0_942:                              # 32-bit integer multiply, vector path (pmulld): r8[i] = rcx[i] * rdx[i]; r10 is used as the element count
  2792  	lea	rsi, [r8 + 4*r10]               # rsi = end of the output range
  2793  	lea	rax, [rdx + 4*r10]              # rax = end of the rdx input range
  2794  	cmp	rax, r8
  2795  	seta	r9b                             # r9b = rdx range ends above the output start
  2796  	lea	rax, [rcx + 4*r10]              # rax = end of the rcx input range
  2797  	cmp	rsi, rdx
  2798  	seta	r11b                            # r11b = output range ends above the rdx start
  2799  	cmp	rax, r8
  2800  	seta	al                              # al = rcx range ends above the output start
  2801  	cmp	rsi, rcx
  2802  	seta	dil                             # dil = output range ends above the rcx start
  2803  	xor	esi, esi                        # rsi = 0: start index if we fall back
  2804  	test	r9b, r11b
  2805  	jne	.LBB0_951                       # output overlaps the rdx input: go to .LBB0_951 (not shown; presumably the scalar loop)
  2806  # %bb.943:
  2807  	and	al, dil
  2808  	jne	.LBB0_951                       # output overlaps the rcx input: same fallback
  2809  # %bb.944:
  2810  	mov	esi, r10d
  2811  	and	esi, -8                         # rsi = count rounded down to a multiple of 8
  2812  	lea	rax, [rsi - 8]
  2813  	mov	r9, rax
  2814  	shr	r9, 3
  2815  	add	r9, 1                           # r9 = rsi / 8 = number of 8-element groups
  2816  	test	rax, rax
  2817  	je	.LBB0_945                       # exactly one group: skip the two-group loop (.LBB0_945 is outside this view)
  2818  # %bb.946:
  2819  	mov	rax, r9
  2820  	and	rax, -2
  2821  	neg	rax                             # rax = -(r9 rounded down to even); counts up to zero
  2822  	xor	edi, edi                        # rdi = element index
  2823  .LBB0_947:                              # =>This Inner Loop Header: Depth=1
  2824  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  2825  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  2826  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  2827  	pmulld	xmm2, xmm0                      # four 32-bit products, low 32 bits of each kept
  2828  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  2829  	pmulld	xmm0, xmm1
  2830  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2
  2831  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  2832  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]
  2833  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  2834  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  2835  	pmulld	xmm2, xmm0
  2836  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 48]
  2837  	pmulld	xmm0, xmm1
  2838  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm2
  2839  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm0
  2840  	add	rdi, 16                         # 16 elements (two groups of 8) per iteration
  2841  	add	rax, 2
  2842  	jne	.LBB0_947
  2843  	jmp	.LBB0_948                       # continuation is outside this view; presumably handles an odd leftover group and the scalar tail
  2844  .LBB0_107:                              # 32-bit integer add, vector path (paddd): r8[i] = rcx[i] + rdx[i]; r10 is used as the element count
  2845  	lea	rsi, [r8 + 4*r10]               # rsi = end of the output range
  2846  	lea	rax, [rdx + 4*r10]              # rax = end of the rdx input range
  2847  	cmp	rax, r8
  2848  	seta	r9b                             # r9b = rdx range ends above the output start
  2849  	lea	rax, [rcx + 4*r10]              # rax = end of the rcx input range
  2850  	cmp	rsi, rdx
  2851  	seta	r11b                            # r11b = output range ends above the rdx start
  2852  	cmp	rax, r8
  2853  	seta	al                              # al = rcx range ends above the output start
  2854  	cmp	rsi, rcx
  2855  	seta	dil                             # dil = output range ends above the rcx start
  2856  	xor	esi, esi                        # rsi = 0: start index if we fall back
  2857  	test	r9b, r11b
  2858  	jne	.LBB0_116                       # output overlaps the rdx input: go to .LBB0_116 (not shown; presumably the scalar loop)
  2859  # %bb.108:
  2860  	and	al, dil
  2861  	jne	.LBB0_116                       # output overlaps the rcx input: same fallback
  2862  # %bb.109:
  2863  	mov	esi, r10d
  2864  	and	esi, -8                         # rsi = count rounded down to a multiple of 8
  2865  	lea	rax, [rsi - 8]
  2866  	mov	r9, rax
  2867  	shr	r9, 3
  2868  	add	r9, 1                           # r9 = rsi / 8 = number of 8-element groups
  2869  	test	rax, rax
  2870  	je	.LBB0_110                       # exactly one group: skip the two-group loop (.LBB0_110 is outside this view)
  2871  # %bb.111:
  2872  	mov	rax, r9
  2873  	and	rax, -2
  2874  	neg	rax                             # rax = -(r9 rounded down to even); counts up to zero
  2875  	xor	edi, edi                        # rdi = element index
  2876  .LBB0_112:                              # =>This Inner Loop Header: Depth=1
  2877  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  2878  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  2879  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  2880  	paddd	xmm2, xmm0                      # four 32-bit sums
  2881  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  2882  	paddd	xmm0, xmm1
  2883  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2
  2884  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  2885  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]
  2886  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  2887  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  2888  	paddd	xmm2, xmm0
  2889  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 48]
  2890  	paddd	xmm0, xmm1
  2891  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm2
  2892  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm0
  2893  	add	rdi, 16                         # 16 elements (two groups of 8) per iteration
  2894  	add	rax, 2
  2895  	jne	.LBB0_112
  2896  	jmp	.LBB0_113                       # continuation is outside this view; presumably handles an odd leftover group and the scalar tail
  2897  .LBB0_453:                              # 32-bit integer subtract, vector path (psubd): r8[i] = rdx[i] - rcx[i]; r10 is used as the element count
  2898  	lea	rsi, [r8 + 4*r10]               # rsi = end of the output range
  2899  	lea	rax, [rdx + 4*r10]              # rax = end of the rdx input range
  2900  	cmp	rax, r8
  2901  	seta	r9b                             # r9b = rdx range ends above the output start
  2902  	lea	rax, [rcx + 4*r10]              # rax = end of the rcx input range
  2903  	cmp	rsi, rdx
  2904  	seta	r11b                            # r11b = output range ends above the rdx start
  2905  	cmp	rax, r8
  2906  	seta	al                              # al = rcx range ends above the output start
  2907  	cmp	rsi, rcx
  2908  	seta	dil                             # dil = output range ends above the rcx start
  2909  	xor	esi, esi                        # rsi = 0: start index if we fall back
  2910  	test	r9b, r11b
  2911  	jne	.LBB0_462                       # output overlaps the rdx input: go to .LBB0_462 (not shown; presumably the scalar loop)
  2912  # %bb.454:
  2913  	and	al, dil
  2914  	jne	.LBB0_462                       # output overlaps the rcx input: same fallback
  2915  # %bb.455:
  2916  	mov	esi, r10d
  2917  	and	esi, -8                         # rsi = count rounded down to a multiple of 8
  2918  	lea	rax, [rsi - 8]
  2919  	mov	r9, rax
  2920  	shr	r9, 3
  2921  	add	r9, 1                           # r9 = rsi / 8 = number of 8-element groups
  2922  	test	rax, rax
  2923  	je	.LBB0_456                       # exactly one group: skip the two-group loop (.LBB0_456 is outside this view)
  2924  # %bb.457:
  2925  	mov	rax, r9
  2926  	and	rax, -2
  2927  	neg	rax                             # rax = -(r9 rounded down to even); counts up to zero
  2928  	xor	edi, edi                        # rdi = element index
  2929  .LBB0_458:                              # =>This Inner Loop Header: Depth=1
  2930  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  2931  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  2932  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  2933  	psubd	xmm0, xmm2                      # four 32-bit differences: rdx lanes minus rcx lanes
  2934  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
  2935  	psubd	xmm1, xmm2
  2936  	movdqu	xmmword ptr [r8 + 4*rdi], xmm0
  2937  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm1
  2938  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]
  2939  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  2940  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  2941  	psubd	xmm0, xmm2
  2942  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
  2943  	psubd	xmm1, xmm2
  2944  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm0
  2945  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm1
  2946  	add	rdi, 16                         # 16 elements (two groups of 8) per iteration
  2947  	add	rax, 2
  2948  	jne	.LBB0_458
  2949  	jmp	.LBB0_459                       # continuation is outside this view; presumably handles an odd leftover group and the scalar tail
  2950  .LBB0_280:                              # 32-bit integer add, vector path (paddd): r8[i] = rcx[i] + rdx[i]; r10 is used as the element count
  2951  	lea	rsi, [r8 + 4*r10]               # rsi = end of the output range
  2952  	lea	rax, [rdx + 4*r10]              # rax = end of the rdx input range
  2953  	cmp	rax, r8
  2954  	seta	r9b                             # r9b = rdx range ends above the output start
  2955  	lea	rax, [rcx + 4*r10]              # rax = end of the rcx input range
  2956  	cmp	rsi, rdx
  2957  	seta	r11b                            # r11b = output range ends above the rdx start
  2958  	cmp	rax, r8
  2959  	seta	al                              # al = rcx range ends above the output start
  2960  	cmp	rsi, rcx
  2961  	seta	dil                             # dil = output range ends above the rcx start
  2962  	xor	esi, esi                        # rsi = 0: start index if we fall back
  2963  	test	r9b, r11b
  2964  	jne	.LBB0_289                       # output overlaps the rdx input: go to .LBB0_289 (not shown; presumably the scalar loop)
  2965  # %bb.281:
  2966  	and	al, dil
  2967  	jne	.LBB0_289                       # output overlaps the rcx input: same fallback
  2968  # %bb.282:
  2969  	mov	esi, r10d
  2970  	and	esi, -8                         # rsi = count rounded down to a multiple of 8
  2971  	lea	rax, [rsi - 8]
  2972  	mov	r9, rax
  2973  	shr	r9, 3
  2974  	add	r9, 1                           # r9 = rsi / 8 = number of 8-element groups
  2975  	test	rax, rax
  2976  	je	.LBB0_283                       # exactly one group: skip the two-group loop (.LBB0_283 is outside this view)
  2977  # %bb.284:
  2978  	mov	rax, r9
  2979  	and	rax, -2
  2980  	neg	rax                             # rax = -(r9 rounded down to even); counts up to zero
  2981  	xor	edi, edi                        # rdi = element index
  2982  .LBB0_285:                              # =>This Inner Loop Header: Depth=1
  2983  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  2984  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  2985  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  2986  	paddd	xmm2, xmm0                      # four 32-bit sums
  2987  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  2988  	paddd	xmm0, xmm1
  2989  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2
  2990  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  2991  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]
  2992  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  2993  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  2994  	paddd	xmm2, xmm0
  2995  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 48]
  2996  	paddd	xmm0, xmm1
  2997  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm2
  2998  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm0
  2999  	add	rdi, 16                         # 16 elements (two groups of 8) per iteration
  3000  	add	rax, 2
  3001  	jne	.LBB0_285
  3002  	jmp	.LBB0_286                       # continuation is outside this view; presumably handles an odd leftover group and the scalar tail
  3003  .LBB0_626:                              # 32-bit integer subtract, vector path (psubd): r8[i] = rdx[i] - rcx[i]; r10 is used as the element count
  3004  	lea	rsi, [r8 + 4*r10]               # rsi = end of the output range
  3005  	lea	rax, [rdx + 4*r10]              # rax = end of the rdx input range
  3006  	cmp	rax, r8
  3007  	seta	r9b                             # r9b = rdx range ends above the output start
  3008  	lea	rax, [rcx + 4*r10]              # rax = end of the rcx input range
  3009  	cmp	rsi, rdx
  3010  	seta	r11b                            # r11b = output range ends above the rdx start
  3011  	cmp	rax, r8
  3012  	seta	al                              # al = rcx range ends above the output start
  3013  	cmp	rsi, rcx
  3014  	seta	dil                             # dil = output range ends above the rcx start
  3015  	xor	esi, esi                        # rsi = 0: start index if we fall back
  3016  	test	r9b, r11b
  3017  	jne	.LBB0_635                       # output overlaps the rdx input: go to .LBB0_635 (not shown; presumably the scalar loop)
  3018  # %bb.627:
  3019  	and	al, dil
  3020  	jne	.LBB0_635                       # output overlaps the rcx input: same fallback
  3021  # %bb.628:
  3022  	mov	esi, r10d
  3023  	and	esi, -8                         # rsi = count rounded down to a multiple of 8
  3024  	lea	rax, [rsi - 8]
  3025  	mov	r9, rax
  3026  	shr	r9, 3
  3027  	add	r9, 1                           # r9 = rsi / 8 = number of 8-element groups
  3028  	test	rax, rax
  3029  	je	.LBB0_629                       # exactly one group: skip the two-group loop (.LBB0_629 is outside this view)
  3030  # %bb.630:
  3031  	mov	rax, r9
  3032  	and	rax, -2
  3033  	neg	rax                             # rax = -(r9 rounded down to even); counts up to zero
  3034  	xor	edi, edi                        # rdi = element index
  3035  .LBB0_631:                              # =>This Inner Loop Header: Depth=1
  3036  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  3037  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  3038  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  3039  	psubd	xmm0, xmm2                      # four 32-bit differences: rdx lanes minus rcx lanes
  3040  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
  3041  	psubd	xmm1, xmm2
  3042  	movdqu	xmmword ptr [r8 + 4*rdi], xmm0
  3043  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm1
  3044  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]
  3045  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  3046  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  3047  	psubd	xmm0, xmm2
  3048  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
  3049  	psubd	xmm1, xmm2
  3050  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm0
  3051  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm1
  3052  	add	rdi, 16                         # 16 elements (two groups of 8) per iteration
  3053  	add	rax, 2
  3054  	jne	.LBB0_631
  3055  	jmp	.LBB0_632                       # continuation is outside this view; presumably handles an odd leftover group and the scalar tail
  3056  .LBB0_850:                              # float64 multiply, vector path (mulpd): r8[i] = rcx[i] * rdx[i]; r10 is used as the element count
  3057  	lea	rsi, [r8 + 8*r10]               # rsi = end of the output range
  3058  	lea	rax, [rdx + 8*r10]              # rax = end of the rdx input range
  3059  	cmp	rax, r8
  3060  	seta	r9b                             # r9b = rdx range ends above the output start
  3061  	lea	rax, [rcx + 8*r10]              # rax = end of the rcx input range
  3062  	cmp	rsi, rdx
  3063  	seta	r11b                            # r11b = output range ends above the rdx start
  3064  	cmp	rax, r8
  3065  	seta	al                              # al = rcx range ends above the output start
  3066  	cmp	rsi, rcx
  3067  	seta	dil                             # dil = output range ends above the rcx start
  3068  	xor	esi, esi                        # rsi = 0: start index if we fall back
  3069  	test	r9b, r11b
  3070  	jne	.LBB0_859                       # output overlaps the rdx input: go to .LBB0_859 (not shown; presumably the scalar loop)
  3071  # %bb.851:
  3072  	and	al, dil
  3073  	jne	.LBB0_859                       # output overlaps the rcx input: same fallback
  3074  # %bb.852:
  3075  	mov	esi, r10d
  3076  	and	esi, -4                         # rsi = count rounded down to a multiple of 4
  3077  	lea	rax, [rsi - 4]
  3078  	mov	r9, rax
  3079  	shr	r9, 2
  3080  	add	r9, 1                           # r9 = rsi / 4 = number of 4-element groups
  3081  	test	rax, rax
  3082  	je	.LBB0_853                       # exactly one group: skip the two-group loop (.LBB0_853 is outside this view)
  3083  # %bb.854:
  3084  	mov	rax, r9
  3085  	and	rax, -2
  3086  	neg	rax                             # rax = -(r9 rounded down to even); counts up to zero
  3087  	xor	edi, edi                        # rdi = element index
  3088  .LBB0_855:                              # =>This Inner Loop Header: Depth=1
  3089  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
  3090  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  3091  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
  3092  	mulpd	xmm2, xmm0                      # two float64 products
  3093  	movupd	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  3094  	mulpd	xmm0, xmm1
  3095  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
  3096  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm0
  3097  	movupd	xmm0, xmmword ptr [rdx + 8*rdi + 32]
  3098  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  3099  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  3100  	mulpd	xmm2, xmm0
  3101  	movupd	xmm0, xmmword ptr [rcx + 8*rdi + 48]
  3102  	mulpd	xmm0, xmm1
  3103  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
  3104  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm0
  3105  	add	rdi, 8                          # 8 elements (two groups of 4) per iteration
  3106  	add	rax, 2
  3107  	jne	.LBB0_855
  3108  	jmp	.LBB0_856                       # continuation is outside this view; presumably handles an odd leftover group and the scalar tail
  3109  .LBB0_1000:
        # Vector path for packed doubles: r8[i] = rcx[i] * rdx[i] (mulpd), 8-byte elements.
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 elements; on overlap control goes to .LBB0_1009 with esi = 0.
        # NOTE(review): .LBB0_1009, .LBB0_1003 and .LBB0_1006 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3110  	lea	rsi, [r8 + 8*r10]               # rsi = end of the r8 range (r10 elements)
  3111  	lea	rax, [rdx + 8*r10]
  3112  	cmp	rax, r8
  3113  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3114  	lea	rax, [rcx + 8*r10]
  3115  	cmp	rsi, rdx
  3116  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3117  	cmp	rax, r8
  3118  	seta	al                              # al = rcx range ends above the r8 start
  3119  	cmp	rsi, rcx
  3120  	seta	dil                             # dil = r8 range ends above the rcx start
  3121  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  3122  	test	r9b, r11b
  3123  	jne	.LBB0_1009                      # both set: r8 and rdx ranges overlap
  3124  # %bb.1001:
  3125  	and	al, dil
  3126  	jne	.LBB0_1009                      # both set: r8 and rcx ranges overlap
  3127  # %bb.1002:
  3128  	mov	esi, r10d
  3129  	and	esi, -4                         # esi = r10d rounded down to a multiple of 4
  3130  	lea	rax, [rsi - 4]
  3131  	mov	r9, rax
  3132  	shr	r9, 2
  3133  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 groups of 4 elements
  3134  	test	rax, rax
  3135  	je	.LBB0_1003                      # rsi == 4: one group only, skip the two-group loop
  3136  # %bb.1004:
  3137  	mov	rax, r9
  3138  	and	rax, -2
  3139  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  3140  	xor	edi, edi                        # rdi = element index
  3141  .LBB0_1005:                             # =>This Inner Loop Header: Depth=1
  3142  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
  3143  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  3144  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
  3145  	mulpd	xmm2, xmm0
  3146  	movupd	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  3147  	mulpd	xmm0, xmm1
  3148  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
  3149  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm0
  3150  	movupd	xmm0, xmmword ptr [rdx + 8*rdi + 32]
  3151  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  3152  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  3153  	mulpd	xmm2, xmm0
  3154  	movupd	xmm0, xmmword ptr [rcx + 8*rdi + 48]
  3155  	mulpd	xmm0, xmm1
  3156  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
  3157  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm0
  3158  	add	rdi, 8                          # 8 elements (two groups) per iteration
  3159  	add	rax, 2
  3160  	jne	.LBB0_1005
  3161  	jmp	.LBB0_1006                      # continues outside this chunk
  3162  .LBB0_181:
        # Vector path for packed doubles: r8[i] = rcx[i] + rdx[i] (addpd), 8-byte elements.
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 elements; on overlap control goes to .LBB0_190 with esi = 0.
        # NOTE(review): .LBB0_190, .LBB0_184 and .LBB0_187 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3163  	lea	rsi, [r8 + 8*r10]               # rsi = end of the r8 range (r10 elements)
  3164  	lea	rax, [rdx + 8*r10]
  3165  	cmp	rax, r8
  3166  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3167  	lea	rax, [rcx + 8*r10]
  3168  	cmp	rsi, rdx
  3169  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3170  	cmp	rax, r8
  3171  	seta	al                              # al = rcx range ends above the r8 start
  3172  	cmp	rsi, rcx
  3173  	seta	dil                             # dil = r8 range ends above the rcx start
  3174  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  3175  	test	r9b, r11b
  3176  	jne	.LBB0_190                       # both set: r8 and rdx ranges overlap
  3177  # %bb.182:
  3178  	and	al, dil
  3179  	jne	.LBB0_190                       # both set: r8 and rcx ranges overlap
  3180  # %bb.183:
  3181  	mov	esi, r10d
  3182  	and	esi, -4                         # esi = r10d rounded down to a multiple of 4
  3183  	lea	rax, [rsi - 4]
  3184  	mov	r9, rax
  3185  	shr	r9, 2
  3186  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 groups of 4 elements
  3187  	test	rax, rax
  3188  	je	.LBB0_184                       # rsi == 4: one group only, skip the two-group loop
  3189  # %bb.185:
  3190  	mov	rax, r9
  3191  	and	rax, -2
  3192  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  3193  	xor	edi, edi                        # rdi = element index
  3194  .LBB0_186:                              # =>This Inner Loop Header: Depth=1
  3195  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
  3196  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  3197  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
  3198  	addpd	xmm2, xmm0
  3199  	movupd	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  3200  	addpd	xmm0, xmm1
  3201  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
  3202  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm0
  3203  	movupd	xmm0, xmmword ptr [rdx + 8*rdi + 32]
  3204  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  3205  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  3206  	addpd	xmm2, xmm0
  3207  	movupd	xmm0, xmmword ptr [rcx + 8*rdi + 48]
  3208  	addpd	xmm0, xmm1
  3209  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
  3210  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm0
  3211  	add	rdi, 8                          # 8 elements (two groups) per iteration
  3212  	add	rax, 2
  3213  	jne	.LBB0_186
  3214  	jmp	.LBB0_187                       # continues outside this chunk
  3215  .LBB0_527:
        # Vector path for packed doubles: r8[i] = rdx[i] - rcx[i] (subpd), 8-byte elements.
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 elements; on overlap control goes to .LBB0_536 with esi = 0.
        # NOTE(review): .LBB0_536, .LBB0_530 and .LBB0_533 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3216  	lea	rsi, [r8 + 8*r10]               # rsi = end of the r8 range (r10 elements)
  3217  	lea	rax, [rdx + 8*r10]
  3218  	cmp	rax, r8
  3219  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3220  	lea	rax, [rcx + 8*r10]
  3221  	cmp	rsi, rdx
  3222  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3223  	cmp	rax, r8
  3224  	seta	al                              # al = rcx range ends above the r8 start
  3225  	cmp	rsi, rcx
  3226  	seta	dil                             # dil = r8 range ends above the rcx start
  3227  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  3228  	test	r9b, r11b
  3229  	jne	.LBB0_536                       # both set: r8 and rdx ranges overlap
  3230  # %bb.528:
  3231  	and	al, dil
  3232  	jne	.LBB0_536                       # both set: r8 and rcx ranges overlap
  3233  # %bb.529:
  3234  	mov	esi, r10d
  3235  	and	esi, -4                         # esi = r10d rounded down to a multiple of 4
  3236  	lea	rax, [rsi - 4]
  3237  	mov	r9, rax
  3238  	shr	r9, 2
  3239  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 groups of 4 elements
  3240  	test	rax, rax
  3241  	je	.LBB0_530                       # rsi == 4: one group only, skip the two-group loop
  3242  # %bb.531:
  3243  	mov	rax, r9
  3244  	and	rax, -2
  3245  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  3246  	xor	edi, edi                        # rdi = element index
  3247  .LBB0_532:                              # =>This Inner Loop Header: Depth=1
  3248  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
  3249  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  3250  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
  3251  	subpd	xmm0, xmm2
  3252  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 16]
  3253  	subpd	xmm1, xmm2
  3254  	movupd	xmmword ptr [r8 + 8*rdi], xmm0
  3255  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm1
  3256  	movupd	xmm0, xmmword ptr [rdx + 8*rdi + 32]
  3257  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  3258  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  3259  	subpd	xmm0, xmm2
  3260  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 48]
  3261  	subpd	xmm1, xmm2
  3262  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm0
  3263  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm1
  3264  	add	rdi, 8                          # 8 elements (two groups) per iteration
  3265  	add	rax, 2
  3266  	jne	.LBB0_532
  3267  	jmp	.LBB0_533                       # continues outside this chunk
  3268  .LBB0_354:
        # Vector path for packed doubles: r8[i] = rcx[i] + rdx[i] (addpd), 8-byte elements.
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 elements; on overlap control goes to .LBB0_363 with esi = 0.
        # NOTE(review): .LBB0_363, .LBB0_357 and .LBB0_360 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3269  	lea	rsi, [r8 + 8*r10]               # rsi = end of the r8 range (r10 elements)
  3270  	lea	rax, [rdx + 8*r10]
  3271  	cmp	rax, r8
  3272  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3273  	lea	rax, [rcx + 8*r10]
  3274  	cmp	rsi, rdx
  3275  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3276  	cmp	rax, r8
  3277  	seta	al                              # al = rcx range ends above the r8 start
  3278  	cmp	rsi, rcx
  3279  	seta	dil                             # dil = r8 range ends above the rcx start
  3280  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  3281  	test	r9b, r11b
  3282  	jne	.LBB0_363                       # both set: r8 and rdx ranges overlap
  3283  # %bb.355:
  3284  	and	al, dil
  3285  	jne	.LBB0_363                       # both set: r8 and rcx ranges overlap
  3286  # %bb.356:
  3287  	mov	esi, r10d
  3288  	and	esi, -4                         # esi = r10d rounded down to a multiple of 4
  3289  	lea	rax, [rsi - 4]
  3290  	mov	r9, rax
  3291  	shr	r9, 2
  3292  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 groups of 4 elements
  3293  	test	rax, rax
  3294  	je	.LBB0_357                       # rsi == 4: one group only, skip the two-group loop
  3295  # %bb.358:
  3296  	mov	rax, r9
  3297  	and	rax, -2
  3298  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  3299  	xor	edi, edi                        # rdi = element index
  3300  .LBB0_359:                              # =>This Inner Loop Header: Depth=1
  3301  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
  3302  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  3303  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
  3304  	addpd	xmm2, xmm0
  3305  	movupd	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  3306  	addpd	xmm0, xmm1
  3307  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
  3308  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm0
  3309  	movupd	xmm0, xmmword ptr [rdx + 8*rdi + 32]
  3310  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  3311  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  3312  	addpd	xmm2, xmm0
  3313  	movupd	xmm0, xmmword ptr [rcx + 8*rdi + 48]
  3314  	addpd	xmm0, xmm1
  3315  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
  3316  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm0
  3317  	add	rdi, 8                          # 8 elements (two groups) per iteration
  3318  	add	rax, 2
  3319  	jne	.LBB0_359
  3320  	jmp	.LBB0_360                       # continues outside this chunk
  3321  .LBB0_700:
        # Vector path for packed doubles: r8[i] = rdx[i] - rcx[i] (subpd), 8-byte elements.
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 elements; on overlap control goes to .LBB0_709 with esi = 0.
        # NOTE(review): .LBB0_709, .LBB0_703 and .LBB0_706 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3322  	lea	rsi, [r8 + 8*r10]               # rsi = end of the r8 range (r10 elements)
  3323  	lea	rax, [rdx + 8*r10]
  3324  	cmp	rax, r8
  3325  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3326  	lea	rax, [rcx + 8*r10]
  3327  	cmp	rsi, rdx
  3328  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3329  	cmp	rax, r8
  3330  	seta	al                              # al = rcx range ends above the r8 start
  3331  	cmp	rsi, rcx
  3332  	seta	dil                             # dil = r8 range ends above the rcx start
  3333  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  3334  	test	r9b, r11b
  3335  	jne	.LBB0_709                       # both set: r8 and rdx ranges overlap
  3336  # %bb.701:
  3337  	and	al, dil
  3338  	jne	.LBB0_709                       # both set: r8 and rcx ranges overlap
  3339  # %bb.702:
  3340  	mov	esi, r10d
  3341  	and	esi, -4                         # esi = r10d rounded down to a multiple of 4
  3342  	lea	rax, [rsi - 4]
  3343  	mov	r9, rax
  3344  	shr	r9, 2
  3345  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 groups of 4 elements
  3346  	test	rax, rax
  3347  	je	.LBB0_703                       # rsi == 4: one group only, skip the two-group loop
  3348  # %bb.704:
  3349  	mov	rax, r9
  3350  	and	rax, -2
  3351  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  3352  	xor	edi, edi                        # rdi = element index
  3353  .LBB0_705:                              # =>This Inner Loop Header: Depth=1
  3354  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
  3355  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  3356  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
  3357  	subpd	xmm0, xmm2
  3358  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 16]
  3359  	subpd	xmm1, xmm2
  3360  	movupd	xmmword ptr [r8 + 8*rdi], xmm0
  3361  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm1
  3362  	movupd	xmm0, xmmword ptr [rdx + 8*rdi + 32]
  3363  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  3364  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  3365  	subpd	xmm0, xmm2
  3366  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 48]
  3367  	subpd	xmm1, xmm2
  3368  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm0
  3369  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm1
  3370  	add	rdi, 8                          # 8 elements (two groups) per iteration
  3371  	add	rax, 2
  3372  	jne	.LBB0_705
  3373  	jmp	.LBB0_706                       # continues outside this chunk
  3374  .LBB0_747:
        # Vector path for 1-byte elements: r8[i] = low byte of rcx[i] * rdx[i].
        # There is no packed byte multiply here, so each 16-byte vector is widened to two
        # vectors of 16-bit words, multiplied with pmullw, masked to the low byte with
        # .LCPI0_0 (0x00ff per word) and repacked with packuswb.
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 bytes; on overlap control goes to .LBB0_756 with edi = 0.
        # NOTE(review): .LBB0_756, .LBB0_750 and .LBB0_753 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3375  	lea	rsi, [r8 + r10]                 # rsi = end of the r8 range (r10 bytes)
  3376  	lea	rax, [rdx + r10]
  3377  	cmp	rax, r8
  3378  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3379  	lea	rax, [rcx + r10]
  3380  	cmp	rsi, rdx
  3381  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3382  	cmp	rax, r8
  3383  	seta	al                              # al = rcx range ends above the r8 start
  3384  	cmp	rsi, rcx
  3385  	seta	sil                             # sil = r8 range ends above the rcx start
  3386  	xor	edi, edi                        # edi = 0 if either overlap branch is taken
  3387  	test	r9b, r11b
  3388  	jne	.LBB0_756                       # both set: r8 and rdx ranges overlap
  3389  # %bb.748:
  3390  	and	al, sil
  3391  	jne	.LBB0_756                       # both set: r8 and rcx ranges overlap
  3392  # %bb.749:
  3393  	mov	edi, r10d
  3394  	and	edi, -32                        # edi = r10d rounded down to a multiple of 32
  3395  	lea	rax, [rdi - 32]
  3396  	mov	r9, rax
  3397  	shr	r9, 5
  3398  	add	r9, 1                           # r9 = (rdi - 32) / 32 + 1 groups of 32 bytes
  3399  	test	rax, rax
  3400  	je	.LBB0_750                       # rdi == 32: one group only, skip the two-group loop
  3401  # %bb.751:
  3402  	mov	rsi, r9
  3403  	and	rsi, -2
  3404  	neg	rsi                             # rsi = -(r9 rounded down to even); counts up to 0 by 2
  3405  	xor	eax, eax                        # rax = byte index
  3406  	movdqa	xmm0, xmmword ptr [rip + .LCPI0_0] # xmm0 = [255,255,255,255,255,255,255,255]
  3407  .LBB0_752:                              # =>This Inner Loop Header: Depth=1
  3408  	movdqu	xmm1, xmmword ptr [rdx + rax]
  3409  	movdqu	xmm2, xmmword ptr [rdx + rax + 16]
  3410  	movdqu	xmm3, xmmword ptr [rcx + rax]
  3411  	movdqu	xmm4, xmmword ptr [rcx + rax + 16]
  3412  	pmovzxbw	xmm5, xmm1                      # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
  3413  	punpckhbw	xmm1, xmm1              # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3414  	pmovzxbw	xmm6, xmm3                      # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  3415  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3416  	pmullw	xmm3, xmm1                      # high 8 bytes: word products; the duplicated high byte does not affect the low byte
  3417  	pand	xmm3, xmm0                      # keep only the low byte of each word
  3418  	pmullw	xmm6, xmm5                      # low 8 bytes: word products
  3419  	pand	xmm6, xmm0
  3420  	packuswb	xmm6, xmm3              # repack 16 masked words into 16 bytes (values <= 255, so no saturation)
  3421  	pmovzxbw	xmm1, xmm2                      # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
  3422  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3423  	pmovzxbw	xmm3, xmm4                      # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
  3424  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3425  	pmullw	xmm4, xmm2
  3426  	pand	xmm4, xmm0
  3427  	pmullw	xmm3, xmm1
  3428  	pand	xmm3, xmm0
  3429  	packuswb	xmm3, xmm4
  3430  	movdqu	xmmword ptr [r8 + rax], xmm6
  3431  	movdqu	xmmword ptr [r8 + rax + 16], xmm3
  3432  	movdqu	xmm1, xmmword ptr [rdx + rax + 32]
  3433  	movdqu	xmm2, xmmword ptr [rdx + rax + 48]
  3434  	movdqu	xmm3, xmmword ptr [rcx + rax + 32]
  3435  	movdqu	xmm4, xmmword ptr [rcx + rax + 48]
  3436  	pmovzxbw	xmm5, xmm1                      # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
  3437  	punpckhbw	xmm1, xmm1              # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3438  	pmovzxbw	xmm6, xmm3                      # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  3439  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3440  	pmullw	xmm3, xmm1
  3441  	pand	xmm3, xmm0
  3442  	pmullw	xmm6, xmm5
  3443  	pand	xmm6, xmm0
  3444  	packuswb	xmm6, xmm3
  3445  	pmovzxbw	xmm1, xmm2                      # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
  3446  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3447  	pmovzxbw	xmm3, xmm4                      # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
  3448  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3449  	pmullw	xmm4, xmm2
  3450  	pand	xmm4, xmm0
  3451  	pmullw	xmm3, xmm1
  3452  	pand	xmm3, xmm0
  3453  	packuswb	xmm3, xmm4
  3454  	movdqu	xmmword ptr [r8 + rax + 32], xmm6
  3455  	movdqu	xmmword ptr [r8 + rax + 48], xmm3
  3456  	add	rax, 64                         # 64 bytes (two groups) per iteration
  3457  	add	rsi, 2
  3458  	jne	.LBB0_752
  3459  	jmp	.LBB0_753                       # continues outside this chunk
  3460  .LBB0_897:
        # Vector path for 1-byte elements: r8[i] = low byte of rcx[i] * rdx[i].
        # Each 16-byte vector is widened to two vectors of 16-bit words, multiplied with
        # pmullw, masked to the low byte with .LCPI0_0 (0x00ff per word) and repacked
        # with packuswb.
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 bytes; on overlap control goes to .LBB0_906 with edi = 0.
        # NOTE(review): .LBB0_906, .LBB0_900 and .LBB0_903 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3461  	lea	rsi, [r8 + r10]                 # rsi = end of the r8 range (r10 bytes)
  3462  	lea	rax, [rdx + r10]
  3463  	cmp	rax, r8
  3464  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3465  	lea	rax, [rcx + r10]
  3466  	cmp	rsi, rdx
  3467  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3468  	cmp	rax, r8
  3469  	seta	al                              # al = rcx range ends above the r8 start
  3470  	cmp	rsi, rcx
  3471  	seta	sil                             # sil = r8 range ends above the rcx start
  3472  	xor	edi, edi                        # edi = 0 if either overlap branch is taken
  3473  	test	r9b, r11b
  3474  	jne	.LBB0_906                       # both set: r8 and rdx ranges overlap
  3475  # %bb.898:
  3476  	and	al, sil
  3477  	jne	.LBB0_906                       # both set: r8 and rcx ranges overlap
  3478  # %bb.899:
  3479  	mov	edi, r10d
  3480  	and	edi, -32                        # edi = r10d rounded down to a multiple of 32
  3481  	lea	rax, [rdi - 32]
  3482  	mov	r9, rax
  3483  	shr	r9, 5
  3484  	add	r9, 1                           # r9 = (rdi - 32) / 32 + 1 groups of 32 bytes
  3485  	test	rax, rax
  3486  	je	.LBB0_900                       # rdi == 32: one group only, skip the two-group loop
  3487  # %bb.901:
  3488  	mov	rsi, r9
  3489  	and	rsi, -2
  3490  	neg	rsi                             # rsi = -(r9 rounded down to even); counts up to 0 by 2
  3491  	xor	eax, eax                        # rax = byte index
  3492  	movdqa	xmm0, xmmword ptr [rip + .LCPI0_0] # xmm0 = [255,255,255,255,255,255,255,255]
  3493  .LBB0_902:                              # =>This Inner Loop Header: Depth=1
  3494  	movdqu	xmm1, xmmword ptr [rdx + rax]
  3495  	movdqu	xmm2, xmmword ptr [rdx + rax + 16]
  3496  	movdqu	xmm3, xmmword ptr [rcx + rax]
  3497  	movdqu	xmm4, xmmword ptr [rcx + rax + 16]
  3498  	pmovzxbw	xmm5, xmm1                      # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
  3499  	punpckhbw	xmm1, xmm1              # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3500  	pmovzxbw	xmm6, xmm3                      # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  3501  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3502  	pmullw	xmm3, xmm1                      # high 8 bytes: word products; the duplicated high byte does not affect the low byte
  3503  	pand	xmm3, xmm0                      # keep only the low byte of each word
  3504  	pmullw	xmm6, xmm5                      # low 8 bytes: word products
  3505  	pand	xmm6, xmm0
  3506  	packuswb	xmm6, xmm3              # repack 16 masked words into 16 bytes (values <= 255, so no saturation)
  3507  	pmovzxbw	xmm1, xmm2                      # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
  3508  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3509  	pmovzxbw	xmm3, xmm4                      # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
  3510  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3511  	pmullw	xmm4, xmm2
  3512  	pand	xmm4, xmm0
  3513  	pmullw	xmm3, xmm1
  3514  	pand	xmm3, xmm0
  3515  	packuswb	xmm3, xmm4
  3516  	movdqu	xmmword ptr [r8 + rax], xmm6
  3517  	movdqu	xmmword ptr [r8 + rax + 16], xmm3
  3518  	movdqu	xmm1, xmmword ptr [rdx + rax + 32]
  3519  	movdqu	xmm2, xmmword ptr [rdx + rax + 48]
  3520  	movdqu	xmm3, xmmword ptr [rcx + rax + 32]
  3521  	movdqu	xmm4, xmmword ptr [rcx + rax + 48]
  3522  	pmovzxbw	xmm5, xmm1                      # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
  3523  	punpckhbw	xmm1, xmm1              # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3524  	pmovzxbw	xmm6, xmm3                      # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  3525  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3526  	pmullw	xmm3, xmm1
  3527  	pand	xmm3, xmm0
  3528  	pmullw	xmm6, xmm5
  3529  	pand	xmm6, xmm0
  3530  	packuswb	xmm6, xmm3
  3531  	pmovzxbw	xmm1, xmm2                      # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
  3532  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3533  	pmovzxbw	xmm3, xmm4                      # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
  3534  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  3535  	pmullw	xmm4, xmm2
  3536  	pand	xmm4, xmm0
  3537  	pmullw	xmm3, xmm1
  3538  	pand	xmm3, xmm0
  3539  	packuswb	xmm3, xmm4
  3540  	movdqu	xmmword ptr [r8 + rax + 32], xmm6
  3541  	movdqu	xmmword ptr [r8 + rax + 48], xmm3
  3542  	add	rax, 64                         # 64 bytes (two groups) per iteration
  3543  	add	rsi, 2
  3544  	jne	.LBB0_902
  3545  	jmp	.LBB0_903                       # continues outside this chunk
  3546  .LBB0_62:
        # Vector path for 1-byte elements: r8[i] = rcx[i] + rdx[i] (paddb, wraps modulo 256).
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 bytes; on overlap control goes to .LBB0_71 with esi = 0.
        # NOTE(review): .LBB0_71, .LBB0_65 and .LBB0_68 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3547  	lea	rsi, [r8 + r10]                 # rsi = end of the r8 range (r10 bytes)
  3548  	lea	rax, [rdx + r10]
  3549  	cmp	rax, r8
  3550  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3551  	lea	rax, [rcx + r10]
  3552  	cmp	rsi, rdx
  3553  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3554  	cmp	rax, r8
  3555  	seta	al                              # al = rcx range ends above the r8 start
  3556  	cmp	rsi, rcx
  3557  	seta	dil                             # dil = r8 range ends above the rcx start
  3558  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  3559  	test	r9b, r11b
  3560  	jne	.LBB0_71                        # both set: r8 and rdx ranges overlap
  3561  # %bb.63:
  3562  	and	al, dil
  3563  	jne	.LBB0_71                        # both set: r8 and rcx ranges overlap
  3564  # %bb.64:
  3565  	mov	esi, r10d
  3566  	and	esi, -32                        # esi = r10d rounded down to a multiple of 32
  3567  	lea	rax, [rsi - 32]
  3568  	mov	r9, rax
  3569  	shr	r9, 5
  3570  	add	r9, 1                           # r9 = (rsi - 32) / 32 + 1 groups of 32 bytes
  3571  	test	rax, rax
  3572  	je	.LBB0_65                        # rsi == 32: one group only, skip the two-group loop
  3573  # %bb.66:
  3574  	mov	rax, r9
  3575  	and	rax, -2
  3576  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  3577  	xor	edi, edi                        # rdi = byte index
  3578  .LBB0_67:                               # =>This Inner Loop Header: Depth=1
  3579  	movdqu	xmm0, xmmword ptr [rdx + rdi]
  3580  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  3581  	movdqu	xmm2, xmmword ptr [rcx + rdi]
  3582  	paddb	xmm2, xmm0
  3583  	movdqu	xmm0, xmmword ptr [rcx + rdi + 16]
  3584  	paddb	xmm0, xmm1
  3585  	movdqu	xmmword ptr [r8 + rdi], xmm2
  3586  	movdqu	xmmword ptr [r8 + rdi + 16], xmm0
  3587  	movdqu	xmm0, xmmword ptr [rdx + rdi + 32]
  3588  	movdqu	xmm1, xmmword ptr [rdx + rdi + 48]
  3589  	movdqu	xmm2, xmmword ptr [rcx + rdi + 32]
  3590  	paddb	xmm2, xmm0
  3591  	movdqu	xmm0, xmmword ptr [rcx + rdi + 48]
  3592  	paddb	xmm0, xmm1
  3593  	movdqu	xmmword ptr [r8 + rdi + 32], xmm2
  3594  	movdqu	xmmword ptr [r8 + rdi + 48], xmm0
  3595  	add	rdi, 64                         # 64 bytes (two groups) per iteration
  3596  	add	rax, 2
  3597  	jne	.LBB0_67
  3598  	jmp	.LBB0_68                        # continues outside this chunk
  3599  .LBB0_408:
        # Vector path for 1-byte elements: r8[i] = rdx[i] - rcx[i] (psubb, wraps modulo 256).
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 bytes; on overlap control goes to .LBB0_417 with esi = 0.
        # NOTE(review): .LBB0_417, .LBB0_411 and .LBB0_414 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3600  	lea	rsi, [r8 + r10]                 # rsi = end of the r8 range (r10 bytes)
  3601  	lea	rax, [rdx + r10]
  3602  	cmp	rax, r8
  3603  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3604  	lea	rax, [rcx + r10]
  3605  	cmp	rsi, rdx
  3606  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3607  	cmp	rax, r8
  3608  	seta	al                              # al = rcx range ends above the r8 start
  3609  	cmp	rsi, rcx
  3610  	seta	dil                             # dil = r8 range ends above the rcx start
  3611  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  3612  	test	r9b, r11b
  3613  	jne	.LBB0_417                       # both set: r8 and rdx ranges overlap
  3614  # %bb.409:
  3615  	and	al, dil
  3616  	jne	.LBB0_417                       # both set: r8 and rcx ranges overlap
  3617  # %bb.410:
  3618  	mov	esi, r10d
  3619  	and	esi, -32                        # esi = r10d rounded down to a multiple of 32
  3620  	lea	rax, [rsi - 32]
  3621  	mov	r9, rax
  3622  	shr	r9, 5
  3623  	add	r9, 1                           # r9 = (rsi - 32) / 32 + 1 groups of 32 bytes
  3624  	test	rax, rax
  3625  	je	.LBB0_411                       # rsi == 32: one group only, skip the two-group loop
  3626  # %bb.412:
  3627  	mov	rax, r9
  3628  	and	rax, -2
  3629  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  3630  	xor	edi, edi                        # rdi = byte index
  3631  .LBB0_413:                              # =>This Inner Loop Header: Depth=1
  3632  	movdqu	xmm0, xmmword ptr [rdx + rdi]
  3633  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  3634  	movdqu	xmm2, xmmword ptr [rcx + rdi]
  3635  	psubb	xmm0, xmm2
  3636  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
  3637  	psubb	xmm1, xmm2
  3638  	movdqu	xmmword ptr [r8 + rdi], xmm0
  3639  	movdqu	xmmword ptr [r8 + rdi + 16], xmm1
  3640  	movdqu	xmm0, xmmword ptr [rdx + rdi + 32]
  3641  	movdqu	xmm1, xmmword ptr [rdx + rdi + 48]
  3642  	movdqu	xmm2, xmmword ptr [rcx + rdi + 32]
  3643  	psubb	xmm0, xmm2
  3644  	movdqu	xmm2, xmmword ptr [rcx + rdi + 48]
  3645  	psubb	xmm1, xmm2
  3646  	movdqu	xmmword ptr [r8 + rdi + 32], xmm0
  3647  	movdqu	xmmword ptr [r8 + rdi + 48], xmm1
  3648  	add	rdi, 64                         # 64 bytes (two groups) per iteration
  3649  	add	rax, 2
  3650  	jne	.LBB0_413
  3651  	jmp	.LBB0_414                       # continues outside this chunk
  3652  .LBB0_235:
        # Vector path for 1-byte elements: r8[i] = rcx[i] + rdx[i] (paddb, wraps modulo 256).
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 bytes; on overlap control goes to .LBB0_244 with esi = 0.
        # NOTE(review): .LBB0_244, .LBB0_238 and .LBB0_241 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3653  	lea	rsi, [r8 + r10]                 # rsi = end of the r8 range (r10 bytes)
  3654  	lea	rax, [rdx + r10]
  3655  	cmp	rax, r8
  3656  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3657  	lea	rax, [rcx + r10]
  3658  	cmp	rsi, rdx
  3659  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3660  	cmp	rax, r8
  3661  	seta	al                              # al = rcx range ends above the r8 start
  3662  	cmp	rsi, rcx
  3663  	seta	dil                             # dil = r8 range ends above the rcx start
  3664  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  3665  	test	r9b, r11b
  3666  	jne	.LBB0_244                       # both set: r8 and rdx ranges overlap
  3667  # %bb.236:
  3668  	and	al, dil
  3669  	jne	.LBB0_244                       # both set: r8 and rcx ranges overlap
  3670  # %bb.237:
  3671  	mov	esi, r10d
  3672  	and	esi, -32                        # esi = r10d rounded down to a multiple of 32
  3673  	lea	rax, [rsi - 32]
  3674  	mov	r9, rax
  3675  	shr	r9, 5
  3676  	add	r9, 1                           # r9 = (rsi - 32) / 32 + 1 groups of 32 bytes
  3677  	test	rax, rax
  3678  	je	.LBB0_238                       # rsi == 32: one group only, skip the two-group loop
  3679  # %bb.239:
  3680  	mov	rax, r9
  3681  	and	rax, -2
  3682  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  3683  	xor	edi, edi                        # rdi = byte index
  3684  .LBB0_240:                              # =>This Inner Loop Header: Depth=1
  3685  	movdqu	xmm0, xmmword ptr [rdx + rdi]
  3686  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  3687  	movdqu	xmm2, xmmword ptr [rcx + rdi]
  3688  	paddb	xmm2, xmm0
  3689  	movdqu	xmm0, xmmword ptr [rcx + rdi + 16]
  3690  	paddb	xmm0, xmm1
  3691  	movdqu	xmmword ptr [r8 + rdi], xmm2
  3692  	movdqu	xmmword ptr [r8 + rdi + 16], xmm0
  3693  	movdqu	xmm0, xmmword ptr [rdx + rdi + 32]
  3694  	movdqu	xmm1, xmmword ptr [rdx + rdi + 48]
  3695  	movdqu	xmm2, xmmword ptr [rcx + rdi + 32]
  3696  	paddb	xmm2, xmm0
  3697  	movdqu	xmm0, xmmword ptr [rcx + rdi + 48]
  3698  	paddb	xmm0, xmm1
  3699  	movdqu	xmmword ptr [r8 + rdi + 32], xmm2
  3700  	movdqu	xmmword ptr [r8 + rdi + 48], xmm0
  3701  	add	rdi, 64                         # 64 bytes (two groups) per iteration
  3702  	add	rax, 2
  3703  	jne	.LBB0_240
  3704  	jmp	.LBB0_241                       # continues outside this chunk
  3705  .LBB0_581:
        # Vector path for 1-byte elements: r8[i] = rdx[i] - rcx[i] (psubb, wraps modulo 256).
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 bytes; on overlap control goes to .LBB0_590 with esi = 0.
        # NOTE(review): .LBB0_590, .LBB0_584 and .LBB0_587 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3706  	lea	rsi, [r8 + r10]                 # rsi = end of the r8 range (r10 bytes)
  3707  	lea	rax, [rdx + r10]
  3708  	cmp	rax, r8
  3709  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3710  	lea	rax, [rcx + r10]
  3711  	cmp	rsi, rdx
  3712  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3713  	cmp	rax, r8
  3714  	seta	al                              # al = rcx range ends above the r8 start
  3715  	cmp	rsi, rcx
  3716  	seta	dil                             # dil = r8 range ends above the rcx start
  3717  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  3718  	test	r9b, r11b
  3719  	jne	.LBB0_590                       # both set: r8 and rdx ranges overlap
  3720  # %bb.582:
  3721  	and	al, dil
  3722  	jne	.LBB0_590                       # both set: r8 and rcx ranges overlap
  3723  # %bb.583:
  3724  	mov	esi, r10d
  3725  	and	esi, -32                        # esi = r10d rounded down to a multiple of 32
  3726  	lea	rax, [rsi - 32]
  3727  	mov	r9, rax
  3728  	shr	r9, 5
  3729  	add	r9, 1                           # r9 = (rsi - 32) / 32 + 1 groups of 32 bytes
  3730  	test	rax, rax
  3731  	je	.LBB0_584                       # rsi == 32: one group only, skip the two-group loop
  3732  # %bb.585:
  3733  	mov	rax, r9
  3734  	and	rax, -2
  3735  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  3736  	xor	edi, edi                        # rdi = byte index
  3737  .LBB0_586:                              # =>This Inner Loop Header: Depth=1
  3738  	movdqu	xmm0, xmmword ptr [rdx + rdi]
  3739  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  3740  	movdqu	xmm2, xmmword ptr [rcx + rdi]
  3741  	psubb	xmm0, xmm2
  3742  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
  3743  	psubb	xmm1, xmm2
  3744  	movdqu	xmmword ptr [r8 + rdi], xmm0
  3745  	movdqu	xmmword ptr [r8 + rdi + 16], xmm1
  3746  	movdqu	xmm0, xmmword ptr [rdx + rdi + 32]
  3747  	movdqu	xmm1, xmmword ptr [rdx + rdi + 48]
  3748  	movdqu	xmm2, xmmword ptr [rcx + rdi + 32]
  3749  	psubb	xmm0, xmm2
  3750  	movdqu	xmm2, xmmword ptr [rcx + rdi + 48]
  3751  	psubb	xmm1, xmm2
  3752  	movdqu	xmmword ptr [r8 + rdi + 32], xmm0
  3753  	movdqu	xmmword ptr [r8 + rdi + 48], xmm1
  3754  	add	rdi, 64                         # 64 bytes (two groups) per iteration
  3755  	add	rax, 2
  3756  	jne	.LBB0_586
  3757  	jmp	.LBB0_587                       # continues outside this chunk
  3758  .LBB0_821:
        # Scalar 64-bit multiply: r8[i] = rcx[i] * rdx[i], keeping the low 64 bits (imul).
        # Main loop handles 4 elements per iteration; .LBB0_825 then handles r9 leftover
        # elements one at a time.
        # NOTE(review): the values of esi and r9 on entry are set outside this chunk;
        # the main loop only terminates on rdi == rsi, so it assumes the masked esi is a
        # non-zero multiple of 4 -- confirm against the entry path.
        # NOTE(review): .LBB0_1013 is outside this chunk -- presumably the function's
        # common exit; confirm.
  3759  	and	esi, -4                         # loop bound: esi rounded down to a multiple of 4
  3760  	xor	edi, edi                        # rdi = element index
  3761  .LBB0_822:                              # =>This Inner Loop Header: Depth=1
  3762  	mov	rax, qword ptr [rcx + 8*rdi]
  3763  	imul	rax, qword ptr [rdx + 8*rdi]
  3764  	mov	qword ptr [r8 + 8*rdi], rax
  3765  	mov	rax, qword ptr [rcx + 8*rdi + 8]
  3766  	imul	rax, qword ptr [rdx + 8*rdi + 8]
  3767  	mov	qword ptr [r8 + 8*rdi + 8], rax
  3768  	mov	rax, qword ptr [rcx + 8*rdi + 16]
  3769  	imul	rax, qword ptr [rdx + 8*rdi + 16]
  3770  	mov	qword ptr [r8 + 8*rdi + 16], rax
  3771  	mov	rax, qword ptr [rcx + 8*rdi + 24]
  3772  	imul	rax, qword ptr [rdx + 8*rdi + 24]
  3773  	mov	qword ptr [r8 + 8*rdi + 24], rax
  3774  	add	rdi, 4
  3775  	cmp	rsi, rdi
  3776  	jne	.LBB0_822
  3777  .LBB0_823:
  3778  	test	r9, r9                          # r9 = number of leftover elements
  3779  	je	.LBB0_1013
  3780  # %bb.824:
  3781  	lea	rsi, [r8 + 8*rdi]               # rebase all three pointers past the rdi elements already done
  3782  	lea	rcx, [rcx + 8*rdi]
  3783  	lea	rdx, [rdx + 8*rdi]
  3784  	xor	edi, edi                        # rdi = index within the leftover elements
  3785  .LBB0_825:                              # =>This Inner Loop Header: Depth=1
  3786  	mov	rax, qword ptr [rcx + 8*rdi]
  3787  	imul	rax, qword ptr [rdx + 8*rdi]
  3788  	mov	qword ptr [rsi + 8*rdi], rax
  3789  	add	rdi, 1
  3790  	cmp	r9, rdi
  3791  	jne	.LBB0_825
  3792  	jmp	.LBB0_1013
  3793  .LBB0_971:
        # Scalar 64-bit multiply: r8[i] = rcx[i] * rdx[i], keeping the low 64 bits (imul).
        # Main loop handles 4 elements per iteration; .LBB0_975 then handles r9 leftover
        # elements one at a time.
        # NOTE(review): the values of esi and r9 on entry are set outside this chunk;
        # the main loop only terminates on rdi == rsi, so it assumes the masked esi is a
        # non-zero multiple of 4 -- confirm against the entry path.
        # NOTE(review): .LBB0_1013 is outside this chunk -- presumably the function's
        # common exit; confirm.
  3794  	and	esi, -4                         # loop bound: esi rounded down to a multiple of 4
  3795  	xor	edi, edi                        # rdi = element index
  3796  .LBB0_972:                              # =>This Inner Loop Header: Depth=1
  3797  	mov	rax, qword ptr [rcx + 8*rdi]
  3798  	imul	rax, qword ptr [rdx + 8*rdi]
  3799  	mov	qword ptr [r8 + 8*rdi], rax
  3800  	mov	rax, qword ptr [rcx + 8*rdi + 8]
  3801  	imul	rax, qword ptr [rdx + 8*rdi + 8]
  3802  	mov	qword ptr [r8 + 8*rdi + 8], rax
  3803  	mov	rax, qword ptr [rcx + 8*rdi + 16]
  3804  	imul	rax, qword ptr [rdx + 8*rdi + 16]
  3805  	mov	qword ptr [r8 + 8*rdi + 16], rax
  3806  	mov	rax, qword ptr [rcx + 8*rdi + 24]
  3807  	imul	rax, qword ptr [rdx + 8*rdi + 24]
  3808  	mov	qword ptr [r8 + 8*rdi + 24], rax
  3809  	add	rdi, 4
  3810  	cmp	rsi, rdi
  3811  	jne	.LBB0_972
  3812  .LBB0_973:
  3813  	test	r9, r9                          # r9 = number of leftover elements
  3814  	je	.LBB0_1013
  3815  # %bb.974:
  3816  	lea	rsi, [r8 + 8*rdi]               # rebase all three pointers past the rdi elements already done
  3817  	lea	rcx, [rcx + 8*rdi]
  3818  	lea	rdx, [rdx + 8*rdi]
  3819  	xor	edi, edi                        # rdi = index within the leftover elements
  3820  .LBB0_975:                              # =>This Inner Loop Header: Depth=1
  3821  	mov	rax, qword ptr [rcx + 8*rdi]
  3822  	imul	rax, qword ptr [rdx + 8*rdi]
  3823  	mov	qword ptr [rsi + 8*rdi], rax
  3824  	add	rdi, 1
  3825  	cmp	r9, rdi
  3826  	jne	.LBB0_975
  3827  	jmp	.LBB0_1013
  3828  .LBB0_136:
        # Vector path for 64-bit integers: r8[i] = rcx[i] + rdx[i] (paddq, wraps on overflow).
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 elements; on overlap control goes to .LBB0_145 with esi = 0.
        # NOTE(review): .LBB0_145, .LBB0_139 and .LBB0_142 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3829  	lea	rsi, [r8 + 8*r10]               # rsi = end of the r8 range (r10 elements)
  3830  	lea	rax, [rdx + 8*r10]
  3831  	cmp	rax, r8
  3832  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3833  	lea	rax, [rcx + 8*r10]
  3834  	cmp	rsi, rdx
  3835  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3836  	cmp	rax, r8
  3837  	seta	al                              # al = rcx range ends above the r8 start
  3838  	cmp	rsi, rcx
  3839  	seta	dil                             # dil = r8 range ends above the rcx start
  3840  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  3841  	test	r9b, r11b
  3842  	jne	.LBB0_145                       # both set: r8 and rdx ranges overlap
  3843  # %bb.137:
  3844  	and	al, dil
  3845  	jne	.LBB0_145                       # both set: r8 and rcx ranges overlap
  3846  # %bb.138:
  3847  	mov	esi, r10d
  3848  	and	esi, -4                         # esi = r10d rounded down to a multiple of 4
  3849  	lea	rax, [rsi - 4]
  3850  	mov	r9, rax
  3851  	shr	r9, 2
  3852  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 groups of 4 elements
  3853  	test	rax, rax
  3854  	je	.LBB0_139                       # rsi == 4: one group only, skip the two-group loop
  3855  # %bb.140:
  3856  	mov	rax, r9
  3857  	and	rax, -2
  3858  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  3859  	xor	edi, edi                        # rdi = element index
  3860  .LBB0_141:                              # =>This Inner Loop Header: Depth=1
  3861  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
  3862  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  3863  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]
  3864  	paddq	xmm2, xmm0
  3865  	movdqu	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  3866  	paddq	xmm0, xmm1
  3867  	movdqu	xmmword ptr [r8 + 8*rdi], xmm2
  3868  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm0
  3869  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi + 32]
  3870  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  3871  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  3872  	paddq	xmm2, xmm0
  3873  	movdqu	xmm0, xmmword ptr [rcx + 8*rdi + 48]
  3874  	paddq	xmm0, xmm1
  3875  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm2
  3876  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm0
  3877  	add	rdi, 8                          # 8 elements (two groups) per iteration
  3878  	add	rax, 2
  3879  	jne	.LBB0_141
  3880  	jmp	.LBB0_142                       # continues outside this chunk
  3881  .LBB0_482:
        # Vector path for 64-bit integers: r8[i] = rdx[i] - rcx[i] (psubq, wraps on overflow).
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 elements; on overlap control goes to .LBB0_491 with esi = 0.
        # NOTE(review): .LBB0_491, .LBB0_485 and .LBB0_488 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3882  	lea	rsi, [r8 + 8*r10]               # rsi = end of the r8 range (r10 elements)
  3883  	lea	rax, [rdx + 8*r10]
  3884  	cmp	rax, r8
  3885  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3886  	lea	rax, [rcx + 8*r10]
  3887  	cmp	rsi, rdx
  3888  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3889  	cmp	rax, r8
  3890  	seta	al                              # al = rcx range ends above the r8 start
  3891  	cmp	rsi, rcx
  3892  	seta	dil                             # dil = r8 range ends above the rcx start
  3893  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  3894  	test	r9b, r11b
  3895  	jne	.LBB0_491                       # both set: r8 and rdx ranges overlap
  3896  # %bb.483:
  3897  	and	al, dil
  3898  	jne	.LBB0_491                       # both set: r8 and rcx ranges overlap
  3899  # %bb.484:
  3900  	mov	esi, r10d
  3901  	and	esi, -4                         # esi = r10d rounded down to a multiple of 4
  3902  	lea	rax, [rsi - 4]
  3903  	mov	r9, rax
  3904  	shr	r9, 2
  3905  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 groups of 4 elements
  3906  	test	rax, rax
  3907  	je	.LBB0_485                       # rsi == 4: one group only, skip the two-group loop
  3908  # %bb.486:
  3909  	mov	rax, r9
  3910  	and	rax, -2
  3911  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  3912  	xor	edi, edi                        # rdi = element index
  3913  .LBB0_487:                              # =>This Inner Loop Header: Depth=1
  3914  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
  3915  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  3916  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]
  3917  	psubq	xmm0, xmm2
  3918  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
  3919  	psubq	xmm1, xmm2
  3920  	movdqu	xmmword ptr [r8 + 8*rdi], xmm0
  3921  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm1
  3922  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi + 32]
  3923  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  3924  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  3925  	psubq	xmm0, xmm2
  3926  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 48]
  3927  	psubq	xmm1, xmm2
  3928  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm0
  3929  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm1
  3930  	add	rdi, 8                          # 8 elements (two groups) per iteration
  3931  	add	rax, 2
  3932  	jne	.LBB0_487
  3933  	jmp	.LBB0_488                       # continues outside this chunk
  3934  .LBB0_309:
        # Vector path for 64-bit integers: r8[i] = rcx[i] + rdx[i] (paddq, wraps on overflow).
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 elements; on overlap control goes to .LBB0_318 with esi = 0.
        # NOTE(review): .LBB0_318, .LBB0_312 and .LBB0_315 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3935  	lea	rsi, [r8 + 8*r10]               # rsi = end of the r8 range (r10 elements)
  3936  	lea	rax, [rdx + 8*r10]
  3937  	cmp	rax, r8
  3938  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3939  	lea	rax, [rcx + 8*r10]
  3940  	cmp	rsi, rdx
  3941  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3942  	cmp	rax, r8
  3943  	seta	al                              # al = rcx range ends above the r8 start
  3944  	cmp	rsi, rcx
  3945  	seta	dil                             # dil = r8 range ends above the rcx start
  3946  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  3947  	test	r9b, r11b
  3948  	jne	.LBB0_318                       # both set: r8 and rdx ranges overlap
  3949  # %bb.310:
  3950  	and	al, dil
  3951  	jne	.LBB0_318                       # both set: r8 and rcx ranges overlap
  3952  # %bb.311:
  3953  	mov	esi, r10d
  3954  	and	esi, -4                         # esi = r10d rounded down to a multiple of 4
  3955  	lea	rax, [rsi - 4]
  3956  	mov	r9, rax
  3957  	shr	r9, 2
  3958  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 groups of 4 elements
  3959  	test	rax, rax
  3960  	je	.LBB0_312                       # rsi == 4: one group only, skip the two-group loop
  3961  # %bb.313:
  3962  	mov	rax, r9
  3963  	and	rax, -2
  3964  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  3965  	xor	edi, edi                        # rdi = element index
  3966  .LBB0_314:                              # =>This Inner Loop Header: Depth=1
  3967  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
  3968  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  3969  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]
  3970  	paddq	xmm2, xmm0
  3971  	movdqu	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  3972  	paddq	xmm0, xmm1
  3973  	movdqu	xmmword ptr [r8 + 8*rdi], xmm2
  3974  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm0
  3975  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi + 32]
  3976  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  3977  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  3978  	paddq	xmm2, xmm0
  3979  	movdqu	xmm0, xmmword ptr [rcx + 8*rdi + 48]
  3980  	paddq	xmm0, xmm1
  3981  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm2
  3982  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm0
  3983  	add	rdi, 8                          # 8 elements (two groups) per iteration
  3984  	add	rax, 2
  3985  	jne	.LBB0_314
  3986  	jmp	.LBB0_315                       # continues outside this chunk
  3987  .LBB0_655:
        # Vector path for 64-bit integers: r8[i] = rdx[i] - rcx[i] (psubq, wraps on overflow).
        # A run-time check first rejects any overlap between the r8 range and the rdx or
        # rcx range over r10 elements; on overlap control goes to .LBB0_664 with esi = 0.
        # NOTE(review): .LBB0_664, .LBB0_658 and .LBB0_661 are outside this chunk --
        # presumably the scalar fallback and the tail handling; confirm.
  3988  	lea	rsi, [r8 + 8*r10]               # rsi = end of the r8 range (r10 elements)
  3989  	lea	rax, [rdx + 8*r10]
  3990  	cmp	rax, r8
  3991  	seta	r9b                             # r9b = rdx range ends above the r8 start
  3992  	lea	rax, [rcx + 8*r10]
  3993  	cmp	rsi, rdx
  3994  	seta	r11b                            # r11b = r8 range ends above the rdx start
  3995  	cmp	rax, r8
  3996  	seta	al                              # al = rcx range ends above the r8 start
  3997  	cmp	rsi, rcx
  3998  	seta	dil                             # dil = r8 range ends above the rcx start
  3999  	xor	esi, esi                        # esi = 0 if either overlap branch is taken
  4000  	test	r9b, r11b
  4001  	jne	.LBB0_664                       # both set: r8 and rdx ranges overlap
  4002  # %bb.656:
  4003  	and	al, dil
  4004  	jne	.LBB0_664                       # both set: r8 and rcx ranges overlap
  4005  # %bb.657:
  4006  	mov	esi, r10d
  4007  	and	esi, -4                         # esi = r10d rounded down to a multiple of 4
  4008  	lea	rax, [rsi - 4]
  4009  	mov	r9, rax
  4010  	shr	r9, 2
  4011  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 groups of 4 elements
  4012  	test	rax, rax
  4013  	je	.LBB0_658                       # rsi == 4: one group only, skip the two-group loop
  4014  # %bb.659:
  4015  	mov	rax, r9
  4016  	and	rax, -2
  4017  	neg	rax                             # rax = -(r9 rounded down to even); counts up to 0 by 2
  4018  	xor	edi, edi                        # rdi = element index
  4019  .LBB0_660:                              # =>This Inner Loop Header: Depth=1
  4020  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
  4021  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  4022  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]
  4023  	psubq	xmm0, xmm2
  4024  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
  4025  	psubq	xmm1, xmm2
  4026  	movdqu	xmmword ptr [r8 + 8*rdi], xmm0
  4027  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm1
  4028  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi + 32]
  4029  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  4030  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  4031  	psubq	xmm0, xmm2
  4032  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 48]
  4033  	psubq	xmm1, xmm2
  4034  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm0
  4035  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm1
  4036  	add	rdi, 8                          # 8 elements (two groups) per iteration
  4037  	add	rax, 2
  4038  	jne	.LBB0_660
  4039  	jmp	.LBB0_661                       # continues outside this chunk
  4040  .LBB0_763:                              # 16-bit lane multiply (low 16 bits kept), r8[i] = rcx[i] * rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4041  	lea	rsi, [r8 + 2*r10]       # rsi = r8 + 2*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4042  	lea	rax, [rdx + 2*r10]
  4043  	cmp	rax, r8
  4044  	seta	r9b                     # r9b = (rdx + 2*r10 > r8), unsigned
  4045  	lea	rax, [rcx + 2*r10]
  4046  	cmp	rsi, rdx
  4047  	seta	r11b                    # r11b = (r8 + 2*r10 > rdx), unsigned
  4048  	cmp	rax, r8
  4049  	seta	al                      # al = (rcx + 2*r10 > r8), unsigned
  4050  	cmp	rsi, rcx
  4051  	seta	dil                     # dil = (r8 + 2*r10 > rcx), unsigned
  4052  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4053  	test	r9b, r11b
  4054  	jne	.LBB0_772               # r8 and rdx ranges overlap: leave for .LBB0_772 (defined elsewhere in this file)
  4055  # %bb.764:
  4056  	and	al, dil
  4057  	jne	.LBB0_772               # r8 and rcx ranges overlap: same exit
  4058  # %bb.765:
  4059  	mov	esi, r10d
  4060  	and	esi, -16                # rsi = low 32 bits of r10, rounded down to a multiple of 16 (one step = two xmm vectors)
  4061  	lea	rax, [rsi - 16]
  4062  	mov	r9, rax
  4063  	shr	r9, 4
  4064  	add	r9, 1                   # r9 = (rsi - 16)/16 + 1 = number of 16-element steps
  4065  	test	rax, rax
  4066  	je	.LBB0_766               # rsi == 16 (single step): skip the loop, continue at .LBB0_766 (defined elsewhere in this file)
  4067  # %bb.767:
  4068  	mov	rax, r9
  4069  	and	rax, -2
  4070  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4071  	xor	edi, edi                # rdi = element index, starts at 0
  4072  .LBB0_768:                              # =>This Inner Loop Header: Depth=1
  4073  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi] # step 1 of the pass: byte offsets 0..31
  4074  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  4075  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]
  4076  	pmullw	xmm2, xmm0              # xmm2 = low 16 bits of rcx lanes * rdx lanes
  4077  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  4078  	pmullw	xmm0, xmm1
  4079  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2
  4080  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  4081  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4082  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
  4083  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 32]
  4084  	pmullw	xmm2, xmm0
  4085  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 48]
  4086  	pmullw	xmm0, xmm1
  4087  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm2
  4088  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm0
  4089  	add	rdi, 32                 # 32 elements (64 bytes) per pass
  4090  	add	rax, 2
  4091  	jne	.LBB0_768               # loop until rax reaches zero
  4092  	jmp	.LBB0_769               # continue at .LBB0_769 (defined elsewhere in this file)
  4093  .LBB0_779:                              # 16-bit lane multiply (low 16 bits kept), r8[i] = rcx[i] * rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4094  	lea	rsi, [r8 + 2*r10]       # rsi = r8 + 2*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4095  	lea	rax, [rdx + 2*r10]
  4096  	cmp	rax, r8
  4097  	seta	r9b                     # r9b = (rdx + 2*r10 > r8), unsigned
  4098  	lea	rax, [rcx + 2*r10]
  4099  	cmp	rsi, rdx
  4100  	seta	r11b                    # r11b = (r8 + 2*r10 > rdx), unsigned
  4101  	cmp	rax, r8
  4102  	seta	al                      # al = (rcx + 2*r10 > r8), unsigned
  4103  	cmp	rsi, rcx
  4104  	seta	dil                     # dil = (r8 + 2*r10 > rcx), unsigned
  4105  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4106  	test	r9b, r11b
  4107  	jne	.LBB0_788               # r8 and rdx ranges overlap: leave for .LBB0_788 (defined elsewhere in this file)
  4108  # %bb.780:
  4109  	and	al, dil
  4110  	jne	.LBB0_788               # r8 and rcx ranges overlap: same exit
  4111  # %bb.781:
  4112  	mov	esi, r10d
  4113  	and	esi, -16                # rsi = low 32 bits of r10, rounded down to a multiple of 16 (one step = two xmm vectors)
  4114  	lea	rax, [rsi - 16]
  4115  	mov	r9, rax
  4116  	shr	r9, 4
  4117  	add	r9, 1                   # r9 = (rsi - 16)/16 + 1 = number of 16-element steps
  4118  	test	rax, rax
  4119  	je	.LBB0_782               # rsi == 16 (single step): skip the loop, continue at .LBB0_782 (defined elsewhere in this file)
  4120  # %bb.783:
  4121  	mov	rax, r9
  4122  	and	rax, -2
  4123  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4124  	xor	edi, edi                # rdi = element index, starts at 0
  4125  .LBB0_784:                              # =>This Inner Loop Header: Depth=1
  4126  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi] # step 1 of the pass: byte offsets 0..31
  4127  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  4128  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]
  4129  	pmullw	xmm2, xmm0              # xmm2 = low 16 bits of rcx lanes * rdx lanes
  4130  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  4131  	pmullw	xmm0, xmm1
  4132  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2
  4133  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  4134  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4135  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
  4136  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 32]
  4137  	pmullw	xmm2, xmm0
  4138  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 48]
  4139  	pmullw	xmm0, xmm1
  4140  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm2
  4141  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm0
  4142  	add	rdi, 32                 # 32 elements (64 bytes) per pass
  4143  	add	rax, 2
  4144  	jne	.LBB0_784               # loop until rax reaches zero
  4145  	jmp	.LBB0_785               # continue at .LBB0_785 (defined elsewhere in this file)
  4146  .LBB0_913:                              # 16-bit lane multiply (low 16 bits kept), r8[i] = rcx[i] * rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4147  	lea	rsi, [r8 + 2*r10]       # rsi = r8 + 2*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4148  	lea	rax, [rdx + 2*r10]
  4149  	cmp	rax, r8
  4150  	seta	r9b                     # r9b = (rdx + 2*r10 > r8), unsigned
  4151  	lea	rax, [rcx + 2*r10]
  4152  	cmp	rsi, rdx
  4153  	seta	r11b                    # r11b = (r8 + 2*r10 > rdx), unsigned
  4154  	cmp	rax, r8
  4155  	seta	al                      # al = (rcx + 2*r10 > r8), unsigned
  4156  	cmp	rsi, rcx
  4157  	seta	dil                     # dil = (r8 + 2*r10 > rcx), unsigned
  4158  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4159  	test	r9b, r11b
  4160  	jne	.LBB0_922               # r8 and rdx ranges overlap: leave for .LBB0_922 (defined elsewhere in this file)
  4161  # %bb.914:
  4162  	and	al, dil
  4163  	jne	.LBB0_922               # r8 and rcx ranges overlap: same exit
  4164  # %bb.915:
  4165  	mov	esi, r10d
  4166  	and	esi, -16                # rsi = low 32 bits of r10, rounded down to a multiple of 16 (one step = two xmm vectors)
  4167  	lea	rax, [rsi - 16]
  4168  	mov	r9, rax
  4169  	shr	r9, 4
  4170  	add	r9, 1                   # r9 = (rsi - 16)/16 + 1 = number of 16-element steps
  4171  	test	rax, rax
  4172  	je	.LBB0_916               # rsi == 16 (single step): skip the loop, continue at .LBB0_916 (defined elsewhere in this file)
  4173  # %bb.917:
  4174  	mov	rax, r9
  4175  	and	rax, -2
  4176  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4177  	xor	edi, edi                # rdi = element index, starts at 0
  4178  .LBB0_918:                              # =>This Inner Loop Header: Depth=1
  4179  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi] # step 1 of the pass: byte offsets 0..31
  4180  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  4181  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]
  4182  	pmullw	xmm2, xmm0              # xmm2 = low 16 bits of rcx lanes * rdx lanes
  4183  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  4184  	pmullw	xmm0, xmm1
  4185  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2
  4186  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  4187  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4188  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
  4189  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 32]
  4190  	pmullw	xmm2, xmm0
  4191  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 48]
  4192  	pmullw	xmm0, xmm1
  4193  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm2
  4194  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm0
  4195  	add	rdi, 32                 # 32 elements (64 bytes) per pass
  4196  	add	rax, 2
  4197  	jne	.LBB0_918               # loop until rax reaches zero
  4198  	jmp	.LBB0_919               # continue at .LBB0_919 (defined elsewhere in this file)
  4199  .LBB0_929:                              # 16-bit lane multiply (low 16 bits kept), r8[i] = rcx[i] * rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4200  	lea	rsi, [r8 + 2*r10]       # rsi = r8 + 2*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4201  	lea	rax, [rdx + 2*r10]
  4202  	cmp	rax, r8
  4203  	seta	r9b                     # r9b = (rdx + 2*r10 > r8), unsigned
  4204  	lea	rax, [rcx + 2*r10]
  4205  	cmp	rsi, rdx
  4206  	seta	r11b                    # r11b = (r8 + 2*r10 > rdx), unsigned
  4207  	cmp	rax, r8
  4208  	seta	al                      # al = (rcx + 2*r10 > r8), unsigned
  4209  	cmp	rsi, rcx
  4210  	seta	dil                     # dil = (r8 + 2*r10 > rcx), unsigned
  4211  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4212  	test	r9b, r11b
  4213  	jne	.LBB0_938               # r8 and rdx ranges overlap: leave for .LBB0_938 (defined elsewhere in this file)
  4214  # %bb.930:
  4215  	and	al, dil
  4216  	jne	.LBB0_938               # r8 and rcx ranges overlap: same exit
  4217  # %bb.931:
  4218  	mov	esi, r10d
  4219  	and	esi, -16                # rsi = low 32 bits of r10, rounded down to a multiple of 16 (one step = two xmm vectors)
  4220  	lea	rax, [rsi - 16]
  4221  	mov	r9, rax
  4222  	shr	r9, 4
  4223  	add	r9, 1                   # r9 = (rsi - 16)/16 + 1 = number of 16-element steps
  4224  	test	rax, rax
  4225  	je	.LBB0_932               # rsi == 16 (single step): skip the loop, continue at .LBB0_932 (defined elsewhere in this file)
  4226  # %bb.933:
  4227  	mov	rax, r9
  4228  	and	rax, -2
  4229  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4230  	xor	edi, edi                # rdi = element index, starts at 0
  4231  .LBB0_934:                              # =>This Inner Loop Header: Depth=1
  4232  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi] # step 1 of the pass: byte offsets 0..31
  4233  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  4234  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]
  4235  	pmullw	xmm2, xmm0              # xmm2 = low 16 bits of rcx lanes * rdx lanes
  4236  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  4237  	pmullw	xmm0, xmm1
  4238  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2
  4239  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  4240  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4241  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
  4242  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 32]
  4243  	pmullw	xmm2, xmm0
  4244  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 48]
  4245  	pmullw	xmm0, xmm1
  4246  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm2
  4247  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm0
  4248  	add	rdi, 32                 # 32 elements (64 bytes) per pass
  4249  	add	rax, 2
  4250  	jne	.LBB0_934               # loop until rax reaches zero
  4251  	jmp	.LBB0_935               # continue at .LBB0_935 (defined elsewhere in this file)
  4252  .LBB0_78:                               # 16-bit lane add, r8[i] = rcx[i] + rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4253  	lea	rsi, [r8 + 2*r10]       # rsi = r8 + 2*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4254  	lea	rax, [rdx + 2*r10]
  4255  	cmp	rax, r8
  4256  	seta	r9b                     # r9b = (rdx + 2*r10 > r8), unsigned
  4257  	lea	rax, [rcx + 2*r10]
  4258  	cmp	rsi, rdx
  4259  	seta	r11b                    # r11b = (r8 + 2*r10 > rdx), unsigned
  4260  	cmp	rax, r8
  4261  	seta	al                      # al = (rcx + 2*r10 > r8), unsigned
  4262  	cmp	rsi, rcx
  4263  	seta	dil                     # dil = (r8 + 2*r10 > rcx), unsigned
  4264  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4265  	test	r9b, r11b
  4266  	jne	.LBB0_87                # r8 and rdx ranges overlap: leave for .LBB0_87 (defined elsewhere in this file)
  4267  # %bb.79:
  4268  	and	al, dil
  4269  	jne	.LBB0_87                # r8 and rcx ranges overlap: same exit
  4270  # %bb.80:
  4271  	mov	esi, r10d
  4272  	and	esi, -16                # rsi = low 32 bits of r10, rounded down to a multiple of 16 (one step = two xmm vectors)
  4273  	lea	rax, [rsi - 16]
  4274  	mov	r9, rax
  4275  	shr	r9, 4
  4276  	add	r9, 1                   # r9 = (rsi - 16)/16 + 1 = number of 16-element steps
  4277  	test	rax, rax
  4278  	je	.LBB0_81                # rsi == 16 (single step): skip the loop, continue at .LBB0_81 (defined elsewhere in this file)
  4279  # %bb.82:
  4280  	mov	rax, r9
  4281  	and	rax, -2
  4282  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4283  	xor	edi, edi                # rdi = element index, starts at 0
  4284  .LBB0_83:                               # =>This Inner Loop Header: Depth=1
  4285  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi] # step 1 of the pass: byte offsets 0..31
  4286  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  4287  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]
  4288  	paddw	xmm2, xmm0              # xmm2 = rcx lanes + rdx lanes
  4289  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  4290  	paddw	xmm0, xmm1
  4291  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2
  4292  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  4293  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4294  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
  4295  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 32]
  4296  	paddw	xmm2, xmm0
  4297  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 48]
  4298  	paddw	xmm0, xmm1
  4299  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm2
  4300  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm0
  4301  	add	rdi, 32                 # 32 elements (64 bytes) per pass
  4302  	add	rax, 2
  4303  	jne	.LBB0_83                # loop until rax reaches zero
  4304  	jmp	.LBB0_84                # continue at .LBB0_84 (defined elsewhere in this file)
  4305  .LBB0_94:                               # 16-bit lane add, r8[i] = rcx[i] + rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4306  	lea	rsi, [r8 + 2*r10]       # rsi = r8 + 2*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4307  	lea	rax, [rdx + 2*r10]
  4308  	cmp	rax, r8
  4309  	seta	r9b                     # r9b = (rdx + 2*r10 > r8), unsigned
  4310  	lea	rax, [rcx + 2*r10]
  4311  	cmp	rsi, rdx
  4312  	seta	r11b                    # r11b = (r8 + 2*r10 > rdx), unsigned
  4313  	cmp	rax, r8
  4314  	seta	al                      # al = (rcx + 2*r10 > r8), unsigned
  4315  	cmp	rsi, rcx
  4316  	seta	dil                     # dil = (r8 + 2*r10 > rcx), unsigned
  4317  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4318  	test	r9b, r11b
  4319  	jne	.LBB0_103               # r8 and rdx ranges overlap: leave for .LBB0_103 (defined elsewhere in this file)
  4320  # %bb.95:
  4321  	and	al, dil
  4322  	jne	.LBB0_103               # r8 and rcx ranges overlap: same exit
  4323  # %bb.96:
  4324  	mov	esi, r10d
  4325  	and	esi, -16                # rsi = low 32 bits of r10, rounded down to a multiple of 16 (one step = two xmm vectors)
  4326  	lea	rax, [rsi - 16]
  4327  	mov	r9, rax
  4328  	shr	r9, 4
  4329  	add	r9, 1                   # r9 = (rsi - 16)/16 + 1 = number of 16-element steps
  4330  	test	rax, rax
  4331  	je	.LBB0_97                # rsi == 16 (single step): skip the loop, continue at .LBB0_97 (defined elsewhere in this file)
  4332  # %bb.98:
  4333  	mov	rax, r9
  4334  	and	rax, -2
  4335  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4336  	xor	edi, edi                # rdi = element index, starts at 0
  4337  .LBB0_99:                               # =>This Inner Loop Header: Depth=1
  4338  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi] # step 1 of the pass: byte offsets 0..31
  4339  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  4340  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]
  4341  	paddw	xmm2, xmm0              # xmm2 = rcx lanes + rdx lanes
  4342  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  4343  	paddw	xmm0, xmm1
  4344  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2
  4345  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  4346  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4347  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
  4348  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 32]
  4349  	paddw	xmm2, xmm0
  4350  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 48]
  4351  	paddw	xmm0, xmm1
  4352  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm2
  4353  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm0
  4354  	add	rdi, 32                 # 32 elements (64 bytes) per pass
  4355  	add	rax, 2
  4356  	jne	.LBB0_99                # loop until rax reaches zero
  4357  	jmp	.LBB0_100               # continue at .LBB0_100 (defined elsewhere in this file)
  4358  .LBB0_424:                              # 16-bit lane subtract, r8[i] = rdx[i] - rcx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4359  	lea	rsi, [r8 + 2*r10]       # rsi = r8 + 2*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4360  	lea	rax, [rdx + 2*r10]
  4361  	cmp	rax, r8
  4362  	seta	r9b                     # r9b = (rdx + 2*r10 > r8), unsigned
  4363  	lea	rax, [rcx + 2*r10]
  4364  	cmp	rsi, rdx
  4365  	seta	r11b                    # r11b = (r8 + 2*r10 > rdx), unsigned
  4366  	cmp	rax, r8
  4367  	seta	al                      # al = (rcx + 2*r10 > r8), unsigned
  4368  	cmp	rsi, rcx
  4369  	seta	dil                     # dil = (r8 + 2*r10 > rcx), unsigned
  4370  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4371  	test	r9b, r11b
  4372  	jne	.LBB0_433               # r8 and rdx ranges overlap: leave for .LBB0_433 (defined elsewhere in this file)
  4373  # %bb.425:
  4374  	and	al, dil
  4375  	jne	.LBB0_433               # r8 and rcx ranges overlap: same exit
  4376  # %bb.426:
  4377  	mov	esi, r10d
  4378  	and	esi, -16                # rsi = low 32 bits of r10, rounded down to a multiple of 16 (one step = two xmm vectors)
  4379  	lea	rax, [rsi - 16]
  4380  	mov	r9, rax
  4381  	shr	r9, 4
  4382  	add	r9, 1                   # r9 = (rsi - 16)/16 + 1 = number of 16-element steps
  4383  	test	rax, rax
  4384  	je	.LBB0_427               # rsi == 16 (single step): skip the loop, continue at .LBB0_427 (defined elsewhere in this file)
  4385  # %bb.428:
  4386  	mov	rax, r9
  4387  	and	rax, -2
  4388  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4389  	xor	edi, edi                # rdi = element index, starts at 0
  4390  .LBB0_429:                              # =>This Inner Loop Header: Depth=1
  4391  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi] # step 1 of the pass: byte offsets 0..31
  4392  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  4393  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]
  4394  	psubw	xmm0, xmm2              # xmm0 = rdx lanes - rcx lanes
  4395  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
  4396  	psubw	xmm1, xmm2
  4397  	movdqu	xmmword ptr [r8 + 2*rdi], xmm0
  4398  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm1
  4399  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4400  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
  4401  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 32]
  4402  	psubw	xmm0, xmm2
  4403  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
  4404  	psubw	xmm1, xmm2
  4405  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm0
  4406  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm1
  4407  	add	rdi, 32                 # 32 elements (64 bytes) per pass
  4408  	add	rax, 2
  4409  	jne	.LBB0_429               # loop until rax reaches zero
  4410  	jmp	.LBB0_430               # continue at .LBB0_430 (defined elsewhere in this file)
  4411  .LBB0_440:                              # 16-bit lane subtract, r8[i] = rdx[i] - rcx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4412  	lea	rsi, [r8 + 2*r10]       # rsi = r8 + 2*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4413  	lea	rax, [rdx + 2*r10]
  4414  	cmp	rax, r8
  4415  	seta	r9b                     # r9b = (rdx + 2*r10 > r8), unsigned
  4416  	lea	rax, [rcx + 2*r10]
  4417  	cmp	rsi, rdx
  4418  	seta	r11b                    # r11b = (r8 + 2*r10 > rdx), unsigned
  4419  	cmp	rax, r8
  4420  	seta	al                      # al = (rcx + 2*r10 > r8), unsigned
  4421  	cmp	rsi, rcx
  4422  	seta	dil                     # dil = (r8 + 2*r10 > rcx), unsigned
  4423  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4424  	test	r9b, r11b
  4425  	jne	.LBB0_449               # r8 and rdx ranges overlap: leave for .LBB0_449 (defined elsewhere in this file)
  4426  # %bb.441:
  4427  	and	al, dil
  4428  	jne	.LBB0_449               # r8 and rcx ranges overlap: same exit
  4429  # %bb.442:
  4430  	mov	esi, r10d
  4431  	and	esi, -16                # rsi = low 32 bits of r10, rounded down to a multiple of 16 (one step = two xmm vectors)
  4432  	lea	rax, [rsi - 16]
  4433  	mov	r9, rax
  4434  	shr	r9, 4
  4435  	add	r9, 1                   # r9 = (rsi - 16)/16 + 1 = number of 16-element steps
  4436  	test	rax, rax
  4437  	je	.LBB0_443               # rsi == 16 (single step): skip the loop, continue at .LBB0_443 (defined elsewhere in this file)
  4438  # %bb.444:
  4439  	mov	rax, r9
  4440  	and	rax, -2
  4441  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4442  	xor	edi, edi                # rdi = element index, starts at 0
  4443  .LBB0_445:                              # =>This Inner Loop Header: Depth=1
  4444  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi] # step 1 of the pass: byte offsets 0..31
  4445  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  4446  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]
  4447  	psubw	xmm0, xmm2              # xmm0 = rdx lanes - rcx lanes
  4448  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
  4449  	psubw	xmm1, xmm2
  4450  	movdqu	xmmword ptr [r8 + 2*rdi], xmm0
  4451  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm1
  4452  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4453  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
  4454  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 32]
  4455  	psubw	xmm0, xmm2
  4456  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
  4457  	psubw	xmm1, xmm2
  4458  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm0
  4459  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm1
  4460  	add	rdi, 32                 # 32 elements (64 bytes) per pass
  4461  	add	rax, 2
  4462  	jne	.LBB0_445               # loop until rax reaches zero
  4463  	jmp	.LBB0_446               # continue at .LBB0_446 (defined elsewhere in this file)
  4464  .LBB0_251:                              # 16-bit lane add, r8[i] = rcx[i] + rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4465  	lea	rsi, [r8 + 2*r10]       # rsi = r8 + 2*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4466  	lea	rax, [rdx + 2*r10]
  4467  	cmp	rax, r8
  4468  	seta	r9b                     # r9b = (rdx + 2*r10 > r8), unsigned
  4469  	lea	rax, [rcx + 2*r10]
  4470  	cmp	rsi, rdx
  4471  	seta	r11b                    # r11b = (r8 + 2*r10 > rdx), unsigned
  4472  	cmp	rax, r8
  4473  	seta	al                      # al = (rcx + 2*r10 > r8), unsigned
  4474  	cmp	rsi, rcx
  4475  	seta	dil                     # dil = (r8 + 2*r10 > rcx), unsigned
  4476  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4477  	test	r9b, r11b
  4478  	jne	.LBB0_260               # r8 and rdx ranges overlap: leave for .LBB0_260 (defined elsewhere in this file)
  4479  # %bb.252:
  4480  	and	al, dil
  4481  	jne	.LBB0_260               # r8 and rcx ranges overlap: same exit
  4482  # %bb.253:
  4483  	mov	esi, r10d
  4484  	and	esi, -16                # rsi = low 32 bits of r10, rounded down to a multiple of 16 (one step = two xmm vectors)
  4485  	lea	rax, [rsi - 16]
  4486  	mov	r9, rax
  4487  	shr	r9, 4
  4488  	add	r9, 1                   # r9 = (rsi - 16)/16 + 1 = number of 16-element steps
  4489  	test	rax, rax
  4490  	je	.LBB0_254               # rsi == 16 (single step): skip the loop, continue at .LBB0_254 (defined elsewhere in this file)
  4491  # %bb.255:
  4492  	mov	rax, r9
  4493  	and	rax, -2
  4494  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4495  	xor	edi, edi                # rdi = element index, starts at 0
  4496  .LBB0_256:                              # =>This Inner Loop Header: Depth=1
  4497  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi] # step 1 of the pass: byte offsets 0..31
  4498  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  4499  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]
  4500  	paddw	xmm2, xmm0              # xmm2 = rcx lanes + rdx lanes
  4501  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  4502  	paddw	xmm0, xmm1
  4503  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2
  4504  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  4505  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4506  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
  4507  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 32]
  4508  	paddw	xmm2, xmm0
  4509  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 48]
  4510  	paddw	xmm0, xmm1
  4511  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm2
  4512  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm0
  4513  	add	rdi, 32                 # 32 elements (64 bytes) per pass
  4514  	add	rax, 2
  4515  	jne	.LBB0_256               # loop until rax reaches zero
  4516  	jmp	.LBB0_257               # continue at .LBB0_257 (defined elsewhere in this file)
  4517  .LBB0_267:                              # 16-bit lane add, r8[i] = rcx[i] + rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4518  	lea	rsi, [r8 + 2*r10]       # rsi = r8 + 2*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4519  	lea	rax, [rdx + 2*r10]
  4520  	cmp	rax, r8
  4521  	seta	r9b                     # r9b = (rdx + 2*r10 > r8), unsigned
  4522  	lea	rax, [rcx + 2*r10]
  4523  	cmp	rsi, rdx
  4524  	seta	r11b                    # r11b = (r8 + 2*r10 > rdx), unsigned
  4525  	cmp	rax, r8
  4526  	seta	al                      # al = (rcx + 2*r10 > r8), unsigned
  4527  	cmp	rsi, rcx
  4528  	seta	dil                     # dil = (r8 + 2*r10 > rcx), unsigned
  4529  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4530  	test	r9b, r11b
  4531  	jne	.LBB0_276               # r8 and rdx ranges overlap: leave for .LBB0_276 (defined elsewhere in this file)
  4532  # %bb.268:
  4533  	and	al, dil
  4534  	jne	.LBB0_276               # r8 and rcx ranges overlap: same exit
  4535  # %bb.269:
  4536  	mov	esi, r10d
  4537  	and	esi, -16                # rsi = low 32 bits of r10, rounded down to a multiple of 16 (one step = two xmm vectors)
  4538  	lea	rax, [rsi - 16]
  4539  	mov	r9, rax
  4540  	shr	r9, 4
  4541  	add	r9, 1                   # r9 = (rsi - 16)/16 + 1 = number of 16-element steps
  4542  	test	rax, rax
  4543  	je	.LBB0_270               # rsi == 16 (single step): skip the loop, continue at .LBB0_270 (defined elsewhere in this file)
  4544  # %bb.271:
  4545  	mov	rax, r9
  4546  	and	rax, -2
  4547  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4548  	xor	edi, edi                # rdi = element index, starts at 0
  4549  .LBB0_272:                              # =>This Inner Loop Header: Depth=1
  4550  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi] # step 1 of the pass: byte offsets 0..31
  4551  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  4552  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]
  4553  	paddw	xmm2, xmm0              # xmm2 = rcx lanes + rdx lanes
  4554  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  4555  	paddw	xmm0, xmm1
  4556  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2
  4557  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  4558  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4559  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
  4560  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 32]
  4561  	paddw	xmm2, xmm0
  4562  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 48]
  4563  	paddw	xmm0, xmm1
  4564  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm2
  4565  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm0
  4566  	add	rdi, 32                 # 32 elements (64 bytes) per pass
  4567  	add	rax, 2
  4568  	jne	.LBB0_272               # loop until rax reaches zero
  4569  	jmp	.LBB0_273               # continue at .LBB0_273 (defined elsewhere in this file)
  4570  .LBB0_597:                              # 16-bit lane subtract, r8[i] = rdx[i] - rcx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4571  	lea	rsi, [r8 + 2*r10]       # rsi = r8 + 2*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4572  	lea	rax, [rdx + 2*r10]
  4573  	cmp	rax, r8
  4574  	seta	r9b                     # r9b = (rdx + 2*r10 > r8), unsigned
  4575  	lea	rax, [rcx + 2*r10]
  4576  	cmp	rsi, rdx
  4577  	seta	r11b                    # r11b = (r8 + 2*r10 > rdx), unsigned
  4578  	cmp	rax, r8
  4579  	seta	al                      # al = (rcx + 2*r10 > r8), unsigned
  4580  	cmp	rsi, rcx
  4581  	seta	dil                     # dil = (r8 + 2*r10 > rcx), unsigned
  4582  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4583  	test	r9b, r11b
  4584  	jne	.LBB0_606               # r8 and rdx ranges overlap: leave for .LBB0_606 (defined elsewhere in this file)
  4585  # %bb.598:
  4586  	and	al, dil
  4587  	jne	.LBB0_606               # r8 and rcx ranges overlap: same exit
  4588  # %bb.599:
  4589  	mov	esi, r10d
  4590  	and	esi, -16                # rsi = low 32 bits of r10, rounded down to a multiple of 16 (one step = two xmm vectors)
  4591  	lea	rax, [rsi - 16]
  4592  	mov	r9, rax
  4593  	shr	r9, 4
  4594  	add	r9, 1                   # r9 = (rsi - 16)/16 + 1 = number of 16-element steps
  4595  	test	rax, rax
  4596  	je	.LBB0_600               # rsi == 16 (single step): skip the loop, continue at .LBB0_600 (defined elsewhere in this file)
  4597  # %bb.601:
  4598  	mov	rax, r9
  4599  	and	rax, -2
  4600  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4601  	xor	edi, edi                # rdi = element index, starts at 0
  4602  .LBB0_602:                              # =>This Inner Loop Header: Depth=1
  4603  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi] # step 1 of the pass: byte offsets 0..31
  4604  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  4605  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]
  4606  	psubw	xmm0, xmm2              # xmm0 = rdx lanes - rcx lanes
  4607  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
  4608  	psubw	xmm1, xmm2
  4609  	movdqu	xmmword ptr [r8 + 2*rdi], xmm0
  4610  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm1
  4611  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4612  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
  4613  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 32]
  4614  	psubw	xmm0, xmm2
  4615  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
  4616  	psubw	xmm1, xmm2
  4617  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm0
  4618  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm1
  4619  	add	rdi, 32                 # 32 elements (64 bytes) per pass
  4620  	add	rax, 2
  4621  	jne	.LBB0_602               # loop until rax reaches zero
  4622  	jmp	.LBB0_603               # continue at .LBB0_603 (defined elsewhere in this file)
  4623  .LBB0_613:                              # 16-bit lane subtract, r8[i] = rdx[i] - rcx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4624  	lea	rsi, [r8 + 2*r10]       # rsi = r8 + 2*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4625  	lea	rax, [rdx + 2*r10]
  4626  	cmp	rax, r8
  4627  	seta	r9b                     # r9b = (rdx + 2*r10 > r8), unsigned
  4628  	lea	rax, [rcx + 2*r10]
  4629  	cmp	rsi, rdx
  4630  	seta	r11b                    # r11b = (r8 + 2*r10 > rdx), unsigned
  4631  	cmp	rax, r8
  4632  	seta	al                      # al = (rcx + 2*r10 > r8), unsigned
  4633  	cmp	rsi, rcx
  4634  	seta	dil                     # dil = (r8 + 2*r10 > rcx), unsigned
  4635  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4636  	test	r9b, r11b
  4637  	jne	.LBB0_622               # r8 and rdx ranges overlap: leave for .LBB0_622 (defined elsewhere in this file)
  4638  # %bb.614:
  4639  	and	al, dil
  4640  	jne	.LBB0_622               # r8 and rcx ranges overlap: same exit
  4641  # %bb.615:
  4642  	mov	esi, r10d
  4643  	and	esi, -16                # rsi = low 32 bits of r10, rounded down to a multiple of 16 (one step = two xmm vectors)
  4644  	lea	rax, [rsi - 16]
  4645  	mov	r9, rax
  4646  	shr	r9, 4
  4647  	add	r9, 1                   # r9 = (rsi - 16)/16 + 1 = number of 16-element steps
  4648  	test	rax, rax
  4649  	je	.LBB0_616               # rsi == 16 (single step): skip the loop, continue at .LBB0_616 (defined elsewhere in this file)
  4650  # %bb.617:
  4651  	mov	rax, r9
  4652  	and	rax, -2
  4653  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4654  	xor	edi, edi                # rdi = element index, starts at 0
  4655  .LBB0_618:                              # =>This Inner Loop Header: Depth=1
  4656  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi] # step 1 of the pass: byte offsets 0..31
  4657  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  4658  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]
  4659  	psubw	xmm0, xmm2              # xmm0 = rdx lanes - rcx lanes
  4660  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
  4661  	psubw	xmm1, xmm2
  4662  	movdqu	xmmword ptr [r8 + 2*rdi], xmm0
  4663  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm1
  4664  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4665  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
  4666  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 32]
  4667  	psubw	xmm0, xmm2
  4668  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
  4669  	psubw	xmm1, xmm2
  4670  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm0
  4671  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm1
  4672  	add	rdi, 32                 # 32 elements (64 bytes) per pass
  4673  	add	rax, 2
  4674  	jne	.LBB0_618               # loop until rax reaches zero
  4675  	jmp	.LBB0_619               # continue at .LBB0_619 (defined elsewhere in this file)
  4676  .LBB0_829:                              # Scalar 64-bit multiply, r8[i] = rcx[i] * rdx[i] (low 64 bits kept), four elements per pass
  4677  	and	esi, -4                 # loop bound: esi rounded down to a multiple of 4; NOTE(review): esi is set before this block - presumably the element count, confirm
  4678  	xor	edi, edi                # rdi = element index, starts at 0
  4679  .LBB0_830:                              # =>This Inner Loop Header: Depth=1
  4680  	mov	rax, qword ptr [rcx + 8*rdi]
  4681  	imul	rax, qword ptr [rdx + 8*rdi] # element rdi
  4682  	mov	qword ptr [r8 + 8*rdi], rax
  4683  	mov	rax, qword ptr [rcx + 8*rdi + 8]
  4684  	imul	rax, qword ptr [rdx + 8*rdi + 8] # element rdi + 1
  4685  	mov	qword ptr [r8 + 8*rdi + 8], rax
  4686  	mov	rax, qword ptr [rcx + 8*rdi + 16]
  4687  	imul	rax, qword ptr [rdx + 8*rdi + 16] # element rdi + 2
  4688  	mov	qword ptr [r8 + 8*rdi + 16], rax
  4689  	mov	rax, qword ptr [rcx + 8*rdi + 24]
  4690  	imul	rax, qword ptr [rdx + 8*rdi + 24] # element rdi + 3
  4691  	mov	qword ptr [r8 + 8*rdi + 24], rax
  4692  	add	rdi, 4
  4693  	cmp	rsi, rdi
  4694  	jne	.LBB0_830               # loop until rdi == rsi
  4695  .LBB0_831:                              # Remainder loop, one element per pass. NOTE(review): r9 (leftover count) is set before this block and this label may also be a jump target - confirm
  4696  	test	r9, r9
  4697  	je	.LBB0_1013              # r9 == 0: nothing left, go to the exit at .LBB0_1013
  4698  # %bb.832:
  4699  	lea	rsi, [r8 + 8*rdi]       # rebase the output and both inputs to element rdi
  4700  	lea	rcx, [rcx + 8*rdi]
  4701  	lea	rdx, [rdx + 8*rdi]
  4702  	xor	edi, edi                # index restarts at 0 relative to the rebased pointers
  4703  .LBB0_833:                              # =>This Inner Loop Header: Depth=1
  4704  	mov	rax, qword ptr [rcx + 8*rdi]
  4705  	imul	rax, qword ptr [rdx + 8*rdi]
  4706  	mov	qword ptr [rsi + 8*rdi], rax
  4707  	add	rdi, 1
  4708  	cmp	r9, rdi
  4709  	jne	.LBB0_833               # loop until r9 elements are done
  4710  	jmp	.LBB0_1013              # done: go to the exit at .LBB0_1013
  4711  .LBB0_837:                              # Packed single-precision multiply, r8[i] = rcx[i] * rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4712  	lea	rsi, [r8 + 4*r10]       # rsi = r8 + 4*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4713  	lea	rax, [rdx + 4*r10]
  4714  	cmp	rax, r8
  4715  	seta	r9b                     # r9b = (rdx + 4*r10 > r8), unsigned
  4716  	lea	rax, [rcx + 4*r10]
  4717  	cmp	rsi, rdx
  4718  	seta	r11b                    # r11b = (r8 + 4*r10 > rdx), unsigned
  4719  	cmp	rax, r8
  4720  	seta	al                      # al = (rcx + 4*r10 > r8), unsigned
  4721  	cmp	rsi, rcx
  4722  	seta	dil                     # dil = (r8 + 4*r10 > rcx), unsigned
  4723  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4724  	test	r9b, r11b
  4725  	jne	.LBB0_846               # r8 and rdx ranges overlap: leave for .LBB0_846 (defined elsewhere in this file)
  4726  # %bb.838:
  4727  	and	al, dil
  4728  	jne	.LBB0_846               # r8 and rcx ranges overlap: same exit
  4729  # %bb.839:
  4730  	mov	esi, r10d
  4731  	and	esi, -8                 # rsi = low 32 bits of r10, rounded down to a multiple of 8 (one step = two xmm vectors)
  4732  	lea	rax, [rsi - 8]
  4733  	mov	r9, rax
  4734  	shr	r9, 3
  4735  	add	r9, 1                   # r9 = (rsi - 8)/8 + 1 = number of 8-element steps
  4736  	test	rax, rax
  4737  	je	.LBB0_840               # rsi == 8 (single step): skip the loop, continue at .LBB0_840 (defined elsewhere in this file)
  4738  # %bb.841:
  4739  	mov	rax, r9
  4740  	and	rax, -2
  4741  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4742  	xor	edi, edi                # rdi = element index, starts at 0
  4743  .LBB0_842:                              # =>This Inner Loop Header: Depth=1
  4744  	movups	xmm0, xmmword ptr [rdx + 4*rdi] # step 1 of the pass: byte offsets 0..31
  4745  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  4746  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
  4747  	mulps	xmm2, xmm0              # xmm2 = rcx lanes * rdx lanes
  4748  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  4749  	mulps	xmm0, xmm1
  4750  	movups	xmmword ptr [r8 + 4*rdi], xmm2
  4751  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm0
  4752  	movups	xmm0, xmmword ptr [rdx + 4*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4753  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  4754  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  4755  	mulps	xmm2, xmm0
  4756  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 48]
  4757  	mulps	xmm0, xmm1
  4758  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
  4759  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm0
  4760  	add	rdi, 16                 # 16 elements (64 bytes) per pass
  4761  	add	rax, 2
  4762  	jne	.LBB0_842               # loop until rax reaches zero
  4763  	jmp	.LBB0_843               # continue at .LBB0_843 (defined elsewhere in this file)
  4764  .LBB0_979:                              # Scalar 64-bit multiply, r8[i] = rcx[i] * rdx[i] (low 64 bits kept), four elements per pass
  4765  	and	esi, -4                 # loop bound: esi rounded down to a multiple of 4; NOTE(review): esi is set before this block - presumably the element count, confirm
  4766  	xor	edi, edi                # rdi = element index, starts at 0
  4767  .LBB0_980:                              # =>This Inner Loop Header: Depth=1
  4768  	mov	rax, qword ptr [rcx + 8*rdi]
  4769  	imul	rax, qword ptr [rdx + 8*rdi] # element rdi
  4770  	mov	qword ptr [r8 + 8*rdi], rax
  4771  	mov	rax, qword ptr [rcx + 8*rdi + 8]
  4772  	imul	rax, qword ptr [rdx + 8*rdi + 8] # element rdi + 1
  4773  	mov	qword ptr [r8 + 8*rdi + 8], rax
  4774  	mov	rax, qword ptr [rcx + 8*rdi + 16]
  4775  	imul	rax, qword ptr [rdx + 8*rdi + 16] # element rdi + 2
  4776  	mov	qword ptr [r8 + 8*rdi + 16], rax
  4777  	mov	rax, qword ptr [rcx + 8*rdi + 24]
  4778  	imul	rax, qword ptr [rdx + 8*rdi + 24] # element rdi + 3
  4779  	mov	qword ptr [r8 + 8*rdi + 24], rax
  4780  	add	rdi, 4
  4781  	cmp	rsi, rdi
  4782  	jne	.LBB0_980               # loop until rdi == rsi
  4783  .LBB0_981:                              # Remainder loop, one element per pass. NOTE(review): r9 (leftover count) is set before this block and this label may also be a jump target - confirm
  4784  	test	r9, r9
  4785  	je	.LBB0_1013              # r9 == 0: nothing left, go to the exit at .LBB0_1013
  4786  # %bb.982:
  4787  	lea	rsi, [r8 + 8*rdi]       # rebase the output and both inputs to element rdi
  4788  	lea	rcx, [rcx + 8*rdi]
  4789  	lea	rdx, [rdx + 8*rdi]
  4790  	xor	edi, edi                # index restarts at 0 relative to the rebased pointers
  4791  .LBB0_983:                              # =>This Inner Loop Header: Depth=1
  4792  	mov	rax, qword ptr [rcx + 8*rdi]
  4793  	imul	rax, qword ptr [rdx + 8*rdi]
  4794  	mov	qword ptr [rsi + 8*rdi], rax
  4795  	add	rdi, 1
  4796  	cmp	r9, rdi
  4797  	jne	.LBB0_983               # loop until r9 elements are done, then fall through to .LBB0_1013
  4798  .LBB0_1013:                             # Function exit: undo the prologue (push rbp / mov rbp, rsp / and rsp, -8) and return
  4799  	mov	rsp, rbp                # discard the stack alignment applied in the prologue
  4800  	pop	rbp                     # restore the caller's rbp
  4801  	ret
  4802  .LBB0_987:                              # Packed single-precision multiply, r8[i] = rcx[i] * rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4803  	lea	rsi, [r8 + 4*r10]       # rsi = r8 + 4*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4804  	lea	rax, [rdx + 4*r10]
  4805  	cmp	rax, r8
  4806  	seta	r9b                     # r9b = (rdx + 4*r10 > r8), unsigned
  4807  	lea	rax, [rcx + 4*r10]
  4808  	cmp	rsi, rdx
  4809  	seta	r11b                    # r11b = (r8 + 4*r10 > rdx), unsigned
  4810  	cmp	rax, r8
  4811  	seta	al                      # al = (rcx + 4*r10 > r8), unsigned
  4812  	cmp	rsi, rcx
  4813  	seta	dil                     # dil = (r8 + 4*r10 > rcx), unsigned
  4814  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4815  	test	r9b, r11b
  4816  	jne	.LBB0_996               # r8 and rdx ranges overlap: leave for .LBB0_996 (defined elsewhere in this file)
  4817  # %bb.988:
  4818  	and	al, dil
  4819  	jne	.LBB0_996               # r8 and rcx ranges overlap: same exit
  4820  # %bb.989:
  4821  	mov	esi, r10d
  4822  	and	esi, -8                 # rsi = low 32 bits of r10, rounded down to a multiple of 8 (one step = two xmm vectors)
  4823  	lea	rax, [rsi - 8]
  4824  	mov	r9, rax
  4825  	shr	r9, 3
  4826  	add	r9, 1                   # r9 = (rsi - 8)/8 + 1 = number of 8-element steps
  4827  	test	rax, rax
  4828  	je	.LBB0_990               # rsi == 8 (single step): skip the loop, continue at .LBB0_990 (defined elsewhere in this file)
  4829  # %bb.991:
  4830  	mov	rax, r9
  4831  	and	rax, -2
  4832  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4833  	xor	edi, edi                # rdi = element index, starts at 0
  4834  .LBB0_992:                              # =>This Inner Loop Header: Depth=1
  4835  	movups	xmm0, xmmword ptr [rdx + 4*rdi] # step 1 of the pass: byte offsets 0..31
  4836  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  4837  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
  4838  	mulps	xmm2, xmm0              # xmm2 = rcx lanes * rdx lanes
  4839  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  4840  	mulps	xmm0, xmm1
  4841  	movups	xmmword ptr [r8 + 4*rdi], xmm2
  4842  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm0
  4843  	movups	xmm0, xmmword ptr [rdx + 4*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4844  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  4845  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  4846  	mulps	xmm2, xmm0
  4847  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 48]
  4848  	mulps	xmm0, xmm1
  4849  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
  4850  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm0
  4851  	add	rdi, 16                 # 16 elements (64 bytes) per pass
  4852  	add	rax, 2
  4853  	jne	.LBB0_992               # loop until rax reaches zero
  4854  	jmp	.LBB0_993               # continue at .LBB0_993 (defined elsewhere in this file)
  4855  .LBB0_152:                              # 64-bit lane add, r8[i] = rcx[i] + rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4856  	lea	rsi, [r8 + 8*r10]       # rsi = r8 + 8*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4857  	lea	rax, [rdx + 8*r10]
  4858  	cmp	rax, r8
  4859  	seta	r9b                     # r9b = (rdx + 8*r10 > r8), unsigned
  4860  	lea	rax, [rcx + 8*r10]
  4861  	cmp	rsi, rdx
  4862  	seta	r11b                    # r11b = (r8 + 8*r10 > rdx), unsigned
  4863  	cmp	rax, r8
  4864  	seta	al                      # al = (rcx + 8*r10 > r8), unsigned
  4865  	cmp	rsi, rcx
  4866  	seta	dil                     # dil = (r8 + 8*r10 > rcx), unsigned
  4867  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4868  	test	r9b, r11b
  4869  	jne	.LBB0_161               # r8 and rdx ranges overlap: leave for .LBB0_161 (defined elsewhere in this file)
  4870  # %bb.153:
  4871  	and	al, dil
  4872  	jne	.LBB0_161               # r8 and rcx ranges overlap: same exit
  4873  # %bb.154:
  4874  	mov	esi, r10d
  4875  	and	esi, -4                 # rsi = low 32 bits of r10, rounded down to a multiple of 4 (one step = two xmm vectors)
  4876  	lea	rax, [rsi - 4]
  4877  	mov	r9, rax
  4878  	shr	r9, 2
  4879  	add	r9, 1                   # r9 = (rsi - 4)/4 + 1 = number of 4-element steps
  4880  	test	rax, rax
  4881  	je	.LBB0_155               # rsi == 4 (single step): skip the loop, continue at .LBB0_155 (defined elsewhere in this file)
  4882  # %bb.156:
  4883  	mov	rax, r9
  4884  	and	rax, -2
  4885  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4886  	xor	edi, edi                # rdi = element index, starts at 0
  4887  .LBB0_157:                              # =>This Inner Loop Header: Depth=1
  4888  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi] # step 1 of the pass: byte offsets 0..31
  4889  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  4890  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]
  4891  	paddq	xmm2, xmm0              # xmm2 = rcx lanes + rdx lanes
  4892  	movdqu	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  4893  	paddq	xmm0, xmm1
  4894  	movdqu	xmmword ptr [r8 + 8*rdi], xmm2
  4895  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm0
  4896  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4897  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  4898  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  4899  	paddq	xmm2, xmm0
  4900  	movdqu	xmm0, xmmword ptr [rcx + 8*rdi + 48]
  4901  	paddq	xmm0, xmm1
  4902  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm2
  4903  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm0
  4904  	add	rdi, 8                  # 8 elements (64 bytes) per pass
  4905  	add	rax, 2
  4906  	jne	.LBB0_157               # loop until rax reaches zero
  4907  	jmp	.LBB0_158               # continue at .LBB0_158 (defined elsewhere in this file)
  4908  .LBB0_168:                              # Packed single-precision add, r8[i] = rcx[i] + rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4909  	lea	rsi, [r8 + 4*r10]       # rsi = r8 + 4*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4910  	lea	rax, [rdx + 4*r10]
  4911  	cmp	rax, r8
  4912  	seta	r9b                     # r9b = (rdx + 4*r10 > r8), unsigned
  4913  	lea	rax, [rcx + 4*r10]
  4914  	cmp	rsi, rdx
  4915  	seta	r11b                    # r11b = (r8 + 4*r10 > rdx), unsigned
  4916  	cmp	rax, r8
  4917  	seta	al                      # al = (rcx + 4*r10 > r8), unsigned
  4918  	cmp	rsi, rcx
  4919  	seta	dil                     # dil = (r8 + 4*r10 > rcx), unsigned
  4920  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4921  	test	r9b, r11b
  4922  	jne	.LBB0_177               # r8 and rdx ranges overlap: leave for .LBB0_177 (defined elsewhere in this file)
  4923  # %bb.169:
  4924  	and	al, dil
  4925  	jne	.LBB0_177               # r8 and rcx ranges overlap: same exit
  4926  # %bb.170:
  4927  	mov	esi, r10d
  4928  	and	esi, -8                 # rsi = low 32 bits of r10, rounded down to a multiple of 8 (one step = two xmm vectors)
  4929  	lea	rax, [rsi - 8]
  4930  	mov	r9, rax
  4931  	shr	r9, 3
  4932  	add	r9, 1                   # r9 = (rsi - 8)/8 + 1 = number of 8-element steps
  4933  	test	rax, rax
  4934  	je	.LBB0_171               # rsi == 8 (single step): skip the loop, continue at .LBB0_171 (defined elsewhere in this file)
  4935  # %bb.172:
  4936  	mov	rax, r9
  4937  	and	rax, -2
  4938  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4939  	xor	edi, edi                # rdi = element index, starts at 0
  4940  .LBB0_173:                              # =>This Inner Loop Header: Depth=1
  4941  	movups	xmm0, xmmword ptr [rdx + 4*rdi] # step 1 of the pass: byte offsets 0..31
  4942  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  4943  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
  4944  	addps	xmm2, xmm0              # xmm2 = rcx lanes + rdx lanes
  4945  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  4946  	addps	xmm0, xmm1
  4947  	movups	xmmword ptr [r8 + 4*rdi], xmm2
  4948  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm0
  4949  	movups	xmm0, xmmword ptr [rdx + 4*rdi + 32] # step 2 of the pass: byte offsets 32..63
  4950  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  4951  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  4952  	addps	xmm2, xmm0
  4953  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 48]
  4954  	addps	xmm0, xmm1
  4955  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
  4956  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm0
  4957  	add	rdi, 16                 # 16 elements (64 bytes) per pass
  4958  	add	rax, 2
  4959  	jne	.LBB0_173               # loop until rax reaches zero
  4960  	jmp	.LBB0_174               # continue at .LBB0_174 (defined elsewhere in this file)
  4961  .LBB0_498:                              # 64-bit lane subtract, r8[i] = rdx[i] - rcx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  4962  	lea	rsi, [r8 + 8*r10]       # rsi = r8 + 8*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  4963  	lea	rax, [rdx + 8*r10]
  4964  	cmp	rax, r8
  4965  	seta	r9b                     # r9b = (rdx + 8*r10 > r8), unsigned
  4966  	lea	rax, [rcx + 8*r10]
  4967  	cmp	rsi, rdx
  4968  	seta	r11b                    # r11b = (r8 + 8*r10 > rdx), unsigned
  4969  	cmp	rax, r8
  4970  	seta	al                      # al = (rcx + 8*r10 > r8), unsigned
  4971  	cmp	rsi, rcx
  4972  	seta	dil                     # dil = (r8 + 8*r10 > rcx), unsigned
  4973  	xor	esi, esi                # rsi = 0 on both overlap exits below
  4974  	test	r9b, r11b
  4975  	jne	.LBB0_507               # r8 and rdx ranges overlap: leave for .LBB0_507 (defined elsewhere in this file)
  4976  # %bb.499:
  4977  	and	al, dil
  4978  	jne	.LBB0_507               # r8 and rcx ranges overlap: same exit
  4979  # %bb.500:
  4980  	mov	esi, r10d
  4981  	and	esi, -4                 # rsi = low 32 bits of r10, rounded down to a multiple of 4 (one step = two xmm vectors)
  4982  	lea	rax, [rsi - 4]
  4983  	mov	r9, rax
  4984  	shr	r9, 2
  4985  	add	r9, 1                   # r9 = (rsi - 4)/4 + 1 = number of 4-element steps
  4986  	test	rax, rax
  4987  	je	.LBB0_501               # rsi == 4 (single step): skip the loop, continue at .LBB0_501 (defined elsewhere in this file)
  4988  # %bb.502:
  4989  	mov	rax, r9
  4990  	and	rax, -2
  4991  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  4992  	xor	edi, edi                # rdi = element index, starts at 0
  4993  .LBB0_503:                              # =>This Inner Loop Header: Depth=1
  4994  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi] # step 1 of the pass: byte offsets 0..31
  4995  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  4996  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]
  4997  	psubq	xmm0, xmm2              # xmm0 = rdx lanes - rcx lanes
  4998  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
  4999  	psubq	xmm1, xmm2
  5000  	movdqu	xmmword ptr [r8 + 8*rdi], xmm0
  5001  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm1
  5002  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi + 32] # step 2 of the pass: byte offsets 32..63
  5003  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  5004  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  5005  	psubq	xmm0, xmm2
  5006  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 48]
  5007  	psubq	xmm1, xmm2
  5008  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm0
  5009  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm1
  5010  	add	rdi, 8                  # 8 elements (64 bytes) per pass
  5011  	add	rax, 2
  5012  	jne	.LBB0_503               # loop until rax reaches zero
  5013  	jmp	.LBB0_504               # continue at .LBB0_504 (defined elsewhere in this file)
  5014  .LBB0_514:                              # Packed single-precision subtract, r8[i] = rdx[i] - rcx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  5015  	lea	rsi, [r8 + 4*r10]       # rsi = r8 + 4*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  5016  	lea	rax, [rdx + 4*r10]
  5017  	cmp	rax, r8
  5018  	seta	r9b                     # r9b = (rdx + 4*r10 > r8), unsigned
  5019  	lea	rax, [rcx + 4*r10]
  5020  	cmp	rsi, rdx
  5021  	seta	r11b                    # r11b = (r8 + 4*r10 > rdx), unsigned
  5022  	cmp	rax, r8
  5023  	seta	al                      # al = (rcx + 4*r10 > r8), unsigned
  5024  	cmp	rsi, rcx
  5025  	seta	dil                     # dil = (r8 + 4*r10 > rcx), unsigned
  5026  	xor	esi, esi                # rsi = 0 on both overlap exits below
  5027  	test	r9b, r11b
  5028  	jne	.LBB0_523               # r8 and rdx ranges overlap: leave for .LBB0_523 (defined elsewhere in this file)
  5029  # %bb.515:
  5030  	and	al, dil
  5031  	jne	.LBB0_523               # r8 and rcx ranges overlap: same exit
  5032  # %bb.516:
  5033  	mov	esi, r10d
  5034  	and	esi, -8                 # rsi = low 32 bits of r10, rounded down to a multiple of 8 (one step = two xmm vectors)
  5035  	lea	rax, [rsi - 8]
  5036  	mov	r9, rax
  5037  	shr	r9, 3
  5038  	add	r9, 1                   # r9 = (rsi - 8)/8 + 1 = number of 8-element steps
  5039  	test	rax, rax
  5040  	je	.LBB0_517               # rsi == 8 (single step): skip the loop, continue at .LBB0_517 (defined elsewhere in this file)
  5041  # %bb.518:
  5042  	mov	rax, r9
  5043  	and	rax, -2
  5044  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  5045  	xor	edi, edi                # rdi = element index, starts at 0
  5046  .LBB0_519:                              # =>This Inner Loop Header: Depth=1
  5047  	movups	xmm0, xmmword ptr [rdx + 4*rdi] # step 1 of the pass: byte offsets 0..31
  5048  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  5049  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
  5050  	subps	xmm0, xmm2              # xmm0 = rdx lanes - rcx lanes
  5051  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 16]
  5052  	subps	xmm1, xmm2
  5053  	movups	xmmword ptr [r8 + 4*rdi], xmm0
  5054  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm1
  5055  	movups	xmm0, xmmword ptr [rdx + 4*rdi + 32] # step 2 of the pass: byte offsets 32..63
  5056  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  5057  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  5058  	subps	xmm0, xmm2
  5059  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 48]
  5060  	subps	xmm1, xmm2
  5061  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm0
  5062  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm1
  5063  	add	rdi, 16                 # 16 elements (64 bytes) per pass
  5064  	add	rax, 2
  5065  	jne	.LBB0_519               # loop until rax reaches zero
  5066  	jmp	.LBB0_520               # continue at .LBB0_520 (defined elsewhere in this file)
  5067  .LBB0_325:                              # 64-bit lane add, r8[i] = rcx[i] + rdx[i]: buffer-overlap check, then a 2x-unrolled 128-bit xmm loop
  5068  	lea	rsi, [r8 + 8*r10]       # rsi = r8 + 8*r10; NOTE(review): r10 is presumably the element count, set before this block - confirm
  5069  	lea	rax, [rdx + 8*r10]
  5070  	cmp	rax, r8
  5071  	seta	r9b                     # r9b = (rdx + 8*r10 > r8), unsigned
  5072  	lea	rax, [rcx + 8*r10]
  5073  	cmp	rsi, rdx
  5074  	seta	r11b                    # r11b = (r8 + 8*r10 > rdx), unsigned
  5075  	cmp	rax, r8
  5076  	seta	al                      # al = (rcx + 8*r10 > r8), unsigned
  5077  	cmp	rsi, rcx
  5078  	seta	dil                     # dil = (r8 + 8*r10 > rcx), unsigned
  5079  	xor	esi, esi                # rsi = 0 on both overlap exits below
  5080  	test	r9b, r11b
  5081  	jne	.LBB0_334               # r8 and rdx ranges overlap: leave for .LBB0_334 (defined elsewhere in this file)
  5082  # %bb.326:
  5083  	and	al, dil
  5084  	jne	.LBB0_334               # r8 and rcx ranges overlap: same exit
  5085  # %bb.327:
  5086  	mov	esi, r10d
  5087  	and	esi, -4                 # rsi = low 32 bits of r10, rounded down to a multiple of 4 (one step = two xmm vectors)
  5088  	lea	rax, [rsi - 4]
  5089  	mov	r9, rax
  5090  	shr	r9, 2
  5091  	add	r9, 1                   # r9 = (rsi - 4)/4 + 1 = number of 4-element steps
  5092  	test	rax, rax
  5093  	je	.LBB0_328               # rsi == 4 (single step): skip the loop, continue at .LBB0_328 (defined elsewhere in this file)
  5094  # %bb.329:
  5095  	mov	rax, r9
  5096  	and	rax, -2
  5097  	neg	rax                     # rax = -(r9 & -2), counted up by 2 per pass
  5098  	xor	edi, edi                # rdi = element index, starts at 0
  5099  .LBB0_330:                              # =>This Inner Loop Header: Depth=1
  5100  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi] # step 1 of the pass: byte offsets 0..31
  5101  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  5102  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]
  5103  	paddq	xmm2, xmm0              # xmm2 = rcx lanes + rdx lanes
  5104  	movdqu	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  5105  	paddq	xmm0, xmm1
  5106  	movdqu	xmmword ptr [r8 + 8*rdi], xmm2
  5107  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm0
  5108  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi + 32] # step 2 of the pass: byte offsets 32..63
  5109  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  5110  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  5111  	paddq	xmm2, xmm0
  5112  	movdqu	xmm0, xmmword ptr [rcx + 8*rdi + 48]
  5113  	paddq	xmm0, xmm1
  5114  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm2
  5115  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm0
  5116  	add	rdi, 8                  # 8 elements (64 bytes) per pass
  5117  	add	rax, 2
  5118  	jne	.LBB0_330               # loop until rax reaches zero
  5119  	jmp	.LBB0_331               # continue at .LBB0_331 (defined elsewhere in this file)
  5120  .LBB0_341:                              # single-precision add (r8[i] = rdx[i] + rcx[i]): overlap check, then 2x-unrolled SSE loop
  5121  	lea	rsi, [r8 + 4*r10]	# rsi = end of output range (r8 + 4*r10)
  5122  	lea	rax, [rdx + 4*r10]	# rax = end of rdx operand range
  5123  	cmp	rax, r8
  5124  	seta	r9b	# r9b = rdx range ends above output start
  5125  	lea	rax, [rcx + 4*r10]	# rax = end of rcx operand range
  5126  	cmp	rsi, rdx
  5127  	seta	r11b	# r11b = output range ends above rdx start
  5128  	cmp	rax, r8
  5129  	seta	al	# al = rcx range ends above output start
  5130  	cmp	rsi, rcx
  5131  	seta	dil	# dil = output range ends above rcx start
  5132  	xor	esi, esi	# esi = 0 on the overlap exits below
  5133  	test	r9b, r11b	# both set: output overlaps the rdx range
  5134  	jne	.LBB0_350	# overlap -> leave for .LBB0_350 (outside this excerpt)
  5135  # %bb.342:
  5136  	and	al, dil	# both set: output overlaps the rcx range
  5137  	jne	.LBB0_350	# overlap -> leave for .LBB0_350 (outside this excerpt)
  5138  # %bb.343:
  5139  	mov	esi, r10d
  5140  	and	esi, -8	# esi = r10d rounded down to a multiple of 8 (4-byte lanes per vector step)
  5141  	lea	rax, [rsi - 8]
  5142  	mov	r9, rax
  5143  	shr	r9, 3
  5144  	add	r9, 1	# r9 = (rsi - 8)/8 + 1 = number of 8-element vector steps
  5145  	test	rax, rax
  5146  	je	.LBB0_344	# only one step: bypass the unrolled loop (.LBB0_344 is outside this excerpt)
  5147  # %bb.345:
  5148  	mov	rax, r9
  5149  	and	rax, -2
  5150  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5151  	xor	edi, edi	# rdi = element index
  5152  .LBB0_346:                              # =>This Inner Loop Header: Depth=1
  5153  	movups	xmm0, xmmword ptr [rdx + 4*rdi]	# first step: 2 x 16 bytes from rdx
  5154  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  5155  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
  5156  	addps	xmm2, xmm0	# xmm2 = rcx lanes + rdx lanes
  5157  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  5158  	addps	xmm0, xmm1
  5159  	movups	xmmword ptr [r8 + 4*rdi], xmm2	# store the sums to the output
  5160  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm0
  5161  	movups	xmm0, xmmword ptr [rdx + 4*rdi + 32]	# second step: same work on the next 32 bytes
  5162  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  5163  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  5164  	addps	xmm2, xmm0
  5165  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 48]
  5166  	addps	xmm0, xmm1
  5167  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
  5168  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm0
  5169  	add	rdi, 16	# 16 elements consumed per trip
  5170  	add	rax, 2	# two vector steps done
  5171  	jne	.LBB0_346	# loop until rax reaches zero
  5172  	jmp	.LBB0_347	# continues at .LBB0_347 (outside this excerpt)
  5173  .LBB0_671:                              # 64-bit integer subtract (r8[i] = rdx[i] - rcx[i]): overlap check, then 2x-unrolled SSE loop
  5174  	lea	rsi, [r8 + 8*r10]	# rsi = end of output range (r8 + 8*r10)
  5175  	lea	rax, [rdx + 8*r10]	# rax = end of rdx operand range
  5176  	cmp	rax, r8
  5177  	seta	r9b	# r9b = rdx range ends above output start
  5178  	lea	rax, [rcx + 8*r10]	# rax = end of rcx operand range
  5179  	cmp	rsi, rdx
  5180  	seta	r11b	# r11b = output range ends above rdx start
  5181  	cmp	rax, r8
  5182  	seta	al	# al = rcx range ends above output start
  5183  	cmp	rsi, rcx
  5184  	seta	dil	# dil = output range ends above rcx start
  5185  	xor	esi, esi	# esi = 0 on the overlap exits below
  5186  	test	r9b, r11b	# both set: output overlaps the rdx range
  5187  	jne	.LBB0_680	# overlap -> leave for .LBB0_680 (outside this excerpt)
  5188  # %bb.672:
  5189  	and	al, dil	# both set: output overlaps the rcx range
  5190  	jne	.LBB0_680	# overlap -> leave for .LBB0_680 (outside this excerpt)
  5191  # %bb.673:
  5192  	mov	esi, r10d
  5193  	and	esi, -4	# esi = r10d rounded down to a multiple of 4 (8-byte lanes per vector step)
  5194  	lea	rax, [rsi - 4]
  5195  	mov	r9, rax
  5196  	shr	r9, 2
  5197  	add	r9, 1	# r9 = (rsi - 4)/4 + 1 = number of 4-element vector steps
  5198  	test	rax, rax
  5199  	je	.LBB0_674	# only one step: bypass the unrolled loop (.LBB0_674 is outside this excerpt)
  5200  # %bb.675:
  5201  	mov	rax, r9
  5202  	and	rax, -2
  5203  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5204  	xor	edi, edi	# rdi = element index
  5205  .LBB0_676:                              # =>This Inner Loop Header: Depth=1
  5206  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]	# first step: 2 x 16 bytes from rdx
  5207  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  5208  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]
  5209  	psubq	xmm0, xmm2	# xmm0 = rdx lanes - rcx lanes (64-bit, wrapping)
  5210  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
  5211  	psubq	xmm1, xmm2
  5212  	movdqu	xmmword ptr [r8 + 8*rdi], xmm0	# store the differences to the output
  5213  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm1
  5214  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi + 32]	# second step: same work on the next 32 bytes
  5215  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 48]
  5216  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 32]
  5217  	psubq	xmm0, xmm2
  5218  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 48]
  5219  	psubq	xmm1, xmm2
  5220  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm0
  5221  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm1
  5222  	add	rdi, 8	# 8 elements consumed per trip
  5223  	add	rax, 2	# two vector steps done
  5224  	jne	.LBB0_676	# loop until rax reaches zero
  5225  	jmp	.LBB0_677	# continues at .LBB0_677 (outside this excerpt)
  5226  .LBB0_687:                              # single-precision subtract (r8[i] = rdx[i] - rcx[i]): overlap check, then 2x-unrolled SSE loop
  5227  	lea	rsi, [r8 + 4*r10]	# rsi = end of output range (r8 + 4*r10)
  5228  	lea	rax, [rdx + 4*r10]	# rax = end of rdx operand range
  5229  	cmp	rax, r8
  5230  	seta	r9b	# r9b = rdx range ends above output start
  5231  	lea	rax, [rcx + 4*r10]	# rax = end of rcx operand range
  5232  	cmp	rsi, rdx
  5233  	seta	r11b	# r11b = output range ends above rdx start
  5234  	cmp	rax, r8
  5235  	seta	al	# al = rcx range ends above output start
  5236  	cmp	rsi, rcx
  5237  	seta	dil	# dil = output range ends above rcx start
  5238  	xor	esi, esi	# esi = 0 on the overlap exits below
  5239  	test	r9b, r11b	# both set: output overlaps the rdx range
  5240  	jne	.LBB0_696	# overlap -> leave for .LBB0_696 (outside this excerpt)
  5241  # %bb.688:
  5242  	and	al, dil	# both set: output overlaps the rcx range
  5243  	jne	.LBB0_696	# overlap -> leave for .LBB0_696 (outside this excerpt)
  5244  # %bb.689:
  5245  	mov	esi, r10d
  5246  	and	esi, -8	# esi = r10d rounded down to a multiple of 8 (4-byte lanes per vector step)
  5247  	lea	rax, [rsi - 8]
  5248  	mov	r9, rax
  5249  	shr	r9, 3
  5250  	add	r9, 1	# r9 = (rsi - 8)/8 + 1 = number of 8-element vector steps
  5251  	test	rax, rax
  5252  	je	.LBB0_690	# only one step: bypass the unrolled loop (.LBB0_690 is outside this excerpt)
  5253  # %bb.691:
  5254  	mov	rax, r9
  5255  	and	rax, -2
  5256  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5257  	xor	edi, edi	# rdi = element index
  5258  .LBB0_692:                              # =>This Inner Loop Header: Depth=1
  5259  	movups	xmm0, xmmword ptr [rdx + 4*rdi]	# first step: 2 x 16 bytes from rdx
  5260  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  5261  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
  5262  	subps	xmm0, xmm2	# xmm0 = rdx lanes - rcx lanes
  5263  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 16]
  5264  	subps	xmm1, xmm2
  5265  	movups	xmmword ptr [r8 + 4*rdi], xmm0	# store the differences to the output
  5266  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm1
  5267  	movups	xmm0, xmmword ptr [rdx + 4*rdi + 32]	# second step: same work on the next 32 bytes
  5268  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  5269  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  5270  	subps	xmm0, xmm2
  5271  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 48]
  5272  	subps	xmm1, xmm2
  5273  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm0
  5274  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm1
  5275  	add	rdi, 16	# 16 elements consumed per trip
  5276  	add	rax, 2	# two vector steps done
  5277  	jne	.LBB0_692	# loop until rax reaches zero
  5278  	jmp	.LBB0_693	# continues at .LBB0_693 (outside this excerpt)
  5279  .LBB0_734:                              # 8-bit multiply (r8[i] = low byte of rdx[i] * rcx[i]): overlap check, then 2x-unrolled SSE loop
  5280  	lea	rsi, [r8 + r10]	# rsi = end of output range (r8 + r10)
  5281  	lea	rax, [rdx + r10]	# rax = end of rdx operand range
  5282  	cmp	rax, r8
  5283  	seta	r9b	# r9b = rdx range ends above output start
  5284  	lea	rax, [rcx + r10]	# rax = end of rcx operand range
  5285  	cmp	rsi, rdx
  5286  	seta	r11b	# r11b = output range ends above rdx start
  5287  	cmp	rax, r8
  5288  	seta	al	# al = rcx range ends above output start
  5289  	cmp	rsi, rcx
  5290  	seta	sil	# sil = output range ends above rcx start
  5291  	xor	edi, edi	# edi = 0 on the overlap exits below
  5292  	test	r9b, r11b	# both set: output overlaps the rdx range
  5293  	jne	.LBB0_743	# overlap -> leave for .LBB0_743 (outside this excerpt)
  5294  # %bb.735:
  5295  	and	al, sil	# both set: output overlaps the rcx range
  5296  	jne	.LBB0_743	# overlap -> leave for .LBB0_743 (outside this excerpt)
  5297  # %bb.736:
  5298  	mov	edi, r10d
  5299  	and	edi, -32	# edi = r10d rounded down to a multiple of 32 (bytes per vector step)
  5300  	lea	rax, [rdi - 32]
  5301  	mov	r9, rax
  5302  	shr	r9, 5
  5303  	add	r9, 1	# r9 = (rdi - 32)/32 + 1 = number of 32-byte vector steps
  5304  	test	rax, rax
  5305  	je	.LBB0_737	# only one step: bypass the unrolled loop (.LBB0_737 is outside this excerpt)
  5306  # %bb.738:
  5307  	mov	rsi, r9
  5308  	and	rsi, -2
  5309  	neg	rsi	# rsi = -(r9 rounded down to even); counts up to zero by 2
  5310  	xor	eax, eax	# rax = byte index
  5311  	movdqa	xmm0, xmmword ptr [rip + .LCPI0_0] # xmm0 = [255,255,255,255,255,255,255,255]
  5312  .LBB0_739:                              # =>This Inner Loop Header: Depth=1
  5313  	movdqu	xmm1, xmmword ptr [rdx + rax]	# first step: 2 x 16 bytes from each operand
  5314  	movdqu	xmm2, xmmword ptr [rdx + rax + 16]
  5315  	movdqu	xmm3, xmmword ptr [rcx + rax]
  5316  	movdqu	xmm4, xmmword ptr [rcx + rax + 16]
  5317  	pmovzxbw	xmm5, xmm1                      # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
  5318  	punpckhbw	xmm1, xmm1              # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5319  	pmovzxbw	xmm6, xmm3                      # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  5320  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5321  	pmullw	xmm3, xmm1	# 16-bit products of the high 8 bytes; only the low byte of each word is kept below
  5322  	pand	xmm3, xmm0	# mask each word to its low byte (0x00ff)
  5323  	pmullw	xmm6, xmm5	# 16-bit products of the low 8 bytes
  5324  	pand	xmm6, xmm0
  5325  	packuswb	xmm6, xmm3	# repack 2 x 8 words into 16 bytes; words are <= 255 after the mask, so no saturation occurs
  5326  	pmovzxbw	xmm1, xmm2                      # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
  5327  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5328  	pmovzxbw	xmm3, xmm4                      # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
  5329  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5330  	pmullw	xmm4, xmm2
  5331  	pand	xmm4, xmm0
  5332  	pmullw	xmm3, xmm1
  5333  	pand	xmm3, xmm0
  5334  	packuswb	xmm3, xmm4
  5335  	movdqu	xmmword ptr [r8 + rax], xmm6	# store 32 product bytes to the output
  5336  	movdqu	xmmword ptr [r8 + rax + 16], xmm3
  5337  	movdqu	xmm1, xmmword ptr [rdx + rax + 32]	# second step: same work on the next 32 bytes
  5338  	movdqu	xmm2, xmmword ptr [rdx + rax + 48]
  5339  	movdqu	xmm3, xmmword ptr [rcx + rax + 32]
  5340  	movdqu	xmm4, xmmword ptr [rcx + rax + 48]
  5341  	pmovzxbw	xmm5, xmm1                      # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
  5342  	punpckhbw	xmm1, xmm1              # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5343  	pmovzxbw	xmm6, xmm3                      # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  5344  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5345  	pmullw	xmm3, xmm1
  5346  	pand	xmm3, xmm0
  5347  	pmullw	xmm6, xmm5
  5348  	pand	xmm6, xmm0
  5349  	packuswb	xmm6, xmm3
  5350  	pmovzxbw	xmm1, xmm2                      # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
  5351  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5352  	pmovzxbw	xmm3, xmm4                      # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
  5353  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5354  	pmullw	xmm4, xmm2
  5355  	pand	xmm4, xmm0
  5356  	pmullw	xmm3, xmm1
  5357  	pand	xmm3, xmm0
  5358  	packuswb	xmm3, xmm4
  5359  	movdqu	xmmword ptr [r8 + rax + 32], xmm6
  5360  	movdqu	xmmword ptr [r8 + rax + 48], xmm3
  5361  	add	rax, 64	# 64 bytes consumed per trip
  5362  	add	rsi, 2	# two vector steps done
  5363  	jne	.LBB0_739	# loop until rsi reaches zero
  5364  	jmp	.LBB0_740	# continues at .LBB0_740 (outside this excerpt)
  5365  .LBB0_884:                              # 8-bit multiply (r8[i] = low byte of rdx[i] * rcx[i]): overlap check, then 2x-unrolled SSE loop
  5366  	lea	rsi, [r8 + r10]	# rsi = end of output range (r8 + r10)
  5367  	lea	rax, [rdx + r10]	# rax = end of rdx operand range
  5368  	cmp	rax, r8
  5369  	seta	r9b	# r9b = rdx range ends above output start
  5370  	lea	rax, [rcx + r10]	# rax = end of rcx operand range
  5371  	cmp	rsi, rdx
  5372  	seta	r11b	# r11b = output range ends above rdx start
  5373  	cmp	rax, r8
  5374  	seta	al	# al = rcx range ends above output start
  5375  	cmp	rsi, rcx
  5376  	seta	sil	# sil = output range ends above rcx start
  5377  	xor	edi, edi	# edi = 0 on the overlap exits below
  5378  	test	r9b, r11b	# both set: output overlaps the rdx range
  5379  	jne	.LBB0_893	# overlap -> leave for .LBB0_893 (outside this excerpt)
  5380  # %bb.885:
  5381  	and	al, sil	# both set: output overlaps the rcx range
  5382  	jne	.LBB0_893	# overlap -> leave for .LBB0_893 (outside this excerpt)
  5383  # %bb.886:
  5384  	mov	edi, r10d
  5385  	and	edi, -32	# edi = r10d rounded down to a multiple of 32 (bytes per vector step)
  5386  	lea	rax, [rdi - 32]
  5387  	mov	r9, rax
  5388  	shr	r9, 5
  5389  	add	r9, 1	# r9 = (rdi - 32)/32 + 1 = number of 32-byte vector steps
  5390  	test	rax, rax
  5391  	je	.LBB0_887	# only one step: bypass the unrolled loop (.LBB0_887 is outside this excerpt)
  5392  # %bb.888:
  5393  	mov	rsi, r9
  5394  	and	rsi, -2
  5395  	neg	rsi	# rsi = -(r9 rounded down to even); counts up to zero by 2
  5396  	xor	eax, eax	# rax = byte index
  5397  	movdqa	xmm0, xmmword ptr [rip + .LCPI0_0] # xmm0 = [255,255,255,255,255,255,255,255]
  5398  .LBB0_889:                              # =>This Inner Loop Header: Depth=1
  5399  	movdqu	xmm1, xmmword ptr [rdx + rax]	# first step: 2 x 16 bytes from each operand
  5400  	movdqu	xmm2, xmmword ptr [rdx + rax + 16]
  5401  	movdqu	xmm3, xmmword ptr [rcx + rax]
  5402  	movdqu	xmm4, xmmword ptr [rcx + rax + 16]
  5403  	pmovzxbw	xmm5, xmm1                      # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
  5404  	punpckhbw	xmm1, xmm1              # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5405  	pmovzxbw	xmm6, xmm3                      # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  5406  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5407  	pmullw	xmm3, xmm1	# 16-bit products of the high 8 bytes; only the low byte of each word is kept below
  5408  	pand	xmm3, xmm0	# mask each word to its low byte (0x00ff)
  5409  	pmullw	xmm6, xmm5	# 16-bit products of the low 8 bytes
  5410  	pand	xmm6, xmm0
  5411  	packuswb	xmm6, xmm3	# repack 2 x 8 words into 16 bytes; words are <= 255 after the mask, so no saturation occurs
  5412  	pmovzxbw	xmm1, xmm2                      # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
  5413  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5414  	pmovzxbw	xmm3, xmm4                      # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
  5415  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5416  	pmullw	xmm4, xmm2
  5417  	pand	xmm4, xmm0
  5418  	pmullw	xmm3, xmm1
  5419  	pand	xmm3, xmm0
  5420  	packuswb	xmm3, xmm4
  5421  	movdqu	xmmword ptr [r8 + rax], xmm6	# store 32 product bytes to the output
  5422  	movdqu	xmmword ptr [r8 + rax + 16], xmm3
  5423  	movdqu	xmm1, xmmword ptr [rdx + rax + 32]	# second step: same work on the next 32 bytes
  5424  	movdqu	xmm2, xmmword ptr [rdx + rax + 48]
  5425  	movdqu	xmm3, xmmword ptr [rcx + rax + 32]
  5426  	movdqu	xmm4, xmmword ptr [rcx + rax + 48]
  5427  	pmovzxbw	xmm5, xmm1                      # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
  5428  	punpckhbw	xmm1, xmm1              # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5429  	pmovzxbw	xmm6, xmm3                      # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  5430  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5431  	pmullw	xmm3, xmm1
  5432  	pand	xmm3, xmm0
  5433  	pmullw	xmm6, xmm5
  5434  	pand	xmm6, xmm0
  5435  	packuswb	xmm6, xmm3
  5436  	pmovzxbw	xmm1, xmm2                      # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
  5437  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5438  	pmovzxbw	xmm3, xmm4                      # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
  5439  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  5440  	pmullw	xmm4, xmm2
  5441  	pand	xmm4, xmm0
  5442  	pmullw	xmm3, xmm1
  5443  	pand	xmm3, xmm0
  5444  	packuswb	xmm3, xmm4
  5445  	movdqu	xmmword ptr [r8 + rax + 32], xmm6
  5446  	movdqu	xmmword ptr [r8 + rax + 48], xmm3
  5447  	add	rax, 64	# 64 bytes consumed per trip
  5448  	add	rsi, 2	# two vector steps done
  5449  	jne	.LBB0_889	# loop until rsi reaches zero
  5450  	jmp	.LBB0_890	# continues at .LBB0_890 (outside this excerpt)
  5451  .LBB0_49:                               # 8-bit integer add (r8[i] = rdx[i] + rcx[i]): overlap check, then 2x-unrolled SSE loop
  5452  	lea	rsi, [r8 + r10]	# rsi = end of output range (r8 + r10)
  5453  	lea	rax, [rdx + r10]	# rax = end of rdx operand range
  5454  	cmp	rax, r8
  5455  	seta	r9b	# r9b = rdx range ends above output start
  5456  	lea	rax, [rcx + r10]	# rax = end of rcx operand range
  5457  	cmp	rsi, rdx
  5458  	seta	r11b	# r11b = output range ends above rdx start
  5459  	cmp	rax, r8
  5460  	seta	al	# al = rcx range ends above output start
  5461  	cmp	rsi, rcx
  5462  	seta	dil	# dil = output range ends above rcx start
  5463  	xor	esi, esi	# esi = 0 on the overlap exits below
  5464  	test	r9b, r11b	# both set: output overlaps the rdx range
  5465  	jne	.LBB0_58	# overlap -> leave for .LBB0_58 (outside this excerpt)
  5466  # %bb.50:
  5467  	and	al, dil	# both set: output overlaps the rcx range
  5468  	jne	.LBB0_58	# overlap -> leave for .LBB0_58 (outside this excerpt)
  5469  # %bb.51:
  5470  	mov	esi, r10d
  5471  	and	esi, -32	# esi = r10d rounded down to a multiple of 32 (bytes per vector step)
  5472  	lea	rax, [rsi - 32]
  5473  	mov	r9, rax
  5474  	shr	r9, 5
  5475  	add	r9, 1	# r9 = (rsi - 32)/32 + 1 = number of 32-byte vector steps
  5476  	test	rax, rax
  5477  	je	.LBB0_52	# only one step: bypass the unrolled loop (.LBB0_52 is outside this excerpt)
  5478  # %bb.53:
  5479  	mov	rax, r9
  5480  	and	rax, -2
  5481  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5482  	xor	edi, edi	# rdi = byte index
  5483  .LBB0_54:                               # =>This Inner Loop Header: Depth=1
  5484  	movdqu	xmm0, xmmword ptr [rdx + rdi]	# first step: 2 x 16 bytes from rdx
  5485  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  5486  	movdqu	xmm2, xmmword ptr [rcx + rdi]
  5487  	paddb	xmm2, xmm0	# xmm2 = rcx bytes + rdx bytes (wrapping)
  5488  	movdqu	xmm0, xmmword ptr [rcx + rdi + 16]
  5489  	paddb	xmm0, xmm1
  5490  	movdqu	xmmword ptr [r8 + rdi], xmm2	# store the sums to the output
  5491  	movdqu	xmmword ptr [r8 + rdi + 16], xmm0
  5492  	movdqu	xmm0, xmmword ptr [rdx + rdi + 32]	# second step: same work on the next 32 bytes
  5493  	movdqu	xmm1, xmmword ptr [rdx + rdi + 48]
  5494  	movdqu	xmm2, xmmword ptr [rcx + rdi + 32]
  5495  	paddb	xmm2, xmm0
  5496  	movdqu	xmm0, xmmword ptr [rcx + rdi + 48]
  5497  	paddb	xmm0, xmm1
  5498  	movdqu	xmmword ptr [r8 + rdi + 32], xmm2
  5499  	movdqu	xmmword ptr [r8 + rdi + 48], xmm0
  5500  	add	rdi, 64	# 64 bytes consumed per trip
  5501  	add	rax, 2	# two vector steps done
  5502  	jne	.LBB0_54	# loop until rax reaches zero
  5503  	jmp	.LBB0_55	# continues at .LBB0_55 (outside this excerpt)
  5504  .LBB0_395:                              # 8-bit integer subtract (r8[i] = rdx[i] - rcx[i]): overlap check, then 2x-unrolled SSE loop
  5505  	lea	rsi, [r8 + r10]	# rsi = end of output range (r8 + r10)
  5506  	lea	rax, [rdx + r10]	# rax = end of rdx operand range
  5507  	cmp	rax, r8
  5508  	seta	r9b	# r9b = rdx range ends above output start
  5509  	lea	rax, [rcx + r10]	# rax = end of rcx operand range
  5510  	cmp	rsi, rdx
  5511  	seta	r11b	# r11b = output range ends above rdx start
  5512  	cmp	rax, r8
  5513  	seta	al	# al = rcx range ends above output start
  5514  	cmp	rsi, rcx
  5515  	seta	dil	# dil = output range ends above rcx start
  5516  	xor	esi, esi	# esi = 0 on the overlap exits below
  5517  	test	r9b, r11b	# both set: output overlaps the rdx range
  5518  	jne	.LBB0_404	# overlap -> leave for .LBB0_404 (outside this excerpt)
  5519  # %bb.396:
  5520  	and	al, dil	# both set: output overlaps the rcx range
  5521  	jne	.LBB0_404	# overlap -> leave for .LBB0_404 (outside this excerpt)
  5522  # %bb.397:
  5523  	mov	esi, r10d
  5524  	and	esi, -32	# esi = r10d rounded down to a multiple of 32 (bytes per vector step)
  5525  	lea	rax, [rsi - 32]
  5526  	mov	r9, rax
  5527  	shr	r9, 5
  5528  	add	r9, 1	# r9 = (rsi - 32)/32 + 1 = number of 32-byte vector steps
  5529  	test	rax, rax
  5530  	je	.LBB0_398	# only one step: bypass the unrolled loop (.LBB0_398 is outside this excerpt)
  5531  # %bb.399:
  5532  	mov	rax, r9
  5533  	and	rax, -2
  5534  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5535  	xor	edi, edi	# rdi = byte index
  5536  .LBB0_400:                              # =>This Inner Loop Header: Depth=1
  5537  	movdqu	xmm0, xmmword ptr [rdx + rdi]	# first step: 2 x 16 bytes from rdx
  5538  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  5539  	movdqu	xmm2, xmmword ptr [rcx + rdi]
  5540  	psubb	xmm0, xmm2	# xmm0 = rdx bytes - rcx bytes (wrapping)
  5541  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
  5542  	psubb	xmm1, xmm2
  5543  	movdqu	xmmword ptr [r8 + rdi], xmm0	# store the differences to the output
  5544  	movdqu	xmmword ptr [r8 + rdi + 16], xmm1
  5545  	movdqu	xmm0, xmmword ptr [rdx + rdi + 32]	# second step: same work on the next 32 bytes
  5546  	movdqu	xmm1, xmmword ptr [rdx + rdi + 48]
  5547  	movdqu	xmm2, xmmword ptr [rcx + rdi + 32]
  5548  	psubb	xmm0, xmm2
  5549  	movdqu	xmm2, xmmword ptr [rcx + rdi + 48]
  5550  	psubb	xmm1, xmm2
  5551  	movdqu	xmmword ptr [r8 + rdi + 32], xmm0
  5552  	movdqu	xmmword ptr [r8 + rdi + 48], xmm1
  5553  	add	rdi, 64	# 64 bytes consumed per trip
  5554  	add	rax, 2	# two vector steps done
  5555  	jne	.LBB0_400	# loop until rax reaches zero
  5556  	jmp	.LBB0_401	# continues at .LBB0_401 (outside this excerpt)
  5557  .LBB0_222:                              # 8-bit integer add (r8[i] = rdx[i] + rcx[i]): overlap check, then 2x-unrolled SSE loop
  5558  	lea	rsi, [r8 + r10]	# rsi = end of output range (r8 + r10)
  5559  	lea	rax, [rdx + r10]	# rax = end of rdx operand range
  5560  	cmp	rax, r8
  5561  	seta	r9b	# r9b = rdx range ends above output start
  5562  	lea	rax, [rcx + r10]	# rax = end of rcx operand range
  5563  	cmp	rsi, rdx
  5564  	seta	r11b	# r11b = output range ends above rdx start
  5565  	cmp	rax, r8
  5566  	seta	al	# al = rcx range ends above output start
  5567  	cmp	rsi, rcx
  5568  	seta	dil	# dil = output range ends above rcx start
  5569  	xor	esi, esi	# esi = 0 on the overlap exits below
  5570  	test	r9b, r11b	# both set: output overlaps the rdx range
  5571  	jne	.LBB0_231	# overlap -> leave for .LBB0_231 (outside this excerpt)
  5572  # %bb.223:
  5573  	and	al, dil	# both set: output overlaps the rcx range
  5574  	jne	.LBB0_231	# overlap -> leave for .LBB0_231 (outside this excerpt)
  5575  # %bb.224:
  5576  	mov	esi, r10d
  5577  	and	esi, -32	# esi = r10d rounded down to a multiple of 32 (bytes per vector step)
  5578  	lea	rax, [rsi - 32]
  5579  	mov	r9, rax
  5580  	shr	r9, 5
  5581  	add	r9, 1	# r9 = (rsi - 32)/32 + 1 = number of 32-byte vector steps
  5582  	test	rax, rax
  5583  	je	.LBB0_225	# only one step: bypass the unrolled loop (.LBB0_225 is outside this excerpt)
  5584  # %bb.226:
  5585  	mov	rax, r9
  5586  	and	rax, -2
  5587  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5588  	xor	edi, edi	# rdi = byte index
  5589  .LBB0_227:                              # =>This Inner Loop Header: Depth=1
  5590  	movdqu	xmm0, xmmword ptr [rdx + rdi]	# first step: 2 x 16 bytes from rdx
  5591  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  5592  	movdqu	xmm2, xmmword ptr [rcx + rdi]
  5593  	paddb	xmm2, xmm0	# xmm2 = rcx bytes + rdx bytes (wrapping)
  5594  	movdqu	xmm0, xmmword ptr [rcx + rdi + 16]
  5595  	paddb	xmm0, xmm1
  5596  	movdqu	xmmword ptr [r8 + rdi], xmm2	# store the sums to the output
  5597  	movdqu	xmmword ptr [r8 + rdi + 16], xmm0
  5598  	movdqu	xmm0, xmmword ptr [rdx + rdi + 32]	# second step: same work on the next 32 bytes
  5599  	movdqu	xmm1, xmmword ptr [rdx + rdi + 48]
  5600  	movdqu	xmm2, xmmword ptr [rcx + rdi + 32]
  5601  	paddb	xmm2, xmm0
  5602  	movdqu	xmm0, xmmword ptr [rcx + rdi + 48]
  5603  	paddb	xmm0, xmm1
  5604  	movdqu	xmmword ptr [r8 + rdi + 32], xmm2
  5605  	movdqu	xmmword ptr [r8 + rdi + 48], xmm0
  5606  	add	rdi, 64	# 64 bytes consumed per trip
  5607  	add	rax, 2	# two vector steps done
  5608  	jne	.LBB0_227	# loop until rax reaches zero
  5609  	jmp	.LBB0_228	# continues at .LBB0_228 (outside this excerpt)
  5610  .LBB0_568:                              # 8-bit integer subtract (r8[i] = rdx[i] - rcx[i]): overlap check, then 2x-unrolled SSE loop
  5611  	lea	rsi, [r8 + r10]	# rsi = end of output range (r8 + r10)
  5612  	lea	rax, [rdx + r10]	# rax = end of rdx operand range
  5613  	cmp	rax, r8
  5614  	seta	r9b	# r9b = rdx range ends above output start
  5615  	lea	rax, [rcx + r10]	# rax = end of rcx operand range
  5616  	cmp	rsi, rdx
  5617  	seta	r11b	# r11b = output range ends above rdx start
  5618  	cmp	rax, r8
  5619  	seta	al	# al = rcx range ends above output start
  5620  	cmp	rsi, rcx
  5621  	seta	dil	# dil = output range ends above rcx start
  5622  	xor	esi, esi	# esi = 0 on the overlap exits below
  5623  	test	r9b, r11b	# both set: output overlaps the rdx range
  5624  	jne	.LBB0_577	# overlap -> leave for .LBB0_577 (outside this excerpt)
  5625  # %bb.569:
  5626  	and	al, dil	# both set: output overlaps the rcx range
  5627  	jne	.LBB0_577	# overlap -> leave for .LBB0_577 (outside this excerpt)
  5628  # %bb.570:
  5629  	mov	esi, r10d
  5630  	and	esi, -32	# esi = r10d rounded down to a multiple of 32 (bytes per vector step)
  5631  	lea	rax, [rsi - 32]
  5632  	mov	r9, rax
  5633  	shr	r9, 5
  5634  	add	r9, 1	# r9 = (rsi - 32)/32 + 1 = number of 32-byte vector steps
  5635  	test	rax, rax
  5636  	je	.LBB0_571	# only one step: bypass the unrolled loop (.LBB0_571 is outside this excerpt)
  5637  # %bb.572:
  5638  	mov	rax, r9
  5639  	and	rax, -2
  5640  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5641  	xor	edi, edi	# rdi = byte index
  5642  .LBB0_573:                              # =>This Inner Loop Header: Depth=1
  5643  	movdqu	xmm0, xmmword ptr [rdx + rdi]	# first step: 2 x 16 bytes from rdx
  5644  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  5645  	movdqu	xmm2, xmmword ptr [rcx + rdi]
  5646  	psubb	xmm0, xmm2	# xmm0 = rdx bytes - rcx bytes (wrapping)
  5647  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
  5648  	psubb	xmm1, xmm2
  5649  	movdqu	xmmword ptr [r8 + rdi], xmm0	# store the differences to the output
  5650  	movdqu	xmmword ptr [r8 + rdi + 16], xmm1
  5651  	movdqu	xmm0, xmmword ptr [rdx + rdi + 32]	# second step: same work on the next 32 bytes
  5652  	movdqu	xmm1, xmmword ptr [rdx + rdi + 48]
  5653  	movdqu	xmm2, xmmword ptr [rcx + rdi + 32]
  5654  	psubb	xmm0, xmm2
  5655  	movdqu	xmm2, xmmword ptr [rcx + rdi + 48]
  5656  	psubb	xmm1, xmm2
  5657  	movdqu	xmmword ptr [r8 + rdi + 32], xmm0
  5658  	movdqu	xmmword ptr [r8 + rdi + 48], xmm1
  5659  	add	rdi, 64	# 64 bytes consumed per trip
  5660  	add	rax, 2	# two vector steps done
  5661  	jne	.LBB0_573	# loop until rax reaches zero
  5662  	jmp	.LBB0_574	# continues at .LBB0_574 (outside this excerpt)
  5663  .LBB0_808:                              # 32-bit integer multiply (r8[i] = rdx[i] * rcx[i]): overlap check, then 2x-unrolled SSE4.1 loop
  5664  	lea	rsi, [r8 + 4*r10]	# rsi = end of output range (r8 + 4*r10)
  5665  	lea	rax, [rdx + 4*r10]	# rax = end of rdx operand range
  5666  	cmp	rax, r8
  5667  	seta	r9b	# r9b = rdx range ends above output start
  5668  	lea	rax, [rcx + 4*r10]	# rax = end of rcx operand range
  5669  	cmp	rsi, rdx
  5670  	seta	r11b	# r11b = output range ends above rdx start
  5671  	cmp	rax, r8
  5672  	seta	al	# al = rcx range ends above output start
  5673  	cmp	rsi, rcx
  5674  	seta	dil	# dil = output range ends above rcx start
  5675  	xor	esi, esi	# esi = 0 on the overlap exits below
  5676  	test	r9b, r11b	# both set: output overlaps the rdx range
  5677  	jne	.LBB0_817	# overlap -> leave for .LBB0_817 (outside this excerpt)
  5678  # %bb.809:
  5679  	and	al, dil	# both set: output overlaps the rcx range
  5680  	jne	.LBB0_817	# overlap -> leave for .LBB0_817 (outside this excerpt)
  5681  # %bb.810:
  5682  	mov	esi, r10d
  5683  	and	esi, -8	# esi = r10d rounded down to a multiple of 8 (4-byte lanes per vector step)
  5684  	lea	rax, [rsi - 8]
  5685  	mov	r9, rax
  5686  	shr	r9, 3
  5687  	add	r9, 1	# r9 = (rsi - 8)/8 + 1 = number of 8-element vector steps
  5688  	test	rax, rax
  5689  	je	.LBB0_811	# only one step: bypass the unrolled loop (.LBB0_811 is outside this excerpt)
  5690  # %bb.812:
  5691  	mov	rax, r9
  5692  	and	rax, -2
  5693  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5694  	xor	edi, edi	# rdi = element index
  5695  .LBB0_813:                              # =>This Inner Loop Header: Depth=1
  5696  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]	# first step: 2 x 16 bytes from rdx
  5697  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  5698  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  5699  	pmulld	xmm2, xmm0	# xmm2 = low 32 bits of each rcx lane * rdx lane
  5700  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  5701  	pmulld	xmm0, xmm1
  5702  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2	# store the products to the output
  5703  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  5704  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]	# second step: same work on the next 32 bytes
  5705  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  5706  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  5707  	pmulld	xmm2, xmm0
  5708  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 48]
  5709  	pmulld	xmm0, xmm1
  5710  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm2
  5711  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm0
  5712  	add	rdi, 16	# 16 elements consumed per trip
  5713  	add	rax, 2	# two vector steps done
  5714  	jne	.LBB0_813	# loop until rax reaches zero
  5715  	jmp	.LBB0_814	# continues at .LBB0_814 (outside this excerpt)
  5716  .LBB0_958:                              # 32-bit integer multiply (r8[i] = rdx[i] * rcx[i]): overlap check, then 2x-unrolled SSE4.1 loop
  5717  	lea	rsi, [r8 + 4*r10]	# rsi = end of output range (r8 + 4*r10)
  5718  	lea	rax, [rdx + 4*r10]	# rax = end of rdx operand range
  5719  	cmp	rax, r8
  5720  	seta	r9b	# r9b = rdx range ends above output start
  5721  	lea	rax, [rcx + 4*r10]	# rax = end of rcx operand range
  5722  	cmp	rsi, rdx
  5723  	seta	r11b	# r11b = output range ends above rdx start
  5724  	cmp	rax, r8
  5725  	seta	al	# al = rcx range ends above output start
  5726  	cmp	rsi, rcx
  5727  	seta	dil	# dil = output range ends above rcx start
  5728  	xor	esi, esi	# esi = 0 on the overlap exits below
  5729  	test	r9b, r11b	# both set: output overlaps the rdx range
  5730  	jne	.LBB0_967	# overlap -> leave for .LBB0_967 (outside this excerpt)
  5731  # %bb.959:
  5732  	and	al, dil	# both set: output overlaps the rcx range
  5733  	jne	.LBB0_967	# overlap -> leave for .LBB0_967 (outside this excerpt)
  5734  # %bb.960:
  5735  	mov	esi, r10d
  5736  	and	esi, -8	# esi = r10d rounded down to a multiple of 8 (4-byte lanes per vector step)
  5737  	lea	rax, [rsi - 8]
  5738  	mov	r9, rax
  5739  	shr	r9, 3
  5740  	add	r9, 1	# r9 = (rsi - 8)/8 + 1 = number of 8-element vector steps
  5741  	test	rax, rax
  5742  	je	.LBB0_961	# only one step: bypass the unrolled loop (.LBB0_961 is outside this excerpt)
  5743  # %bb.962:
  5744  	mov	rax, r9
  5745  	and	rax, -2
  5746  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5747  	xor	edi, edi	# rdi = element index
  5748  .LBB0_963:                              # =>This Inner Loop Header: Depth=1
  5749  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]	# first step: 2 x 16 bytes from rdx
  5750  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  5751  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  5752  	pmulld	xmm2, xmm0	# xmm2 = low 32 bits of each rcx lane * rdx lane
  5753  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  5754  	pmulld	xmm0, xmm1
  5755  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2	# store the products to the output
  5756  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  5757  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]	# second step: same work on the next 32 bytes
  5758  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  5759  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  5760  	pmulld	xmm2, xmm0
  5761  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 48]
  5762  	pmulld	xmm0, xmm1
  5763  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm2
  5764  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm0
  5765  	add	rdi, 16	# 16 elements consumed per trip
  5766  	add	rax, 2	# two vector steps done
  5767  	jne	.LBB0_963	# loop until rax reaches zero
  5768  	jmp	.LBB0_964	# continues at .LBB0_964 (outside this excerpt)
  5769  .LBB0_123:                              # 32-bit integer add (r8[i] = rdx[i] + rcx[i]): overlap check, then 2x-unrolled SSE loop
  5770  	lea	rsi, [r8 + 4*r10]	# rsi = end of output range (r8 + 4*r10)
  5771  	lea	rax, [rdx + 4*r10]	# rax = end of rdx operand range
  5772  	cmp	rax, r8
  5773  	seta	r9b	# r9b = rdx range ends above output start
  5774  	lea	rax, [rcx + 4*r10]	# rax = end of rcx operand range
  5775  	cmp	rsi, rdx
  5776  	seta	r11b	# r11b = output range ends above rdx start
  5777  	cmp	rax, r8
  5778  	seta	al	# al = rcx range ends above output start
  5779  	cmp	rsi, rcx
  5780  	seta	dil	# dil = output range ends above rcx start
  5781  	xor	esi, esi	# esi = 0 on the overlap exits below
  5782  	test	r9b, r11b	# both set: output overlaps the rdx range
  5783  	jne	.LBB0_132	# overlap -> leave for .LBB0_132 (outside this excerpt)
  5784  # %bb.124:
  5785  	and	al, dil	# both set: output overlaps the rcx range
  5786  	jne	.LBB0_132	# overlap -> leave for .LBB0_132 (outside this excerpt)
  5787  # %bb.125:
  5788  	mov	esi, r10d
  5789  	and	esi, -8	# esi = r10d rounded down to a multiple of 8 (4-byte lanes per vector step)
  5790  	lea	rax, [rsi - 8]
  5791  	mov	r9, rax
  5792  	shr	r9, 3
  5793  	add	r9, 1	# r9 = (rsi - 8)/8 + 1 = number of 8-element vector steps
  5794  	test	rax, rax
  5795  	je	.LBB0_126	# only one step: bypass the unrolled loop (.LBB0_126 is outside this excerpt)
  5796  # %bb.127:
  5797  	mov	rax, r9
  5798  	and	rax, -2
  5799  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5800  	xor	edi, edi	# rdi = element index
  5801  .LBB0_128:                              # =>This Inner Loop Header: Depth=1
  5802  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]	# first step: 2 x 16 bytes from rdx
  5803  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  5804  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  5805  	paddd	xmm2, xmm0	# xmm2 = rcx lanes + rdx lanes (32-bit, wrapping)
  5806  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  5807  	paddd	xmm0, xmm1
  5808  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2	# store the sums to the output
  5809  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  5810  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]	# second step: same work on the next 32 bytes
  5811  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  5812  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  5813  	paddd	xmm2, xmm0
  5814  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 48]
  5815  	paddd	xmm0, xmm1
  5816  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm2
  5817  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm0
  5818  	add	rdi, 16	# 16 elements consumed per trip
  5819  	add	rax, 2	# two vector steps done
  5820  	jne	.LBB0_128	# loop until rax reaches zero
  5821  	jmp	.LBB0_129	# continues at .LBB0_129 (outside this excerpt)
  5822  .LBB0_469:                              # 32-bit integer subtract (r8[i] = rdx[i] - rcx[i]): overlap check, then 2x-unrolled SSE loop
  5823  	lea	rsi, [r8 + 4*r10]	# rsi = end of output range (r8 + 4*r10)
  5824  	lea	rax, [rdx + 4*r10]	# rax = end of rdx operand range
  5825  	cmp	rax, r8
  5826  	seta	r9b	# r9b = rdx range ends above output start
  5827  	lea	rax, [rcx + 4*r10]	# rax = end of rcx operand range
  5828  	cmp	rsi, rdx
  5829  	seta	r11b	# r11b = output range ends above rdx start
  5830  	cmp	rax, r8
  5831  	seta	al	# al = rcx range ends above output start
  5832  	cmp	rsi, rcx
  5833  	seta	dil	# dil = output range ends above rcx start
  5834  	xor	esi, esi	# esi = 0 on the overlap exits below
  5835  	test	r9b, r11b	# both set: output overlaps the rdx range
  5836  	jne	.LBB0_478	# overlap -> leave for .LBB0_478 (outside this excerpt)
  5837  # %bb.470:
  5838  	and	al, dil	# both set: output overlaps the rcx range
  5839  	jne	.LBB0_478	# overlap -> leave for .LBB0_478 (outside this excerpt)
  5840  # %bb.471:
  5841  	mov	esi, r10d
  5842  	and	esi, -8	# esi = r10d rounded down to a multiple of 8 (4-byte lanes per vector step)
  5843  	lea	rax, [rsi - 8]
  5844  	mov	r9, rax
  5845  	shr	r9, 3
  5846  	add	r9, 1	# r9 = (rsi - 8)/8 + 1 = number of 8-element vector steps
  5847  	test	rax, rax
  5848  	je	.LBB0_472	# only one step: bypass the unrolled loop (.LBB0_472 is outside this excerpt)
  5849  # %bb.473:
  5850  	mov	rax, r9
  5851  	and	rax, -2
  5852  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5853  	xor	edi, edi	# rdi = element index
  5854  .LBB0_474:                              # =>This Inner Loop Header: Depth=1
  5855  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]	# first step: 2 x 16 bytes from rdx
  5856  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  5857  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  5858  	psubd	xmm0, xmm2	# xmm0 = rdx lanes - rcx lanes (32-bit, wrapping)
  5859  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
  5860  	psubd	xmm1, xmm2
  5861  	movdqu	xmmword ptr [r8 + 4*rdi], xmm0	# store the differences to the output
  5862  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm1
  5863  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]	# second step: same work on the next 32 bytes
  5864  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  5865  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  5866  	psubd	xmm0, xmm2
  5867  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
  5868  	psubd	xmm1, xmm2
  5869  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm0
  5870  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm1
  5871  	add	rdi, 16	# 16 elements consumed per trip
  5872  	add	rax, 2	# two vector steps done
  5873  	jne	.LBB0_474	# loop until rax reaches zero
  5874  	jmp	.LBB0_475	# continues at .LBB0_475 (outside this excerpt)
  5875  .LBB0_296:                              # 32-bit integer add (r8[i] = rdx[i] + rcx[i]): overlap check, then 2x-unrolled SSE loop
  5876  	lea	rsi, [r8 + 4*r10]	# rsi = end of output range (r8 + 4*r10)
  5877  	lea	rax, [rdx + 4*r10]	# rax = end of rdx operand range
  5878  	cmp	rax, r8
  5879  	seta	r9b	# r9b = rdx range ends above output start
  5880  	lea	rax, [rcx + 4*r10]	# rax = end of rcx operand range
  5881  	cmp	rsi, rdx
  5882  	seta	r11b	# r11b = output range ends above rdx start
  5883  	cmp	rax, r8
  5884  	seta	al	# al = rcx range ends above output start
  5885  	cmp	rsi, rcx
  5886  	seta	dil	# dil = output range ends above rcx start
  5887  	xor	esi, esi	# esi = 0 on the overlap exits below
  5888  	test	r9b, r11b	# both set: output overlaps the rdx range
  5889  	jne	.LBB0_305	# overlap -> leave for .LBB0_305 (outside this excerpt)
  5890  # %bb.297:
  5891  	and	al, dil	# both set: output overlaps the rcx range
  5892  	jne	.LBB0_305	# overlap -> leave for .LBB0_305 (outside this excerpt)
  5893  # %bb.298:
  5894  	mov	esi, r10d
  5895  	and	esi, -8	# esi = r10d rounded down to a multiple of 8 (4-byte lanes per vector step)
  5896  	lea	rax, [rsi - 8]
  5897  	mov	r9, rax
  5898  	shr	r9, 3
  5899  	add	r9, 1	# r9 = (rsi - 8)/8 + 1 = number of 8-element vector steps
  5900  	test	rax, rax
  5901  	je	.LBB0_299	# only one step: bypass the unrolled loop (.LBB0_299 is outside this excerpt)
  5902  # %bb.300:
  5903  	mov	rax, r9
  5904  	and	rax, -2
  5905  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5906  	xor	edi, edi	# rdi = element index
  5907  .LBB0_301:                              # =>This Inner Loop Header: Depth=1
  5908  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]	# first step: 2 x 16 bytes from rdx
  5909  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  5910  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  5911  	paddd	xmm2, xmm0	# xmm2 = rcx lanes + rdx lanes (32-bit, wrapping)
  5912  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  5913  	paddd	xmm0, xmm1
  5914  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2	# store the sums to the output
  5915  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  5916  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]	# second step: same work on the next 32 bytes
  5917  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  5918  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  5919  	paddd	xmm2, xmm0
  5920  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 48]
  5921  	paddd	xmm0, xmm1
  5922  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm2
  5923  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm0
  5924  	add	rdi, 16	# 16 elements consumed per trip
  5925  	add	rax, 2	# two vector steps done
  5926  	jne	.LBB0_301	# loop until rax reaches zero
  5927  	jmp	.LBB0_302	# continues at .LBB0_302 (outside this excerpt)
  5928  .LBB0_642:                              # 32-bit integer subtract (r8[i] = rdx[i] - rcx[i]): overlap check, then 2x-unrolled SSE loop
  5929  	lea	rsi, [r8 + 4*r10]	# rsi = end of output range (r8 + 4*r10)
  5930  	lea	rax, [rdx + 4*r10]	# rax = end of rdx operand range
  5931  	cmp	rax, r8
  5932  	seta	r9b	# r9b = rdx range ends above output start
  5933  	lea	rax, [rcx + 4*r10]	# rax = end of rcx operand range
  5934  	cmp	rsi, rdx
  5935  	seta	r11b	# r11b = output range ends above rdx start
  5936  	cmp	rax, r8
  5937  	seta	al	# al = rcx range ends above output start
  5938  	cmp	rsi, rcx
  5939  	seta	dil	# dil = output range ends above rcx start
  5940  	xor	esi, esi	# esi = 0 on the overlap exits below
  5941  	test	r9b, r11b	# both set: output overlaps the rdx range
  5942  	jne	.LBB0_651	# overlap -> leave for .LBB0_651 (outside this excerpt)
  5943  # %bb.643:
  5944  	and	al, dil	# both set: output overlaps the rcx range
  5945  	jne	.LBB0_651	# overlap -> leave for .LBB0_651 (outside this excerpt)
  5946  # %bb.644:
  5947  	mov	esi, r10d
  5948  	and	esi, -8	# esi = r10d rounded down to a multiple of 8 (4-byte lanes per vector step)
  5949  	lea	rax, [rsi - 8]
  5950  	mov	r9, rax
  5951  	shr	r9, 3
  5952  	add	r9, 1	# r9 = (rsi - 8)/8 + 1 = number of 8-element vector steps
  5953  	test	rax, rax
  5954  	je	.LBB0_645	# only one step: bypass the unrolled loop (.LBB0_645 is outside this excerpt)
  5955  # %bb.646:
  5956  	mov	rax, r9
  5957  	and	rax, -2
  5958  	neg	rax	# rax = -(r9 rounded down to even); counts up to zero by 2
  5959  	xor	edi, edi	# rdi = element index
  5960  .LBB0_647:                              # =>This Inner Loop Header: Depth=1
  5961  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]	# first step: 2 x 16 bytes from rdx
  5962  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  5963  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  5964  	psubd	xmm0, xmm2	# xmm0 = rdx lanes - rcx lanes (32-bit, wrapping)
  5965  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
  5966  	psubd	xmm1, xmm2
  5967  	movdqu	xmmword ptr [r8 + 4*rdi], xmm0	# store the differences to the output
  5968  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm1
  5969  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]	# second step: same work on the next 32 bytes
  5970  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
  5971  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 32]
  5972  	psubd	xmm0, xmm2
  5973  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
  5974  	psubd	xmm1, xmm2
  5975  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm0
  5976  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm1
  5977  	add	rdi, 16	# 16 elements consumed per trip
  5978  	add	rax, 2	# two vector steps done
  5979  	jne	.LBB0_647	# loop until rax reaches zero
  5980  	jmp	.LBB0_648	# continues at .LBB0_648 (outside this excerpt)
  5981  .LBB0_795:                              # entry that starts the extra 32-bit multiply step at element index 0
  5982  	xor	edi, edi	# rdi = 0
  5983  .LBB0_798:                              # optional single 8-element step (no unrolling), taken only when bit 0 of r9 is set
  5984  	test	r9b, 1	# r9 is set before this excerpt; NOTE(review): presumably the vector step count, so an odd count leaves one step - confirm
  5985  	je	.LBB0_800	# bit clear: nothing more to do in vector form
  5986  # %bb.799:
  5987  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  5988  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  5989  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  5990  	pmulld	xmm2, xmm0	# low 32 bits of each rcx lane * rdx lane
  5991  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  5992  	pmulld	xmm0, xmm1
  5993  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2	# store 8 products to the output
  5994  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  5995  .LBB0_800:
  5996  	cmp	rsi, r10	# NOTE(review): presumably elements covered by vector code (rsi) vs. total count (r10) - confirm
  5997  	jne	.LBB0_801	# not equal -> .LBB0_801 (defined near the top of the function) continues from index rsi
  5998  	jmp	.LBB0_1013	# equal -> .LBB0_1013 (outside this excerpt)
  5999  .LBB0_945:                              # entry that starts the extra 32-bit multiply step at element index 0
  6000  	xor	edi, edi	# rdi = 0
  6001  .LBB0_948:                              # optional single 8-element step (no unrolling), taken only when bit 0 of r9 is set
  6002  	test	r9b, 1	# r9 is set before this excerpt; NOTE(review): presumably the vector step count, so an odd count leaves one step - confirm
  6003  	je	.LBB0_950	# bit clear: nothing more to do in vector form
  6004  # %bb.949:
  6005  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  6006  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6007  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  6008  	pmulld	xmm2, xmm0	# low 32 bits of each rcx lane * rdx lane
  6009  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  6010  	pmulld	xmm0, xmm1
  6011  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2	# store 8 products to the output
  6012  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  6013  .LBB0_950:
  6014  	cmp	rsi, r10	# NOTE(review): presumably elements covered by vector code (rsi) vs. total count (r10) - confirm
  6015  	jne	.LBB0_951	# not equal -> .LBB0_951 (outside this excerpt)
  6016  	jmp	.LBB0_1013	# equal -> .LBB0_1013 (outside this excerpt)
  6017  .LBB0_110:                              # entry that starts the extra 32-bit add step at element index 0
  6018  	xor	edi, edi	# rdi = 0
  6019  .LBB0_113:                              # optional single 8-element step (no unrolling), taken only when bit 0 of r9 is set
  6020  	test	r9b, 1	# r9 is set before this excerpt; NOTE(review): presumably the vector step count, so an odd count leaves one step - confirm
  6021  	je	.LBB0_115	# bit clear: nothing more to do in vector form
  6022  # %bb.114:
  6023  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  6024  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6025  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  6026  	paddd	xmm2, xmm0	# rcx lanes + rdx lanes (32-bit, wrapping)
  6027  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  6028  	paddd	xmm0, xmm1
  6029  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2	# store 8 sums to the output
  6030  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  6031  .LBB0_115:
  6032  	cmp	rsi, r10	# NOTE(review): presumably elements covered by vector code (rsi) vs. total count (r10) - confirm
  6033  	je	.LBB0_1013	# equal -> .LBB0_1013 (outside this excerpt)
  6034  	jmp	.LBB0_116	# not equal -> .LBB0_116 (outside this excerpt)
  6035  .LBB0_456:	# entry that resets the element index, then falls into .LBB0_459
  6036  	xor	edi, edi	# rdi = 0
  6037  .LBB0_459:	# optional single vector step: 32-bit integer subtract, 8 elements
  6038  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6039  	je	.LBB0_461	# bit 0 clear: skip the step
  6040  # %bb.460:
  6041  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]	# rdx operand at element rdi
  6042  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6043  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]	# rcx operand, same elements
  6044  	psubd	xmm0, xmm2	# xmm0 = rdx - rcx per 32-bit lane
  6045  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
  6046  	psubd	xmm1, xmm2	# same for the next 16 bytes
  6047  	movdqu	xmmword ptr [r8 + 4*rdi], xmm0	# results go to the r8 buffer
  6048  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm1
  6049  .LBB0_461:
  6050  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6051  	jne	.LBB0_462	# differ: continue at .LBB0_462 (presumably leftover elements; confirm)
  6052  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6053  .LBB0_283:	# entry that resets the element index, then falls into .LBB0_286
  6054  	xor	edi, edi	# rdi = 0
  6055  .LBB0_286:	# optional single vector step: 32-bit integer add, 8 elements
  6056  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6057  	je	.LBB0_288	# bit 0 clear: skip the step
  6058  # %bb.287:
  6059  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]	# rdx operand at element rdi
  6060  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6061  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]	# rcx operand, same elements
  6062  	paddd	xmm2, xmm0	# xmm2 = rcx + rdx per 32-bit lane
  6063  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  6064  	paddd	xmm0, xmm1	# same for the next 16 bytes
  6065  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2	# results go to the r8 buffer
  6066  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  6067  .LBB0_288:
  6068  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6069  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6070  	jmp	.LBB0_289	# differ: continue at .LBB0_289 (presumably leftover elements; confirm)
  6071  .LBB0_629:	# entry that resets the element index, then falls into .LBB0_632
  6072  	xor	edi, edi	# rdi = 0
  6073  .LBB0_632:	# optional single vector step: 32-bit integer subtract, 8 elements
  6074  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6075  	je	.LBB0_634	# bit 0 clear: skip the step
  6076  # %bb.633:
  6077  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]	# rdx operand at element rdi
  6078  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6079  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]	# rcx operand, same elements
  6080  	psubd	xmm0, xmm2	# xmm0 = rdx - rcx per 32-bit lane
  6081  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
  6082  	psubd	xmm1, xmm2	# same for the next 16 bytes
  6083  	movdqu	xmmword ptr [r8 + 4*rdi], xmm0	# results go to the r8 buffer
  6084  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm1
  6085  .LBB0_634:
  6086  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6087  	jne	.LBB0_635	# differ: continue at .LBB0_635 (presumably leftover elements; confirm)
  6088  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6089  .LBB0_853:	# entry that resets the element index, then falls into .LBB0_856
  6090  	xor	edi, edi	# rdi = 0
  6091  .LBB0_856:	# optional single vector step: double-precision multiply, 4 elements
  6092  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6093  	je	.LBB0_858	# bit 0 clear: skip the step
  6094  # %bb.857:
  6095  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6096  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6097  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6098  	mulpd	xmm2, xmm0	# xmm2 = rcx * rdx per double lane
  6099  	movupd	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  6100  	mulpd	xmm0, xmm1	# same for the next 16 bytes
  6101  	movupd	xmmword ptr [r8 + 8*rdi], xmm2	# results go to the r8 buffer
  6102  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm0
  6103  .LBB0_858:
  6104  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6105  	jne	.LBB0_859	# differ: continue at .LBB0_859 (presumably leftover elements; confirm)
  6106  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6107  .LBB0_1003:	# entry that resets the element index, then falls into .LBB0_1006
  6108  	xor	edi, edi	# rdi = 0
  6109  .LBB0_1006:	# optional single vector step: double-precision multiply, 4 elements
  6110  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6111  	je	.LBB0_1008	# bit 0 clear: skip the step
  6112  # %bb.1007:
  6113  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6114  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6115  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6116  	mulpd	xmm2, xmm0	# xmm2 = rcx * rdx per double lane
  6117  	movupd	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  6118  	mulpd	xmm0, xmm1	# same for the next 16 bytes
  6119  	movupd	xmmword ptr [r8 + 8*rdi], xmm2	# results go to the r8 buffer
  6120  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm0
  6121  .LBB0_1008:
  6122  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6123  	jne	.LBB0_1009	# differ: continue at .LBB0_1009 (presumably leftover elements; confirm)
  6124  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6125  .LBB0_184:	# entry that resets the element index, then falls into .LBB0_187
  6126  	xor	edi, edi	# rdi = 0
  6127  .LBB0_187:	# optional single vector step: double-precision add, 4 elements
  6128  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6129  	je	.LBB0_189	# bit 0 clear: skip the step
  6130  # %bb.188:
  6131  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6132  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6133  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6134  	addpd	xmm2, xmm0	# xmm2 = rcx + rdx per double lane
  6135  	movupd	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  6136  	addpd	xmm0, xmm1	# same for the next 16 bytes
  6137  	movupd	xmmword ptr [r8 + 8*rdi], xmm2	# results go to the r8 buffer
  6138  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm0
  6139  .LBB0_189:
  6140  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6141  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6142  	jmp	.LBB0_190	# differ: continue at .LBB0_190 (presumably leftover elements; confirm)
  6143  .LBB0_530:	# entry that resets the element index, then falls into .LBB0_533
  6144  	xor	edi, edi	# rdi = 0
  6145  .LBB0_533:	# optional single vector step: double-precision subtract, 4 elements
  6146  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6147  	je	.LBB0_535	# bit 0 clear: skip the step
  6148  # %bb.534:
  6149  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6150  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6151  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6152  	subpd	xmm0, xmm2	# xmm0 = rdx - rcx per double lane
  6153  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 16]
  6154  	subpd	xmm1, xmm2	# same for the next 16 bytes
  6155  	movupd	xmmword ptr [r8 + 8*rdi], xmm0	# results go to the r8 buffer
  6156  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm1
  6157  .LBB0_535:
  6158  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6159  	jne	.LBB0_536	# differ: continue at .LBB0_536 (presumably leftover elements; confirm)
  6160  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6161  .LBB0_357:	# entry that resets the element index, then falls into .LBB0_360
  6162  	xor	edi, edi	# rdi = 0
  6163  .LBB0_360:	# optional single vector step: double-precision add, 4 elements
  6164  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6165  	je	.LBB0_362	# bit 0 clear: skip the step
  6166  # %bb.361:
  6167  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6168  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6169  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6170  	addpd	xmm2, xmm0	# xmm2 = rcx + rdx per double lane
  6171  	movupd	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  6172  	addpd	xmm0, xmm1	# same for the next 16 bytes
  6173  	movupd	xmmword ptr [r8 + 8*rdi], xmm2	# results go to the r8 buffer
  6174  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm0
  6175  .LBB0_362:
  6176  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6177  	jne	.LBB0_363	# differ: continue at .LBB0_363 (presumably leftover elements; confirm)
  6178  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6179  .LBB0_703:	# entry that resets the element index, then falls into .LBB0_706
  6180  	xor	edi, edi	# rdi = 0
  6181  .LBB0_706:	# optional single vector step: double-precision subtract, 4 elements
  6182  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6183  	je	.LBB0_708	# bit 0 clear: skip the step
  6184  # %bb.707:
  6185  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6186  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6187  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6188  	subpd	xmm0, xmm2	# xmm0 = rdx - rcx per double lane
  6189  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 16]
  6190  	subpd	xmm1, xmm2	# same for the next 16 bytes
  6191  	movupd	xmmword ptr [r8 + 8*rdi], xmm0	# results go to the r8 buffer
  6192  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm1
  6193  .LBB0_708:
  6194  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6195  	jne	.LBB0_709	# differ: continue at .LBB0_709 (presumably leftover elements; confirm)
  6196  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6197  .LBB0_750:	# entry that resets the byte offset, then falls into .LBB0_753
  6198  	xor	eax, eax	# rax = 0
  6199  .LBB0_753:	# optional single vector step: 8-bit multiply of 32 elements, done in 16-bit lanes
  6200  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6201  	je	.LBB0_755	# bit 0 clear: skip the step
  6202  # %bb.754:
  6203  	movdqu	xmm1, xmmword ptr [rdx + rax]	# rdx operand, bytes rax..rax+15
  6204  	movdqu	xmm2, xmmword ptr [rdx + rax + 16]	# rdx operand, bytes rax+16..rax+31
  6205  	movdqu	xmm3, xmmword ptr [rcx + rax]	# rcx operand, bytes rax..rax+15
  6206  	movdqu	xmm0, xmmword ptr [rcx + rax + 16]	# rcx operand, bytes rax+16..rax+31
  6207  	pmovzxbw	xmm4, xmm1                      # xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
  6208  	punpckhbw	xmm1, xmm1              # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6209  	pmovzxbw	xmm5, xmm3                      # xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  6210  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6211  	pmullw	xmm3, xmm1	# word products of bytes 8..15; the duplicated high bytes do not affect each product's low byte
  6212  	movdqa	xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,255,255,255,255,255,255,255]
  6213  	pand	xmm3, xmm1	# keep only the low byte of each product
  6214  	pmullw	xmm5, xmm4	# word products of bytes 0..7
  6215  	pand	xmm5, xmm1	# keep only the low byte of each product
  6216  	packuswb	xmm5, xmm3	# repack 16 words (all <= 255 after the mask) into 16 result bytes
  6217  	pmovzxbw	xmm3, xmm2                      # xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
  6218  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6219  	pmovzxbw	xmm4, xmm0                      # xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
  6220  	punpckhbw	xmm0, xmm0              # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6221  	pmullw	xmm0, xmm2	# second 16 bytes: products of bytes 8..15
  6222  	pand	xmm0, xmm1
  6223  	pmullw	xmm4, xmm3	# second 16 bytes: products of bytes 0..7
  6224  	pand	xmm4, xmm1
  6225  	packuswb	xmm4, xmm0	# repack into 16 result bytes
  6226  	movdqu	xmmword ptr [r8 + rax], xmm5	# results go to the r8 buffer
  6227  	movdqu	xmmword ptr [r8 + rax + 16], xmm4
  6228  .LBB0_755:
  6229  	cmp	rdi, r10	# presumably elements done vs. total count; both set outside this view
  6230  	jne	.LBB0_756	# differ: continue at .LBB0_756 (presumably leftover elements; confirm)
  6231  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6232  .LBB0_900:	# entry that resets the byte offset, then falls into .LBB0_903
  6233  	xor	eax, eax	# rax = 0
  6234  .LBB0_903:	# optional single vector step: 8-bit multiply of 32 elements, done in 16-bit lanes
  6235  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6236  	je	.LBB0_905	# bit 0 clear: skip the step
  6237  # %bb.904:
  6238  	movdqu	xmm1, xmmword ptr [rdx + rax]	# rdx operand, bytes rax..rax+15
  6239  	movdqu	xmm2, xmmword ptr [rdx + rax + 16]	# rdx operand, bytes rax+16..rax+31
  6240  	movdqu	xmm3, xmmword ptr [rcx + rax]	# rcx operand, bytes rax..rax+15
  6241  	movdqu	xmm0, xmmword ptr [rcx + rax + 16]	# rcx operand, bytes rax+16..rax+31
  6242  	pmovzxbw	xmm4, xmm1                      # xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
  6243  	punpckhbw	xmm1, xmm1              # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6244  	pmovzxbw	xmm5, xmm3                      # xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  6245  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6246  	pmullw	xmm3, xmm1	# word products of bytes 8..15; the duplicated high bytes do not affect each product's low byte
  6247  	movdqa	xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,255,255,255,255,255,255,255]
  6248  	pand	xmm3, xmm1	# keep only the low byte of each product
  6249  	pmullw	xmm5, xmm4	# word products of bytes 0..7
  6250  	pand	xmm5, xmm1	# keep only the low byte of each product
  6251  	packuswb	xmm5, xmm3	# repack 16 words (all <= 255 after the mask) into 16 result bytes
  6252  	pmovzxbw	xmm3, xmm2                      # xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
  6253  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6254  	pmovzxbw	xmm4, xmm0                      # xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
  6255  	punpckhbw	xmm0, xmm0              # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6256  	pmullw	xmm0, xmm2	# second 16 bytes: products of bytes 8..15
  6257  	pand	xmm0, xmm1
  6258  	pmullw	xmm4, xmm3	# second 16 bytes: products of bytes 0..7
  6259  	pand	xmm4, xmm1
  6260  	packuswb	xmm4, xmm0	# repack into 16 result bytes
  6261  	movdqu	xmmword ptr [r8 + rax], xmm5	# results go to the r8 buffer
  6262  	movdqu	xmmword ptr [r8 + rax + 16], xmm4
  6263  .LBB0_905:
  6264  	cmp	rdi, r10	# presumably elements done vs. total count; both set outside this view
  6265  	jne	.LBB0_906	# differ: continue at .LBB0_906 (presumably leftover elements; confirm)
  6266  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6267  .LBB0_65:	# entry that resets the element index, then falls into .LBB0_68
  6268  	xor	edi, edi	# rdi = 0
  6269  .LBB0_68:	# optional single vector step: 8-bit integer add, 32 elements
  6270  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6271  	je	.LBB0_70	# bit 0 clear: skip the step
  6272  # %bb.69:
  6273  	movdqu	xmm0, xmmword ptr [rdx + rdi]	# rdx operand at element rdi
  6274  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  6275  	movdqu	xmm2, xmmword ptr [rcx + rdi]	# rcx operand, same elements
  6276  	paddb	xmm2, xmm0	# xmm2 = rcx + rdx per byte lane (wrapping)
  6277  	movdqu	xmm0, xmmword ptr [rcx + rdi + 16]
  6278  	paddb	xmm0, xmm1	# same for the next 16 bytes
  6279  	movdqu	xmmword ptr [r8 + rdi], xmm2	# results go to the r8 buffer
  6280  	movdqu	xmmword ptr [r8 + rdi + 16], xmm0
  6281  .LBB0_70:
  6282  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6283  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6284  	jmp	.LBB0_71	# differ: continue at .LBB0_71 (presumably leftover elements; confirm)
  6285  .LBB0_411:	# entry that resets the element index, then falls into .LBB0_414
  6286  	xor	edi, edi	# rdi = 0
  6287  .LBB0_414:	# optional single vector step: 8-bit integer subtract, 32 elements
  6288  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6289  	je	.LBB0_416	# bit 0 clear: skip the step
  6290  # %bb.415:
  6291  	movdqu	xmm0, xmmword ptr [rdx + rdi]	# rdx operand at element rdi
  6292  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  6293  	movdqu	xmm2, xmmword ptr [rcx + rdi]	# rcx operand, same elements
  6294  	psubb	xmm0, xmm2	# xmm0 = rdx - rcx per byte lane (wrapping)
  6295  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
  6296  	psubb	xmm1, xmm2	# same for the next 16 bytes
  6297  	movdqu	xmmword ptr [r8 + rdi], xmm0	# results go to the r8 buffer
  6298  	movdqu	xmmword ptr [r8 + rdi + 16], xmm1
  6299  .LBB0_416:
  6300  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6301  	jne	.LBB0_417	# differ: continue at .LBB0_417 (presumably leftover elements; confirm)
  6302  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6303  .LBB0_238:	# entry that resets the element index, then falls into .LBB0_241
  6304  	xor	edi, edi	# rdi = 0
  6305  .LBB0_241:	# optional single vector step: 8-bit integer add, 32 elements
  6306  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6307  	je	.LBB0_243	# bit 0 clear: skip the step
  6308  # %bb.242:
  6309  	movdqu	xmm0, xmmword ptr [rdx + rdi]	# rdx operand at element rdi
  6310  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  6311  	movdqu	xmm2, xmmword ptr [rcx + rdi]	# rcx operand, same elements
  6312  	paddb	xmm2, xmm0	# xmm2 = rcx + rdx per byte lane (wrapping)
  6313  	movdqu	xmm0, xmmword ptr [rcx + rdi + 16]
  6314  	paddb	xmm0, xmm1	# same for the next 16 bytes
  6315  	movdqu	xmmword ptr [r8 + rdi], xmm2	# results go to the r8 buffer
  6316  	movdqu	xmmword ptr [r8 + rdi + 16], xmm0
  6317  .LBB0_243:
  6318  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6319  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6320  	jmp	.LBB0_244	# differ: continue at .LBB0_244 (presumably leftover elements; confirm)
  6321  .LBB0_584:	# entry that resets the element index, then falls into .LBB0_587
  6322  	xor	edi, edi	# rdi = 0
  6323  .LBB0_587:	# optional single vector step: 8-bit integer subtract, 32 elements
  6324  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6325  	je	.LBB0_589	# bit 0 clear: skip the step
  6326  # %bb.588:
  6327  	movdqu	xmm0, xmmword ptr [rdx + rdi]	# rdx operand at element rdi
  6328  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  6329  	movdqu	xmm2, xmmword ptr [rcx + rdi]	# rcx operand, same elements
  6330  	psubb	xmm0, xmm2	# xmm0 = rdx - rcx per byte lane (wrapping)
  6331  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
  6332  	psubb	xmm1, xmm2	# same for the next 16 bytes
  6333  	movdqu	xmmword ptr [r8 + rdi], xmm0	# results go to the r8 buffer
  6334  	movdqu	xmmword ptr [r8 + rdi + 16], xmm1
  6335  .LBB0_589:
  6336  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6337  	jne	.LBB0_590	# differ: continue at .LBB0_590 (presumably leftover elements; confirm)
  6338  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6339  .LBB0_139:	# entry that resets the element index, then falls into .LBB0_142
  6340  	xor	edi, edi	# rdi = 0
  6341  .LBB0_142:	# optional single vector step: 64-bit integer add, 4 elements
  6342  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6343  	je	.LBB0_144	# bit 0 clear: skip the step
  6344  # %bb.143:
  6345  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6346  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6347  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6348  	paddq	xmm2, xmm0	# xmm2 = rcx + rdx per 64-bit lane
  6349  	movdqu	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  6350  	paddq	xmm0, xmm1	# same for the next 16 bytes
  6351  	movdqu	xmmword ptr [r8 + 8*rdi], xmm2	# results go to the r8 buffer
  6352  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm0
  6353  .LBB0_144:
  6354  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6355  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6356  	jmp	.LBB0_145	# differ: continue at .LBB0_145 (presumably leftover elements; confirm)
  6357  .LBB0_485:	# entry that resets the element index, then falls into .LBB0_488
  6358  	xor	edi, edi	# rdi = 0
  6359  .LBB0_488:	# optional single vector step: 64-bit integer subtract, 4 elements
  6360  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6361  	je	.LBB0_490	# bit 0 clear: skip the step
  6362  # %bb.489:
  6363  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6364  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6365  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6366  	psubq	xmm0, xmm2	# xmm0 = rdx - rcx per 64-bit lane
  6367  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
  6368  	psubq	xmm1, xmm2	# same for the next 16 bytes
  6369  	movdqu	xmmword ptr [r8 + 8*rdi], xmm0	# results go to the r8 buffer
  6370  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm1
  6371  .LBB0_490:
  6372  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6373  	jne	.LBB0_491	# differ: continue at .LBB0_491 (presumably leftover elements; confirm)
  6374  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6375  .LBB0_312:	# entry that resets the element index, then falls into .LBB0_315
  6376  	xor	edi, edi	# rdi = 0
  6377  .LBB0_315:	# optional single vector step: 64-bit integer add, 4 elements
  6378  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6379  	je	.LBB0_317	# bit 0 clear: skip the step
  6380  # %bb.316:
  6381  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6382  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6383  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6384  	paddq	xmm2, xmm0	# xmm2 = rcx + rdx per 64-bit lane
  6385  	movdqu	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  6386  	paddq	xmm0, xmm1	# same for the next 16 bytes
  6387  	movdqu	xmmword ptr [r8 + 8*rdi], xmm2	# results go to the r8 buffer
  6388  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm0
  6389  .LBB0_317:
  6390  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6391  	jne	.LBB0_318	# differ: continue at .LBB0_318 (presumably leftover elements; confirm)
  6392  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6393  .LBB0_658:	# entry that resets the element index, then falls into .LBB0_661
  6394  	xor	edi, edi	# rdi = 0
  6395  .LBB0_661:	# optional single vector step: 64-bit integer subtract, 4 elements
  6396  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6397  	je	.LBB0_663	# bit 0 clear: skip the step
  6398  # %bb.662:
  6399  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6400  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6401  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6402  	psubq	xmm0, xmm2	# xmm0 = rdx - rcx per 64-bit lane
  6403  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
  6404  	psubq	xmm1, xmm2	# same for the next 16 bytes
  6405  	movdqu	xmmword ptr [r8 + 8*rdi], xmm0	# results go to the r8 buffer
  6406  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm1
  6407  .LBB0_663:
  6408  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6409  	jne	.LBB0_664	# differ: continue at .LBB0_664 (presumably leftover elements; confirm)
  6410  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6411  .LBB0_766:	# entry that resets the element index, then falls into .LBB0_769
  6412  	xor	edi, edi	# rdi = 0
  6413  .LBB0_769:	# optional single vector step: 16-bit integer multiply, 16 elements
  6414  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6415  	je	.LBB0_771	# bit 0 clear: skip the step
  6416  # %bb.770:
  6417  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]	# rdx operand at element rdi
  6418  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  6419  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]	# rcx operand, same elements
  6420  	pmullw	xmm2, xmm0	# xmm2 = rcx * rdx per 16-bit lane (low 16 bits kept)
  6421  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  6422  	pmullw	xmm0, xmm1	# same for the next 16 bytes
  6423  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2	# results go to the r8 buffer
  6424  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  6425  .LBB0_771:
  6426  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6427  	jne	.LBB0_772	# differ: continue at .LBB0_772 (presumably leftover elements; confirm)
  6428  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6429  .LBB0_782:	# entry that resets the element index, then falls into .LBB0_785
  6430  	xor	edi, edi	# rdi = 0
  6431  .LBB0_785:	# optional single vector step: 16-bit integer multiply, 16 elements
  6432  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6433  	je	.LBB0_787	# bit 0 clear: skip the step
  6434  # %bb.786:
  6435  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]	# rdx operand at element rdi
  6436  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  6437  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]	# rcx operand, same elements
  6438  	pmullw	xmm2, xmm0	# xmm2 = rcx * rdx per 16-bit lane (low 16 bits kept)
  6439  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  6440  	pmullw	xmm0, xmm1	# same for the next 16 bytes
  6441  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2	# results go to the r8 buffer
  6442  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  6443  .LBB0_787:
  6444  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6445  	jne	.LBB0_788	# differ: continue at .LBB0_788 (presumably leftover elements; confirm)
  6446  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6447  .LBB0_916:	# entry that resets the element index, then falls into .LBB0_919
  6448  	xor	edi, edi	# rdi = 0
  6449  .LBB0_919:	# optional single vector step: 16-bit integer multiply, 16 elements
  6450  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6451  	je	.LBB0_921	# bit 0 clear: skip the step
  6452  # %bb.920:
  6453  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]	# rdx operand at element rdi
  6454  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  6455  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]	# rcx operand, same elements
  6456  	pmullw	xmm2, xmm0	# xmm2 = rcx * rdx per 16-bit lane (low 16 bits kept)
  6457  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  6458  	pmullw	xmm0, xmm1	# same for the next 16 bytes
  6459  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2	# results go to the r8 buffer
  6460  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  6461  .LBB0_921:
  6462  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6463  	jne	.LBB0_922	# differ: continue at .LBB0_922 (presumably leftover elements; confirm)
  6464  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6465  .LBB0_932:	# entry that resets the element index, then falls into .LBB0_935
  6466  	xor	edi, edi	# rdi = 0
  6467  .LBB0_935:	# optional single vector step: 16-bit integer multiply, 16 elements
  6468  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6469  	je	.LBB0_937	# bit 0 clear: skip the step
  6470  # %bb.936:
  6471  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]	# rdx operand at element rdi
  6472  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  6473  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]	# rcx operand, same elements
  6474  	pmullw	xmm2, xmm0	# xmm2 = rcx * rdx per 16-bit lane (low 16 bits kept)
  6475  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  6476  	pmullw	xmm0, xmm1	# same for the next 16 bytes
  6477  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2	# results go to the r8 buffer
  6478  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  6479  .LBB0_937:
  6480  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6481  	jne	.LBB0_938	# differ: continue at .LBB0_938 (presumably leftover elements; confirm)
  6482  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6483  .LBB0_81:	# entry that resets the element index, then falls into .LBB0_84
  6484  	xor	edi, edi	# rdi = 0
  6485  .LBB0_84:	# optional single vector step: 16-bit integer add, 16 elements
  6486  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6487  	je	.LBB0_86	# bit 0 clear: skip the step
  6488  # %bb.85:
  6489  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]	# rdx operand at element rdi
  6490  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  6491  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]	# rcx operand, same elements
  6492  	paddw	xmm2, xmm0	# xmm2 = rcx + rdx per 16-bit lane
  6493  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  6494  	paddw	xmm0, xmm1	# same for the next 16 bytes
  6495  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2	# results go to the r8 buffer
  6496  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  6497  .LBB0_86:
  6498  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6499  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6500  	jmp	.LBB0_87	# differ: continue at .LBB0_87 (presumably leftover elements; confirm)
  6501  .LBB0_97:	# entry that resets the element index, then falls into .LBB0_100
  6502  	xor	edi, edi	# rdi = 0
  6503  .LBB0_100:	# optional single vector step: 16-bit integer add, 16 elements
  6504  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6505  	je	.LBB0_102	# bit 0 clear: skip the step
  6506  # %bb.101:
  6507  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]	# rdx operand at element rdi
  6508  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  6509  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]	# rcx operand, same elements
  6510  	paddw	xmm2, xmm0	# xmm2 = rcx + rdx per 16-bit lane
  6511  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  6512  	paddw	xmm0, xmm1	# same for the next 16 bytes
  6513  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2	# results go to the r8 buffer
  6514  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  6515  .LBB0_102:
  6516  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6517  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6518  	jmp	.LBB0_103	# differ: continue at .LBB0_103 (presumably leftover elements; confirm)
  6519  .LBB0_427:	# entry that resets the element index, then falls into .LBB0_430
  6520  	xor	edi, edi	# rdi = 0
  6521  .LBB0_430:	# optional single vector step: 16-bit integer subtract, 16 elements
  6522  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6523  	je	.LBB0_432	# bit 0 clear: skip the step
  6524  # %bb.431:
  6525  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]	# rdx operand at element rdi
  6526  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  6527  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]	# rcx operand, same elements
  6528  	psubw	xmm0, xmm2	# xmm0 = rdx - rcx per 16-bit lane
  6529  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
  6530  	psubw	xmm1, xmm2	# same for the next 16 bytes
  6531  	movdqu	xmmword ptr [r8 + 2*rdi], xmm0	# results go to the r8 buffer
  6532  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm1
  6533  .LBB0_432:
  6534  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6535  	jne	.LBB0_433	# differ: continue at .LBB0_433 (presumably leftover elements; confirm)
  6536  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6537  .LBB0_443:	# entry that resets the element index, then falls into .LBB0_446
  6538  	xor	edi, edi	# rdi = 0
  6539  .LBB0_446:	# optional single vector step: 16-bit integer subtract, 16 elements
  6540  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6541  	je	.LBB0_448	# bit 0 clear: skip the step
  6542  # %bb.447:
  6543  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]	# rdx operand at element rdi
  6544  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  6545  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]	# rcx operand, same elements
  6546  	psubw	xmm0, xmm2	# xmm0 = rdx - rcx per 16-bit lane
  6547  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
  6548  	psubw	xmm1, xmm2	# same for the next 16 bytes
  6549  	movdqu	xmmword ptr [r8 + 2*rdi], xmm0	# results go to the r8 buffer
  6550  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm1
  6551  .LBB0_448:
  6552  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6553  	jne	.LBB0_449	# differ: continue at .LBB0_449 (presumably leftover elements; confirm)
  6554  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6555  .LBB0_254:	# entry that resets the element index, then falls into .LBB0_257
  6556  	xor	edi, edi	# rdi = 0
  6557  .LBB0_257:	# optional single vector step: 16-bit integer add, 16 elements
  6558  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6559  	je	.LBB0_259	# bit 0 clear: skip the step
  6560  # %bb.258:
  6561  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]	# rdx operand at element rdi
  6562  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  6563  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]	# rcx operand, same elements
  6564  	paddw	xmm2, xmm0	# xmm2 = rcx + rdx per 16-bit lane
  6565  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  6566  	paddw	xmm0, xmm1	# same for the next 16 bytes
  6567  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2	# results go to the r8 buffer
  6568  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  6569  .LBB0_259:
  6570  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6571  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6572  	jmp	.LBB0_260	# differ: continue at .LBB0_260 (presumably leftover elements; confirm)
  6573  .LBB0_270:	# entry that resets the element index, then falls into .LBB0_273
  6574  	xor	edi, edi	# rdi = 0
  6575  .LBB0_273:	# optional single vector step: 16-bit integer add, 16 elements
  6576  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6577  	je	.LBB0_275	# bit 0 clear: skip the step
  6578  # %bb.274:
  6579  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]	# rdx operand at element rdi
  6580  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  6581  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]	# rcx operand, same elements
  6582  	paddw	xmm2, xmm0	# xmm2 = rcx + rdx per 16-bit lane
  6583  	movdqu	xmm0, xmmword ptr [rcx + 2*rdi + 16]
  6584  	paddw	xmm0, xmm1	# same for the next 16 bytes
  6585  	movdqu	xmmword ptr [r8 + 2*rdi], xmm2	# results go to the r8 buffer
  6586  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
  6587  .LBB0_275:
  6588  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6589  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6590  	jmp	.LBB0_276	# differ: continue at .LBB0_276 (presumably leftover elements; confirm)
  6591  .LBB0_600:	# entry that resets the element index, then falls into .LBB0_603
  6592  	xor	edi, edi	# rdi = 0
  6593  .LBB0_603:	# optional single vector step: 16-bit integer subtract, 16 elements
  6594  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6595  	je	.LBB0_605	# bit 0 clear: skip the step
  6596  # %bb.604:
  6597  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]	# rdx operand at element rdi
  6598  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  6599  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]	# rcx operand, same elements
  6600  	psubw	xmm0, xmm2	# xmm0 = rdx - rcx per 16-bit lane
  6601  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
  6602  	psubw	xmm1, xmm2	# same for the next 16 bytes
  6603  	movdqu	xmmword ptr [r8 + 2*rdi], xmm0	# results go to the r8 buffer
  6604  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm1
  6605  .LBB0_605:
  6606  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6607  	jne	.LBB0_606	# differ: continue at .LBB0_606 (presumably leftover elements; confirm)
  6608  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6609  .LBB0_616:	# entry that resets the element index, then falls into .LBB0_619
  6610  	xor	edi, edi	# rdi = 0
  6611  .LBB0_619:	# optional single vector step: 16-bit integer subtract, 16 elements
  6612  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6613  	je	.LBB0_621	# bit 0 clear: skip the step
  6614  # %bb.620:
  6615  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]	# rdx operand at element rdi
  6616  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
  6617  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi]	# rcx operand, same elements
  6618  	psubw	xmm0, xmm2	# xmm0 = rdx - rcx per 16-bit lane
  6619  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
  6620  	psubw	xmm1, xmm2	# same for the next 16 bytes
  6621  	movdqu	xmmword ptr [r8 + 2*rdi], xmm0	# results go to the r8 buffer
  6622  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm1
  6623  .LBB0_621:
  6624  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6625  	jne	.LBB0_622	# differ: continue at .LBB0_622 (presumably leftover elements; confirm)
  6626  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6627  .LBB0_840:	# entry that resets the element index, then falls into .LBB0_843
  6628  	xor	edi, edi	# rdi = 0
  6629  .LBB0_843:	# optional single vector step: single-precision multiply, 8 elements
  6630  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6631  	je	.LBB0_845	# bit 0 clear: skip the step
  6632  # %bb.844:
  6633  	movups	xmm0, xmmword ptr [rdx + 4*rdi]	# rdx operand at element rdi
  6634  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6635  	movups	xmm2, xmmword ptr [rcx + 4*rdi]	# rcx operand, same elements
  6636  	mulps	xmm2, xmm0	# xmm2 = rcx * rdx per float lane
  6637  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  6638  	mulps	xmm0, xmm1	# same for the next 16 bytes
  6639  	movups	xmmword ptr [r8 + 4*rdi], xmm2	# results go to the r8 buffer
  6640  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm0
  6641  .LBB0_845:
  6642  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6643  	jne	.LBB0_846	# differ: continue at .LBB0_846 (presumably leftover elements; confirm)
  6644  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6645  .LBB0_990:	# entry that resets the element index, then falls into .LBB0_993
  6646  	xor	edi, edi	# rdi = 0
  6647  .LBB0_993:	# optional single vector step: single-precision multiply, 8 elements
  6648  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6649  	je	.LBB0_995	# bit 0 clear: skip the step
  6650  # %bb.994:
  6651  	movups	xmm0, xmmword ptr [rdx + 4*rdi]	# rdx operand at element rdi
  6652  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6653  	movups	xmm2, xmmword ptr [rcx + 4*rdi]	# rcx operand, same elements
  6654  	mulps	xmm2, xmm0	# xmm2 = rcx * rdx per float lane
  6655  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  6656  	mulps	xmm0, xmm1	# same for the next 16 bytes
  6657  	movups	xmmword ptr [r8 + 4*rdi], xmm2	# results go to the r8 buffer
  6658  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm0
  6659  .LBB0_995:
  6660  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6661  	jne	.LBB0_996	# differ: continue at .LBB0_996 (presumably leftover elements; confirm)
  6662  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6663  .LBB0_155:	# entry that resets the element index, then falls into .LBB0_158
  6664  	xor	edi, edi	# rdi = 0
  6665  .LBB0_158:	# optional single vector step: 64-bit integer add, 4 elements
  6666  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6667  	je	.LBB0_160	# bit 0 clear: skip the step
  6668  # %bb.159:
  6669  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6670  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6671  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6672  	paddq	xmm2, xmm0	# xmm2 = rcx + rdx per 64-bit lane
  6673  	movdqu	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  6674  	paddq	xmm0, xmm1	# same for the next 16 bytes
  6675  	movdqu	xmmword ptr [r8 + 8*rdi], xmm2	# results go to the r8 buffer
  6676  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm0
  6677  .LBB0_160:
  6678  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6679  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6680  	jmp	.LBB0_161	# differ: continue at .LBB0_161 (presumably leftover elements; confirm)
  6681  .LBB0_171:	# entry that resets the element index, then falls into .LBB0_174
  6682  	xor	edi, edi	# rdi = 0
  6683  .LBB0_174:	# optional single vector step: single-precision add, 8 elements
  6684  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6685  	je	.LBB0_176	# bit 0 clear: skip the step
  6686  # %bb.175:
  6687  	movups	xmm0, xmmword ptr [rdx + 4*rdi]	# rdx operand at element rdi
  6688  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6689  	movups	xmm2, xmmword ptr [rcx + 4*rdi]	# rcx operand, same elements
  6690  	addps	xmm2, xmm0	# xmm2 = rcx + rdx per float lane
  6691  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  6692  	addps	xmm0, xmm1	# same for the next 16 bytes
  6693  	movups	xmmword ptr [r8 + 4*rdi], xmm2	# results go to the r8 buffer
  6694  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm0
  6695  .LBB0_176:
  6696  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6697  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6698  	jmp	.LBB0_177	# differ: continue at .LBB0_177 (presumably leftover elements; confirm)
  6699  .LBB0_501:	# entry that resets the element index, then falls into .LBB0_504
  6700  	xor	edi, edi	# rdi = 0
  6701  .LBB0_504:	# optional single vector step: 64-bit integer subtract, 4 elements
  6702  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6703  	je	.LBB0_506	# bit 0 clear: skip the step
  6704  # %bb.505:
  6705  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6706  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6707  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6708  	psubq	xmm0, xmm2	# xmm0 = rdx - rcx per 64-bit lane
  6709  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
  6710  	psubq	xmm1, xmm2	# same for the next 16 bytes
  6711  	movdqu	xmmword ptr [r8 + 8*rdi], xmm0	# results go to the r8 buffer
  6712  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm1
  6713  .LBB0_506:
  6714  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6715  	jne	.LBB0_507	# differ: continue at .LBB0_507 (presumably leftover elements; confirm)
  6716  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6717  .LBB0_517:	# entry that resets the element index, then falls into .LBB0_520
  6718  	xor	edi, edi	# rdi = 0
  6719  .LBB0_520:	# optional single vector step: single-precision subtract, 8 elements
  6720  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6721  	je	.LBB0_522	# bit 0 clear: skip the step
  6722  # %bb.521:
  6723  	movups	xmm0, xmmword ptr [rdx + 4*rdi]	# rdx operand at element rdi
  6724  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6725  	movups	xmm2, xmmword ptr [rcx + 4*rdi]	# rcx operand, same elements
  6726  	subps	xmm0, xmm2	# xmm0 = rdx - rcx per float lane
  6727  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 16]
  6728  	subps	xmm1, xmm2	# same for the next 16 bytes
  6729  	movups	xmmword ptr [r8 + 4*rdi], xmm0	# results go to the r8 buffer
  6730  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm1
  6731  .LBB0_522:
  6732  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6733  	jne	.LBB0_523	# differ: continue at .LBB0_523 (presumably leftover elements; confirm)
  6734  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6735  .LBB0_328:	# entry that resets the element index, then falls into .LBB0_331
  6736  	xor	edi, edi	# rdi = 0
  6737  .LBB0_331:	# optional single vector step: 64-bit integer add, 4 elements
  6738  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6739  	je	.LBB0_333	# bit 0 clear: skip the step
  6740  # %bb.332:
  6741  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6742  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6743  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6744  	paddq	xmm2, xmm0	# xmm2 = rcx + rdx per 64-bit lane
  6745  	movdqu	xmm0, xmmword ptr [rcx + 8*rdi + 16]
  6746  	paddq	xmm0, xmm1	# same for the next 16 bytes
  6747  	movdqu	xmmword ptr [r8 + 8*rdi], xmm2	# results go to the r8 buffer
  6748  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm0
  6749  .LBB0_333:
  6750  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6751  	jne	.LBB0_334	# differ: continue at .LBB0_334 (presumably leftover elements; confirm)
  6752  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6753  .LBB0_344:	# entry that resets the element index, then falls into .LBB0_347
  6754  	xor	edi, edi	# rdi = 0
  6755  .LBB0_347:	# optional single vector step: single-precision add, 8 elements
  6756  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6757  	je	.LBB0_349	# bit 0 clear: skip the step
  6758  # %bb.348:
  6759  	movups	xmm0, xmmword ptr [rdx + 4*rdi]	# rdx operand at element rdi
  6760  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6761  	movups	xmm2, xmmword ptr [rcx + 4*rdi]	# rcx operand, same elements
  6762  	addps	xmm2, xmm0	# xmm2 = rcx + rdx per float lane
  6763  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  6764  	addps	xmm0, xmm1	# same for the next 16 bytes
  6765  	movups	xmmword ptr [r8 + 4*rdi], xmm2	# results go to the r8 buffer
  6766  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm0
  6767  .LBB0_349:
  6768  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6769  	jne	.LBB0_350	# differ: continue at .LBB0_350 (presumably leftover elements; confirm)
  6770  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6771  .LBB0_674:	# entry that resets the element index, then falls into .LBB0_677
  6772  	xor	edi, edi	# rdi = 0
  6773  .LBB0_677:	# optional single vector step: 64-bit integer subtract, 4 elements
  6774  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6775  	je	.LBB0_679	# bit 0 clear: skip the step
  6776  # %bb.678:
  6777  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]	# rdx operand at element rdi
  6778  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
  6779  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi]	# rcx operand, same elements
  6780  	psubq	xmm0, xmm2	# xmm0 = rdx - rcx per 64-bit lane
  6781  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
  6782  	psubq	xmm1, xmm2	# same for the next 16 bytes
  6783  	movdqu	xmmword ptr [r8 + 8*rdi], xmm0	# results go to the r8 buffer
  6784  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm1
  6785  .LBB0_679:
  6786  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6787  	jne	.LBB0_680	# differ: continue at .LBB0_680 (presumably leftover elements; confirm)
  6788  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6789  .LBB0_690:	# entry that resets the element index, then falls into .LBB0_693
  6790  	xor	edi, edi	# rdi = 0
  6791  .LBB0_693:	# optional single vector step: single-precision subtract, 8 elements
  6792  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6793  	je	.LBB0_695	# bit 0 clear: skip the step
  6794  # %bb.694:
  6795  	movups	xmm0, xmmword ptr [rdx + 4*rdi]	# rdx operand at element rdi
  6796  	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6797  	movups	xmm2, xmmword ptr [rcx + 4*rdi]	# rcx operand, same elements
  6798  	subps	xmm0, xmm2	# xmm0 = rdx - rcx per float lane
  6799  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 16]
  6800  	subps	xmm1, xmm2	# same for the next 16 bytes
  6801  	movups	xmmword ptr [r8 + 4*rdi], xmm0	# results go to the r8 buffer
  6802  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm1
  6803  .LBB0_695:
  6804  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6805  	jne	.LBB0_696	# differ: continue at .LBB0_696 (presumably leftover elements; confirm)
  6806  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6807  .LBB0_737:	# entry that resets the byte offset, then falls into .LBB0_740
  6808  	xor	eax, eax	# rax = 0
  6809  .LBB0_740:	# optional single vector step: 8-bit multiply of 32 elements, done in 16-bit lanes
  6810  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6811  	je	.LBB0_742	# bit 0 clear: skip the step
  6812  # %bb.741:
  6813  	movdqu	xmm1, xmmword ptr [rdx + rax]	# rdx operand, bytes rax..rax+15
  6814  	movdqu	xmm2, xmmword ptr [rdx + rax + 16]	# rdx operand, bytes rax+16..rax+31
  6815  	movdqu	xmm3, xmmword ptr [rcx + rax]	# rcx operand, bytes rax..rax+15
  6816  	movdqu	xmm0, xmmword ptr [rcx + rax + 16]	# rcx operand, bytes rax+16..rax+31
  6817  	pmovzxbw	xmm4, xmm1                      # xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
  6818  	punpckhbw	xmm1, xmm1              # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6819  	pmovzxbw	xmm5, xmm3                      # xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  6820  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6821  	pmullw	xmm3, xmm1	# word products of bytes 8..15; the duplicated high bytes do not affect each product's low byte
  6822  	movdqa	xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,255,255,255,255,255,255,255]
  6823  	pand	xmm3, xmm1	# keep only the low byte of each product
  6824  	pmullw	xmm5, xmm4	# word products of bytes 0..7
  6825  	pand	xmm5, xmm1	# keep only the low byte of each product
  6826  	packuswb	xmm5, xmm3	# repack 16 words (all <= 255 after the mask) into 16 result bytes
  6827  	pmovzxbw	xmm3, xmm2                      # xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
  6828  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6829  	pmovzxbw	xmm4, xmm0                      # xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
  6830  	punpckhbw	xmm0, xmm0              # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6831  	pmullw	xmm0, xmm2	# second 16 bytes: products of bytes 8..15
  6832  	pand	xmm0, xmm1
  6833  	pmullw	xmm4, xmm3	# second 16 bytes: products of bytes 0..7
  6834  	pand	xmm4, xmm1
  6835  	packuswb	xmm4, xmm0	# repack into 16 result bytes
  6836  	movdqu	xmmword ptr [r8 + rax], xmm5	# results go to the r8 buffer
  6837  	movdqu	xmmword ptr [r8 + rax + 16], xmm4
  6838  .LBB0_742:
  6839  	cmp	rdi, r10	# presumably elements done vs. total count; both set outside this view
  6840  	jne	.LBB0_743	# differ: continue at .LBB0_743 (presumably leftover elements; confirm)
  6841  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6842  .LBB0_887:	# entry that resets the byte offset, then falls into .LBB0_890
  6843  	xor	eax, eax	# rax = 0
  6844  .LBB0_890:	# optional single vector step: 8-bit multiply of 32 elements, done in 16-bit lanes
  6845  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6846  	je	.LBB0_892	# bit 0 clear: skip the step
  6847  # %bb.891:
  6848  	movdqu	xmm1, xmmword ptr [rdx + rax]	# rdx operand, bytes rax..rax+15
  6849  	movdqu	xmm2, xmmword ptr [rdx + rax + 16]	# rdx operand, bytes rax+16..rax+31
  6850  	movdqu	xmm3, xmmword ptr [rcx + rax]	# rcx operand, bytes rax..rax+15
  6851  	movdqu	xmm0, xmmword ptr [rcx + rax + 16]	# rcx operand, bytes rax+16..rax+31
  6852  	pmovzxbw	xmm4, xmm1                      # xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
  6853  	punpckhbw	xmm1, xmm1              # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6854  	pmovzxbw	xmm5, xmm3                      # xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  6855  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6856  	pmullw	xmm3, xmm1	# word products of bytes 8..15; the duplicated high bytes do not affect each product's low byte
  6857  	movdqa	xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,255,255,255,255,255,255,255]
  6858  	pand	xmm3, xmm1	# keep only the low byte of each product
  6859  	pmullw	xmm5, xmm4	# word products of bytes 0..7
  6860  	pand	xmm5, xmm1	# keep only the low byte of each product
  6861  	packuswb	xmm5, xmm3	# repack 16 words (all <= 255 after the mask) into 16 result bytes
  6862  	pmovzxbw	xmm3, xmm2                      # xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
  6863  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6864  	pmovzxbw	xmm4, xmm0                      # xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
  6865  	punpckhbw	xmm0, xmm0              # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
  6866  	pmullw	xmm0, xmm2	# second 16 bytes: products of bytes 8..15
  6867  	pand	xmm0, xmm1
  6868  	pmullw	xmm4, xmm3	# second 16 bytes: products of bytes 0..7
  6869  	pand	xmm4, xmm1
  6870  	packuswb	xmm4, xmm0	# repack into 16 result bytes
  6871  	movdqu	xmmword ptr [r8 + rax], xmm5	# results go to the r8 buffer
  6872  	movdqu	xmmword ptr [r8 + rax + 16], xmm4
  6873  .LBB0_892:
  6874  	cmp	rdi, r10	# presumably elements done vs. total count; both set outside this view
  6875  	jne	.LBB0_893	# differ: continue at .LBB0_893 (presumably leftover elements; confirm)
  6876  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6877  .LBB0_52:	# entry that resets the element index, then falls into .LBB0_55
  6878  	xor	edi, edi	# rdi = 0
  6879  .LBB0_55:	# optional single vector step: 8-bit integer add, 32 elements
  6880  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6881  	je	.LBB0_57	# bit 0 clear: skip the step
  6882  # %bb.56:
  6883  	movdqu	xmm0, xmmword ptr [rdx + rdi]	# rdx operand at element rdi
  6884  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  6885  	movdqu	xmm2, xmmword ptr [rcx + rdi]	# rcx operand, same elements
  6886  	paddb	xmm2, xmm0	# xmm2 = rcx + rdx per byte lane (wrapping)
  6887  	movdqu	xmm0, xmmword ptr [rcx + rdi + 16]
  6888  	paddb	xmm0, xmm1	# same for the next 16 bytes
  6889  	movdqu	xmmword ptr [r8 + rdi], xmm2	# results go to the r8 buffer
  6890  	movdqu	xmmword ptr [r8 + rdi + 16], xmm0
  6891  .LBB0_57:
  6892  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6893  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6894  	jmp	.LBB0_58	# differ: continue at .LBB0_58 (presumably leftover elements; confirm)
  6895  .LBB0_398:	# entry that resets the element index, then falls into .LBB0_401
  6896  	xor	edi, edi	# rdi = 0
  6897  .LBB0_401:	# optional single vector step: 8-bit integer subtract, 32 elements
  6898  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6899  	je	.LBB0_403	# bit 0 clear: skip the step
  6900  # %bb.402:
  6901  	movdqu	xmm0, xmmword ptr [rdx + rdi]	# rdx operand at element rdi
  6902  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  6903  	movdqu	xmm2, xmmword ptr [rcx + rdi]	# rcx operand, same elements
  6904  	psubb	xmm0, xmm2	# xmm0 = rdx - rcx per byte lane (wrapping)
  6905  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
  6906  	psubb	xmm1, xmm2	# same for the next 16 bytes
  6907  	movdqu	xmmword ptr [r8 + rdi], xmm0	# results go to the r8 buffer
  6908  	movdqu	xmmword ptr [r8 + rdi + 16], xmm1
  6909  .LBB0_403:
  6910  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6911  	jne	.LBB0_404	# differ: continue at .LBB0_404 (presumably leftover elements; confirm)
  6912  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6913  .LBB0_225:	# entry that resets the element index, then falls into .LBB0_228
  6914  	xor	edi, edi	# rdi = 0
  6915  .LBB0_228:	# optional single vector step: 8-bit integer add, 32 elements
  6916  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6917  	je	.LBB0_230	# bit 0 clear: skip the step
  6918  # %bb.229:
  6919  	movdqu	xmm0, xmmword ptr [rdx + rdi]	# rdx operand at element rdi
  6920  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  6921  	movdqu	xmm2, xmmword ptr [rcx + rdi]	# rcx operand, same elements
  6922  	paddb	xmm2, xmm0	# xmm2 = rcx + rdx per byte lane (wrapping)
  6923  	movdqu	xmm0, xmmword ptr [rcx + rdi + 16]
  6924  	paddb	xmm0, xmm1	# same for the next 16 bytes
  6925  	movdqu	xmmword ptr [r8 + rdi], xmm2	# results go to the r8 buffer
  6926  	movdqu	xmmword ptr [r8 + rdi + 16], xmm0
  6927  .LBB0_230:
  6928  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6929  	je	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6930  	jmp	.LBB0_231	# differ: continue at .LBB0_231 (presumably leftover elements; confirm)
  6931  .LBB0_571:	# entry that resets the element index, then falls into .LBB0_574
  6932  	xor	edi, edi	# rdi = 0
  6933  .LBB0_574:	# optional single vector step: 8-bit integer subtract, 32 elements
  6934  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6935  	je	.LBB0_576	# bit 0 clear: skip the step
  6936  # %bb.575:
  6937  	movdqu	xmm0, xmmword ptr [rdx + rdi]	# rdx operand at element rdi
  6938  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
  6939  	movdqu	xmm2, xmmword ptr [rcx + rdi]	# rcx operand, same elements
  6940  	psubb	xmm0, xmm2	# xmm0 = rdx - rcx per byte lane (wrapping)
  6941  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
  6942  	psubb	xmm1, xmm2	# same for the next 16 bytes
  6943  	movdqu	xmmword ptr [r8 + rdi], xmm0	# results go to the r8 buffer
  6944  	movdqu	xmmword ptr [r8 + rdi + 16], xmm1
  6945  .LBB0_576:
  6946  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6947  	jne	.LBB0_577	# differ: continue at .LBB0_577 (presumably leftover elements; confirm)
  6948  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6949  .LBB0_811:	# entry that resets the element index, then falls into .LBB0_814
  6950  	xor	edi, edi	# rdi = 0
  6951  .LBB0_814:	# optional single vector step: 32-bit integer multiply, 8 elements
  6952  	test	r9b, 1	# r9 is set outside this view; presumably odd => one vector step left (confirm)
  6953  	je	.LBB0_816	# bit 0 clear: skip the step
  6954  # %bb.815:
  6955  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]	# rdx operand at element rdi
  6956  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6957  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]	# rcx operand, same elements
  6958  	pmulld	xmm2, xmm0	# xmm2 = rcx * rdx per 32-bit lane (low 32 bits kept)
  6959  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  6960  	pmulld	xmm0, xmm1	# same for the next 16 bytes
  6961  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2	# results go to the r8 buffer
  6962  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  6963  .LBB0_816:
  6964  	cmp	rsi, r10	# presumably elements done vs. total count; both set outside this view
  6965  	jne	.LBB0_817	# differ: continue at .LBB0_817 (presumably leftover elements; confirm)
  6966  	jmp	.LBB0_1013	# equal: .LBB0_1013 (presumably the shared exit; confirm)
  6967  .LBB0_961:
  6968  	xor	edi, edi
  6969  .LBB0_964:
  6970  	test	r9b, 1
  6971  	je	.LBB0_966
  6972  # %bb.965:
  6973  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  6974  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6975  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  6976  	pmulld	xmm2, xmm0
  6977  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  6978  	pmulld	xmm0, xmm1
  6979  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2
  6980  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  6981  .LBB0_966:
  6982  	cmp	rsi, r10
  6983  	jne	.LBB0_967
  6984  	jmp	.LBB0_1013
  6985  .LBB0_126:
  6986  	xor	edi, edi
  6987  .LBB0_129:
  6988  	test	r9b, 1
  6989  	je	.LBB0_131
  6990  # %bb.130:
  6991  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  6992  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  6993  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  6994  	paddd	xmm2, xmm0
  6995  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  6996  	paddd	xmm0, xmm1
  6997  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2
  6998  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  6999  .LBB0_131:
  7000  	cmp	rsi, r10
  7001  	je	.LBB0_1013
  7002  	jmp	.LBB0_132
  7003  .LBB0_472:
  7004  	xor	edi, edi
  7005  .LBB0_475:
  7006  	test	r9b, 1
  7007  	je	.LBB0_477
  7008  # %bb.476:
  7009  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  7010  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  7011  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  7012  	psubd	xmm0, xmm2
  7013  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
  7014  	psubd	xmm1, xmm2
  7015  	movdqu	xmmword ptr [r8 + 4*rdi], xmm0
  7016  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm1
  7017  .LBB0_477:
  7018  	cmp	rsi, r10
  7019  	jne	.LBB0_478
  7020  	jmp	.LBB0_1013
  7021  .LBB0_299:
  7022  	xor	edi, edi
  7023  .LBB0_302:
  7024  	test	r9b, 1
  7025  	je	.LBB0_304
  7026  # %bb.303:
  7027  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  7028  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  7029  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  7030  	paddd	xmm2, xmm0
  7031  	movdqu	xmm0, xmmword ptr [rcx + 4*rdi + 16]
  7032  	paddd	xmm0, xmm1
  7033  	movdqu	xmmword ptr [r8 + 4*rdi], xmm2
  7034  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
  7035  .LBB0_304:
  7036  	cmp	rsi, r10
  7037  	je	.LBB0_1013
  7038  	jmp	.LBB0_305
  7039  .LBB0_645:
  7040  	xor	edi, edi
  7041  .LBB0_648:
  7042  	test	r9b, 1
  7043  	je	.LBB0_650
  7044  # %bb.649:
  7045  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
  7046  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
  7047  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi]
  7048  	psubd	xmm0, xmm2
  7049  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
  7050  	psubd	xmm1, xmm2
  7051  	movdqu	xmmword ptr [r8 + 4*rdi], xmm0
  7052  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm1
  7053  .LBB0_650:
  7054  	cmp	rsi, r10
  7055  	jne	.LBB0_651
  7056  	jmp	.LBB0_1013
  7057  .Lfunc_end0:
  7058  	.size	arithmetic_binary_sse4, .Lfunc_end0-arithmetic_binary_sse4
  7059                                          # -- End function
  7060  	.section	.rodata.cst16,"aM",@progbits,16 # constant pool: allocatable ("a"), mergeable ("M"), 16-byte entries
  7061  	.p2align	4                               # 2^4 = 16-byte alignment -- Begin function arithmetic_arr_scalar_sse4
  7062  .LCPI1_0:                               # eight 16-bit words of 0x00ff (16 bytes); same pattern as .LCPI0_0. NOTE(review): use site not visible here -- presumably a per-word low-byte mask, confirm
  7063  	.short	255                             # 0xff (word 0)
  7064  	.short	255                             # 0xff (word 1)
  7065  	.short	255                             # 0xff (word 2)
  7066  	.short	255                             # 0xff (word 3)
  7067  	.short	255                             # 0xff (word 4)
  7068  	.short	255                             # 0xff (word 5)
  7069  	.short	255                             # 0xff (word 6)
  7070  	.short	255                             # 0xff (word 7)
  7071  	.text
  7072  	.globl	arithmetic_arr_scalar_sse4
  7073  	.p2align	4, 0x90
  7074  	.type	arithmetic_arr_scalar_sse4,@function
  7075  arithmetic_arr_scalar_sse4:             # @arithmetic_arr_scalar_sse4
  7076  # %bb.0:
  7077  	push	rbp
  7078  	mov	rbp, rsp
  7079  	and	rsp, -8
  7080  	cmp	sil, 20
  7081  	jg	.LBB1_12
  7082  # %bb.1:
  7083  	test	sil, sil
  7084  	je	.LBB1_23
  7085  # %bb.2:
  7086  	cmp	sil, 1
  7087  	je	.LBB1_31
  7088  # %bb.3:
  7089  	cmp	sil, 2
  7090  	jne	.LBB1_1069
  7091  # %bb.4:
  7092  	cmp	edi, 6
  7093  	jg	.LBB1_55
  7094  # %bb.5:
  7095  	cmp	edi, 3
  7096  	jle	.LBB1_97
  7097  # %bb.6:
  7098  	cmp	edi, 4
  7099  	je	.LBB1_157
  7100  # %bb.7:
  7101  	cmp	edi, 5
  7102  	je	.LBB1_160
  7103  # %bb.8:
  7104  	cmp	edi, 6
  7105  	jne	.LBB1_1069
  7106  # %bb.9:
  7107  	test	r9d, r9d
  7108  	jle	.LBB1_1069
  7109  # %bb.10:
  7110  	mov	eax, dword ptr [rcx]
  7111  	mov	r10d, r9d
  7112  	cmp	r9d, 8
  7113  	jb	.LBB1_11
  7114  # %bb.265:
  7115  	lea	rcx, [rdx + 4*r10]
  7116  	cmp	rcx, r8
  7117  	jbe	.LBB1_453
  7118  # %bb.266:
  7119  	lea	rcx, [r8 + 4*r10]
  7120  	cmp	rcx, rdx
  7121  	jbe	.LBB1_453
  7122  .LBB1_11:
  7123  	xor	esi, esi
  7124  .LBB1_625:
  7125  	mov	r9, rsi
  7126  	not	r9
  7127  	add	r9, r10
  7128  	mov	rdi, r10
  7129  	and	rdi, 3
  7130  	je	.LBB1_627
  7131  .LBB1_626:                              # =>This Inner Loop Header: Depth=1
  7132  	mov	ecx, dword ptr [rdx + 4*rsi]
  7133  	imul	ecx, eax
  7134  	mov	dword ptr [r8 + 4*rsi], ecx
  7135  	add	rsi, 1
  7136  	add	rdi, -1
  7137  	jne	.LBB1_626
  7138  .LBB1_627:
  7139  	cmp	r9, 3
  7140  	jb	.LBB1_1069
  7141  .LBB1_628:                              # =>This Inner Loop Header: Depth=1
  7142  	mov	ecx, dword ptr [rdx + 4*rsi]
  7143  	imul	ecx, eax
  7144  	mov	dword ptr [r8 + 4*rsi], ecx
  7145  	mov	ecx, dword ptr [rdx + 4*rsi + 4]
  7146  	imul	ecx, eax
  7147  	mov	dword ptr [r8 + 4*rsi + 4], ecx
  7148  	mov	ecx, dword ptr [rdx + 4*rsi + 8]
  7149  	imul	ecx, eax
  7150  	mov	dword ptr [r8 + 4*rsi + 8], ecx
  7151  	mov	ecx, dword ptr [rdx + 4*rsi + 12]
  7152  	imul	ecx, eax
  7153  	mov	dword ptr [r8 + 4*rsi + 12], ecx
  7154  	add	rsi, 4
  7155  	cmp	r10, rsi
  7156  	jne	.LBB1_628
  7157  	jmp	.LBB1_1069
  7158  .LBB1_12:
  7159  	cmp	sil, 21
  7160  	je	.LBB1_39
  7161  # %bb.13:
  7162  	cmp	sil, 22
  7163  	je	.LBB1_47
  7164  # %bb.14:
  7165  	cmp	sil, 23
  7166  	jne	.LBB1_1069
  7167  # %bb.15:
  7168  	cmp	edi, 6
  7169  	jg	.LBB1_62
  7170  # %bb.16:
  7171  	cmp	edi, 3
  7172  	jle	.LBB1_102
  7173  # %bb.17:
  7174  	cmp	edi, 4
  7175  	je	.LBB1_163
  7176  # %bb.18:
  7177  	cmp	edi, 5
  7178  	je	.LBB1_166
  7179  # %bb.19:
  7180  	cmp	edi, 6
  7181  	jne	.LBB1_1069
  7182  # %bb.20:
  7183  	test	r9d, r9d
  7184  	jle	.LBB1_1069
  7185  # %bb.21:
  7186  	mov	eax, dword ptr [rcx]
  7187  	mov	r10d, r9d
  7188  	cmp	r9d, 8
  7189  	jb	.LBB1_22
  7190  # %bb.268:
  7191  	lea	rcx, [rdx + 4*r10]
  7192  	cmp	rcx, r8
  7193  	jbe	.LBB1_456
  7194  # %bb.269:
  7195  	lea	rcx, [r8 + 4*r10]
  7196  	cmp	rcx, rdx
  7197  	jbe	.LBB1_456
  7198  .LBB1_22:
  7199  	xor	esi, esi
  7200  .LBB1_633:
  7201  	mov	r9, rsi
  7202  	not	r9
  7203  	add	r9, r10
  7204  	mov	rdi, r10
  7205  	and	rdi, 3
  7206  	je	.LBB1_635
  7207  .LBB1_634:                              # =>This Inner Loop Header: Depth=1
  7208  	mov	ecx, dword ptr [rdx + 4*rsi]
  7209  	imul	ecx, eax
  7210  	mov	dword ptr [r8 + 4*rsi], ecx
  7211  	add	rsi, 1
  7212  	add	rdi, -1
  7213  	jne	.LBB1_634
  7214  .LBB1_635:
  7215  	cmp	r9, 3
  7216  	jb	.LBB1_1069
  7217  .LBB1_636:                              # =>This Inner Loop Header: Depth=1
  7218  	mov	ecx, dword ptr [rdx + 4*rsi]
  7219  	imul	ecx, eax
  7220  	mov	dword ptr [r8 + 4*rsi], ecx
  7221  	mov	ecx, dword ptr [rdx + 4*rsi + 4]
  7222  	imul	ecx, eax
  7223  	mov	dword ptr [r8 + 4*rsi + 4], ecx
  7224  	mov	ecx, dword ptr [rdx + 4*rsi + 8]
  7225  	imul	ecx, eax
  7226  	mov	dword ptr [r8 + 4*rsi + 8], ecx
  7227  	mov	ecx, dword ptr [rdx + 4*rsi + 12]
  7228  	imul	ecx, eax
  7229  	mov	dword ptr [r8 + 4*rsi + 12], ecx
  7230  	add	rsi, 4
  7231  	cmp	r10, rsi
  7232  	jne	.LBB1_636
  7233  	jmp	.LBB1_1069
  7234  .LBB1_23:
  7235  	cmp	edi, 6
  7236  	jg	.LBB1_69
  7237  # %bb.24:
  7238  	cmp	edi, 3
  7239  	jle	.LBB1_107
  7240  # %bb.25:
  7241  	cmp	edi, 4
  7242  	je	.LBB1_169
  7243  # %bb.26:
  7244  	cmp	edi, 5
  7245  	je	.LBB1_172
  7246  # %bb.27:
  7247  	cmp	edi, 6
  7248  	jne	.LBB1_1069
  7249  # %bb.28:
  7250  	test	r9d, r9d
  7251  	jle	.LBB1_1069
  7252  # %bb.29:
  7253  	mov	eax, dword ptr [rcx]
  7254  	mov	r10d, r9d
  7255  	cmp	r9d, 8
  7256  	jb	.LBB1_30
  7257  # %bb.271:
  7258  	lea	rcx, [rdx + 4*r10]
  7259  	cmp	rcx, r8
  7260  	jbe	.LBB1_459
  7261  # %bb.272:
  7262  	lea	rcx, [r8 + 4*r10]
  7263  	cmp	rcx, rdx
  7264  	jbe	.LBB1_459
  7265  .LBB1_30:
  7266  	xor	esi, esi
  7267  .LBB1_641:
  7268  	mov	r9, rsi
  7269  	not	r9
  7270  	add	r9, r10
  7271  	mov	rdi, r10
  7272  	and	rdi, 3
  7273  	je	.LBB1_643
  7274  .LBB1_642:                              # =>This Inner Loop Header: Depth=1
  7275  	mov	ecx, dword ptr [rdx + 4*rsi]
  7276  	add	ecx, eax
  7277  	mov	dword ptr [r8 + 4*rsi], ecx
  7278  	add	rsi, 1
  7279  	add	rdi, -1
  7280  	jne	.LBB1_642
  7281  .LBB1_643:
  7282  	cmp	r9, 3
  7283  	jb	.LBB1_1069
  7284  .LBB1_644:                              # =>This Inner Loop Header: Depth=1
  7285  	mov	ecx, dword ptr [rdx + 4*rsi]
  7286  	add	ecx, eax
  7287  	mov	dword ptr [r8 + 4*rsi], ecx
  7288  	mov	ecx, dword ptr [rdx + 4*rsi + 4]
  7289  	add	ecx, eax
  7290  	mov	dword ptr [r8 + 4*rsi + 4], ecx
  7291  	mov	ecx, dword ptr [rdx + 4*rsi + 8]
  7292  	add	ecx, eax
  7293  	mov	dword ptr [r8 + 4*rsi + 8], ecx
  7294  	mov	ecx, dword ptr [rdx + 4*rsi + 12]
  7295  	add	ecx, eax
  7296  	mov	dword ptr [r8 + 4*rsi + 12], ecx
  7297  	add	rsi, 4
  7298  	cmp	r10, rsi
  7299  	jne	.LBB1_644
  7300  	jmp	.LBB1_1069
  7301  .LBB1_31:
  7302  	cmp	edi, 6
  7303  	jg	.LBB1_76
  7304  # %bb.32:
  7305  	cmp	edi, 3
  7306  	jle	.LBB1_112
  7307  # %bb.33:
  7308  	cmp	edi, 4
  7309  	je	.LBB1_175
  7310  # %bb.34:
  7311  	cmp	edi, 5
  7312  	je	.LBB1_178
  7313  # %bb.35:
  7314  	cmp	edi, 6
  7315  	jne	.LBB1_1069
  7316  # %bb.36:
  7317  	test	r9d, r9d
  7318  	jle	.LBB1_1069
  7319  # %bb.37:
  7320  	mov	eax, dword ptr [rcx]
  7321  	mov	r10d, r9d
  7322  	cmp	r9d, 8
  7323  	jb	.LBB1_38
  7324  # %bb.274:
  7325  	lea	rcx, [rdx + 4*r10]
  7326  	cmp	rcx, r8
  7327  	jbe	.LBB1_462
  7328  # %bb.275:
  7329  	lea	rcx, [r8 + 4*r10]
  7330  	cmp	rcx, rdx
  7331  	jbe	.LBB1_462
  7332  .LBB1_38:
  7333  	xor	esi, esi
  7334  .LBB1_649:
  7335  	mov	r9, rsi
  7336  	not	r9
  7337  	add	r9, r10
  7338  	mov	rdi, r10
  7339  	and	rdi, 3
  7340  	je	.LBB1_651
  7341  .LBB1_650:                              # =>This Inner Loop Header: Depth=1
  7342  	mov	ecx, dword ptr [rdx + 4*rsi]
  7343  	sub	ecx, eax
  7344  	mov	dword ptr [r8 + 4*rsi], ecx
  7345  	add	rsi, 1
  7346  	add	rdi, -1
  7347  	jne	.LBB1_650
  7348  .LBB1_651:
  7349  	cmp	r9, 3
  7350  	jb	.LBB1_1069
  7351  .LBB1_652:                              # =>This Inner Loop Header: Depth=1
  7352  	mov	ecx, dword ptr [rdx + 4*rsi]
  7353  	sub	ecx, eax
  7354  	mov	dword ptr [r8 + 4*rsi], ecx
  7355  	mov	ecx, dword ptr [rdx + 4*rsi + 4]
  7356  	sub	ecx, eax
  7357  	mov	dword ptr [r8 + 4*rsi + 4], ecx
  7358  	mov	ecx, dword ptr [rdx + 4*rsi + 8]
  7359  	sub	ecx, eax
  7360  	mov	dword ptr [r8 + 4*rsi + 8], ecx
  7361  	mov	ecx, dword ptr [rdx + 4*rsi + 12]
  7362  	sub	ecx, eax
  7363  	mov	dword ptr [r8 + 4*rsi + 12], ecx
  7364  	add	rsi, 4
  7365  	cmp	r10, rsi
  7366  	jne	.LBB1_652
  7367  	jmp	.LBB1_1069
  7368  .LBB1_39:
  7369  	cmp	edi, 6
  7370  	jg	.LBB1_83
  7371  # %bb.40:
  7372  	cmp	edi, 3
  7373  	jle	.LBB1_117
  7374  # %bb.41:
  7375  	cmp	edi, 4
  7376  	je	.LBB1_181
  7377  # %bb.42:
  7378  	cmp	edi, 5
  7379  	je	.LBB1_184
  7380  # %bb.43:
  7381  	cmp	edi, 6
  7382  	jne	.LBB1_1069
  7383  # %bb.44:
  7384  	test	r9d, r9d
  7385  	jle	.LBB1_1069
  7386  # %bb.45:
  7387  	mov	eax, dword ptr [rcx]
  7388  	mov	r10d, r9d
  7389  	cmp	r9d, 8
  7390  	jb	.LBB1_46
  7391  # %bb.277:
  7392  	lea	rcx, [rdx + 4*r10]
  7393  	cmp	rcx, r8
  7394  	jbe	.LBB1_465
  7395  # %bb.278:
  7396  	lea	rcx, [r8 + 4*r10]
  7397  	cmp	rcx, rdx
  7398  	jbe	.LBB1_465
  7399  .LBB1_46:
  7400  	xor	esi, esi
  7401  .LBB1_657:
  7402  	mov	r9, rsi
  7403  	not	r9
  7404  	add	r9, r10
  7405  	mov	rdi, r10
  7406  	and	rdi, 3
  7407  	je	.LBB1_659
  7408  .LBB1_658:                              # =>This Inner Loop Header: Depth=1
  7409  	mov	ecx, dword ptr [rdx + 4*rsi]
  7410  	add	ecx, eax
  7411  	mov	dword ptr [r8 + 4*rsi], ecx
  7412  	add	rsi, 1
  7413  	add	rdi, -1
  7414  	jne	.LBB1_658
  7415  .LBB1_659:
  7416  	cmp	r9, 3
  7417  	jb	.LBB1_1069
  7418  .LBB1_660:                              # =>This Inner Loop Header: Depth=1
  7419  	mov	ecx, dword ptr [rdx + 4*rsi]
  7420  	add	ecx, eax
  7421  	mov	dword ptr [r8 + 4*rsi], ecx
  7422  	mov	ecx, dword ptr [rdx + 4*rsi + 4]
  7423  	add	ecx, eax
  7424  	mov	dword ptr [r8 + 4*rsi + 4], ecx
  7425  	mov	ecx, dword ptr [rdx + 4*rsi + 8]
  7426  	add	ecx, eax
  7427  	mov	dword ptr [r8 + 4*rsi + 8], ecx
  7428  	mov	ecx, dword ptr [rdx + 4*rsi + 12]
  7429  	add	ecx, eax
  7430  	mov	dword ptr [r8 + 4*rsi + 12], ecx
  7431  	add	rsi, 4
  7432  	cmp	r10, rsi
  7433  	jne	.LBB1_660
  7434  	jmp	.LBB1_1069
  7435  .LBB1_47:
  7436  	cmp	edi, 6
  7437  	jg	.LBB1_90
  7438  # %bb.48:
  7439  	cmp	edi, 3
  7440  	jle	.LBB1_122
  7441  # %bb.49:
  7442  	cmp	edi, 4
  7443  	je	.LBB1_187
  7444  # %bb.50:
  7445  	cmp	edi, 5
  7446  	je	.LBB1_190
  7447  # %bb.51:
  7448  	cmp	edi, 6
  7449  	jne	.LBB1_1069
  7450  # %bb.52:
  7451  	test	r9d, r9d
  7452  	jle	.LBB1_1069
  7453  # %bb.53:
  7454  	mov	eax, dword ptr [rcx]
  7455  	mov	r10d, r9d
  7456  	cmp	r9d, 8
  7457  	jb	.LBB1_54
  7458  # %bb.280:
  7459  	lea	rcx, [rdx + 4*r10]
  7460  	cmp	rcx, r8
  7461  	jbe	.LBB1_468
  7462  # %bb.281:
  7463  	lea	rcx, [r8 + 4*r10]
  7464  	cmp	rcx, rdx
  7465  	jbe	.LBB1_468
  7466  .LBB1_54:
  7467  	xor	esi, esi
  7468  .LBB1_665:
  7469  	mov	r9, rsi
  7470  	not	r9
  7471  	add	r9, r10
  7472  	mov	rdi, r10
  7473  	and	rdi, 3
  7474  	je	.LBB1_667
  7475  .LBB1_666:                              # =>This Inner Loop Header: Depth=1
  7476  	mov	ecx, dword ptr [rdx + 4*rsi]
  7477  	sub	ecx, eax
  7478  	mov	dword ptr [r8 + 4*rsi], ecx
  7479  	add	rsi, 1
  7480  	add	rdi, -1
  7481  	jne	.LBB1_666
  7482  .LBB1_667:
  7483  	cmp	r9, 3
  7484  	jb	.LBB1_1069
  7485  .LBB1_668:                              # =>This Inner Loop Header: Depth=1
  7486  	mov	ecx, dword ptr [rdx + 4*rsi]
  7487  	sub	ecx, eax
  7488  	mov	dword ptr [r8 + 4*rsi], ecx
  7489  	mov	ecx, dword ptr [rdx + 4*rsi + 4]
  7490  	sub	ecx, eax
  7491  	mov	dword ptr [r8 + 4*rsi + 4], ecx
  7492  	mov	ecx, dword ptr [rdx + 4*rsi + 8]
  7493  	sub	ecx, eax
  7494  	mov	dword ptr [r8 + 4*rsi + 8], ecx
  7495  	mov	ecx, dword ptr [rdx + 4*rsi + 12]
  7496  	sub	ecx, eax
  7497  	mov	dword ptr [r8 + 4*rsi + 12], ecx
  7498  	add	rsi, 4
  7499  	cmp	r10, rsi
  7500  	jne	.LBB1_668
  7501  	jmp	.LBB1_1069
  7502  .LBB1_55:
  7503  	cmp	edi, 8
  7504  	jle	.LBB1_127
  7505  # %bb.56:
  7506  	cmp	edi, 9
  7507  	je	.LBB1_193
  7508  # %bb.57:
  7509  	cmp	edi, 11
  7510  	je	.LBB1_196
  7511  # %bb.58:
  7512  	cmp	edi, 12
  7513  	jne	.LBB1_1069
  7514  # %bb.59:
  7515  	test	r9d, r9d
  7516  	jle	.LBB1_1069
  7517  # %bb.60:
  7518  	movsd	xmm0, qword ptr [rcx]           # xmm0 = mem[0],zero
  7519  	mov	eax, r9d
  7520  	cmp	r9d, 4
  7521  	jb	.LBB1_61
  7522  # %bb.283:
  7523  	lea	rcx, [rdx + 8*rax]
  7524  	cmp	rcx, r8
  7525  	jbe	.LBB1_471
  7526  # %bb.284:
  7527  	lea	rcx, [r8 + 8*rax]
  7528  	cmp	rcx, rdx
  7529  	jbe	.LBB1_471
  7530  .LBB1_61:
  7531  	xor	ecx, ecx
  7532  .LBB1_673:
  7533  	mov	rsi, rcx
  7534  	not	rsi
  7535  	add	rsi, rax
  7536  	mov	rdi, rax
  7537  	and	rdi, 3
  7538  	je	.LBB1_675
  7539  .LBB1_674:                              # =>This Inner Loop Header: Depth=1
  7540  	movsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
  7541  	mulsd	xmm1, xmm0
  7542  	movsd	qword ptr [r8 + 8*rcx], xmm1
  7543  	add	rcx, 1
  7544  	add	rdi, -1
  7545  	jne	.LBB1_674
  7546  .LBB1_675:
  7547  	cmp	rsi, 3
  7548  	jb	.LBB1_1069
  7549  .LBB1_676:                              # =>This Inner Loop Header: Depth=1
  7550  	movsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
  7551  	mulsd	xmm1, xmm0
  7552  	movsd	qword ptr [r8 + 8*rcx], xmm1
  7553  	movsd	xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
  7554  	mulsd	xmm1, xmm0
  7555  	movsd	qword ptr [r8 + 8*rcx + 8], xmm1
  7556  	movsd	xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
  7557  	mulsd	xmm1, xmm0
  7558  	movsd	qword ptr [r8 + 8*rcx + 16], xmm1
  7559  	movsd	xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
  7560  	mulsd	xmm1, xmm0
  7561  	movsd	qword ptr [r8 + 8*rcx + 24], xmm1
  7562  	add	rcx, 4
  7563  	cmp	rax, rcx
  7564  	jne	.LBB1_676
  7565  	jmp	.LBB1_1069
  7566  .LBB1_62:
  7567  	cmp	edi, 8
  7568  	jle	.LBB1_132
  7569  # %bb.63:
  7570  	cmp	edi, 9
  7571  	je	.LBB1_199
  7572  # %bb.64:
  7573  	cmp	edi, 11
  7574  	je	.LBB1_202
  7575  # %bb.65:
  7576  	cmp	edi, 12
  7577  	jne	.LBB1_1069
  7578  # %bb.66:
  7579  	test	r9d, r9d
  7580  	jle	.LBB1_1069
  7581  # %bb.67:
  7582  	movsd	xmm0, qword ptr [rcx]           # xmm0 = mem[0],zero
  7583  	mov	eax, r9d
  7584  	cmp	r9d, 4
  7585  	jb	.LBB1_68
  7586  # %bb.286:
  7587  	lea	rcx, [rdx + 8*rax]
  7588  	cmp	rcx, r8
  7589  	jbe	.LBB1_474
  7590  # %bb.287:
  7591  	lea	rcx, [r8 + 8*rax]
  7592  	cmp	rcx, rdx
  7593  	jbe	.LBB1_474
  7594  .LBB1_68:
  7595  	xor	ecx, ecx
  7596  .LBB1_681:
  7597  	mov	rsi, rcx
  7598  	not	rsi
  7599  	add	rsi, rax
  7600  	mov	rdi, rax
  7601  	and	rdi, 3
  7602  	je	.LBB1_683
  7603  .LBB1_682:                              # =>This Inner Loop Header: Depth=1
  7604  	movsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
  7605  	mulsd	xmm1, xmm0
  7606  	movsd	qword ptr [r8 + 8*rcx], xmm1
  7607  	add	rcx, 1
  7608  	add	rdi, -1
  7609  	jne	.LBB1_682
  7610  .LBB1_683:
  7611  	cmp	rsi, 3
  7612  	jb	.LBB1_1069
  7613  .LBB1_684:                              # =>This Inner Loop Header: Depth=1
  7614  	movsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
  7615  	mulsd	xmm1, xmm0
  7616  	movsd	qword ptr [r8 + 8*rcx], xmm1
  7617  	movsd	xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
  7618  	mulsd	xmm1, xmm0
  7619  	movsd	qword ptr [r8 + 8*rcx + 8], xmm1
  7620  	movsd	xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
  7621  	mulsd	xmm1, xmm0
  7622  	movsd	qword ptr [r8 + 8*rcx + 16], xmm1
  7623  	movsd	xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
  7624  	mulsd	xmm1, xmm0
  7625  	movsd	qword ptr [r8 + 8*rcx + 24], xmm1
  7626  	add	rcx, 4
  7627  	cmp	rax, rcx
  7628  	jne	.LBB1_684
  7629  	jmp	.LBB1_1069
  7630  .LBB1_69:
  7631  	cmp	edi, 8
  7632  	jle	.LBB1_137
  7633  # %bb.70:
  7634  	cmp	edi, 9
  7635  	je	.LBB1_205
  7636  # %bb.71:
  7637  	cmp	edi, 11
  7638  	je	.LBB1_208
  7639  # %bb.72:
  7640  	cmp	edi, 12
  7641  	jne	.LBB1_1069
  7642  # %bb.73:
  7643  	test	r9d, r9d
  7644  	jle	.LBB1_1069
  7645  # %bb.74:
  7646  	movsd	xmm0, qword ptr [rcx]           # xmm0 = mem[0],zero
  7647  	mov	eax, r9d
  7648  	cmp	r9d, 4
  7649  	jb	.LBB1_75
  7650  # %bb.289:
  7651  	lea	rcx, [rdx + 8*rax]
  7652  	cmp	rcx, r8
  7653  	jbe	.LBB1_477
  7654  # %bb.290:
  7655  	lea	rcx, [r8 + 8*rax]
  7656  	cmp	rcx, rdx
  7657  	jbe	.LBB1_477
  7658  .LBB1_75:
  7659  	xor	ecx, ecx
  7660  .LBB1_689:
  7661  	mov	rsi, rcx
  7662  	not	rsi
  7663  	add	rsi, rax
  7664  	mov	rdi, rax
  7665  	and	rdi, 3
  7666  	je	.LBB1_691
  7667  .LBB1_690:                              # =>This Inner Loop Header: Depth=1
  7668  	movsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
  7669  	addsd	xmm1, xmm0
  7670  	movsd	qword ptr [r8 + 8*rcx], xmm1
  7671  	add	rcx, 1
  7672  	add	rdi, -1
  7673  	jne	.LBB1_690
  7674  .LBB1_691:
  7675  	cmp	rsi, 3
  7676  	jb	.LBB1_1069
  7677  .LBB1_692:                              # =>This Inner Loop Header: Depth=1
  7678  	movsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
  7679  	addsd	xmm1, xmm0
  7680  	movsd	qword ptr [r8 + 8*rcx], xmm1
  7681  	movsd	xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
  7682  	addsd	xmm1, xmm0
  7683  	movsd	qword ptr [r8 + 8*rcx + 8], xmm1
  7684  	movsd	xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
  7685  	addsd	xmm1, xmm0
  7686  	movsd	qword ptr [r8 + 8*rcx + 16], xmm1
  7687  	movsd	xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
  7688  	addsd	xmm1, xmm0
  7689  	movsd	qword ptr [r8 + 8*rcx + 24], xmm1
  7690  	add	rcx, 4
  7691  	cmp	rax, rcx
  7692  	jne	.LBB1_692
  7693  	jmp	.LBB1_1069
  7694  .LBB1_76:
  7695  	cmp	edi, 8
  7696  	jle	.LBB1_142
  7697  # %bb.77:
  7698  	cmp	edi, 9
  7699  	je	.LBB1_211
  7700  # %bb.78:
  7701  	cmp	edi, 11
  7702  	je	.LBB1_214
  7703  # %bb.79:
  7704  	cmp	edi, 12
  7705  	jne	.LBB1_1069
  7706  # %bb.80:
  7707  	test	r9d, r9d
  7708  	jle	.LBB1_1069
  7709  # %bb.81:
  7710  	movsd	xmm0, qword ptr [rcx]           # xmm0 = mem[0],zero
  7711  	mov	eax, r9d
  7712  	cmp	r9d, 4
  7713  	jb	.LBB1_82
  7714  # %bb.292:
  7715  	lea	rcx, [rdx + 8*rax]
  7716  	cmp	rcx, r8
  7717  	jbe	.LBB1_480
  7718  # %bb.293:
  7719  	lea	rcx, [r8 + 8*rax]
  7720  	cmp	rcx, rdx
  7721  	jbe	.LBB1_480
  7722  .LBB1_82:
  7723  	xor	ecx, ecx
  7724  .LBB1_697:
  7725  	mov	rsi, rcx
  7726  	not	rsi
  7727  	add	rsi, rax
  7728  	mov	rdi, rax
  7729  	and	rdi, 3
  7730  	je	.LBB1_699
  7731  .LBB1_698:                              # =>This Inner Loop Header: Depth=1
  7732  	movsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
  7733  	subsd	xmm1, xmm0
  7734  	movsd	qword ptr [r8 + 8*rcx], xmm1
  7735  	add	rcx, 1
  7736  	add	rdi, -1
  7737  	jne	.LBB1_698
  7738  .LBB1_699:
  7739  	cmp	rsi, 3
  7740  	jb	.LBB1_1069
  7741  .LBB1_700:                              # =>This Inner Loop Header: Depth=1
  7742  	movsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
  7743  	subsd	xmm1, xmm0
  7744  	movsd	qword ptr [r8 + 8*rcx], xmm1
  7745  	movsd	xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
  7746  	subsd	xmm1, xmm0
  7747  	movsd	qword ptr [r8 + 8*rcx + 8], xmm1
  7748  	movsd	xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
  7749  	subsd	xmm1, xmm0
  7750  	movsd	qword ptr [r8 + 8*rcx + 16], xmm1
  7751  	movsd	xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
  7752  	subsd	xmm1, xmm0
  7753  	movsd	qword ptr [r8 + 8*rcx + 24], xmm1
  7754  	add	rcx, 4
  7755  	cmp	rax, rcx
  7756  	jne	.LBB1_700
  7757  	jmp	.LBB1_1069
  7758  .LBB1_83:
  7759  	cmp	edi, 8
  7760  	jle	.LBB1_147
  7761  # %bb.84:
  7762  	cmp	edi, 9
  7763  	je	.LBB1_217
  7764  # %bb.85:
  7765  	cmp	edi, 11
  7766  	je	.LBB1_220
  7767  # %bb.86:
  7768  	cmp	edi, 12
  7769  	jne	.LBB1_1069
  7770  # %bb.87:
  7771  	test	r9d, r9d
  7772  	jle	.LBB1_1069
  7773  # %bb.88:
  7774  	movsd	xmm0, qword ptr [rcx]           # xmm0 = mem[0],zero
  7775  	mov	eax, r9d
  7776  	cmp	r9d, 4
  7777  	jb	.LBB1_89
  7778  # %bb.295:
  7779  	lea	rcx, [rdx + 8*rax]
  7780  	cmp	rcx, r8
  7781  	jbe	.LBB1_483
  7782  # %bb.296:
  7783  	lea	rcx, [r8 + 8*rax]
  7784  	cmp	rcx, rdx
  7785  	jbe	.LBB1_483
  7786  .LBB1_89:
  7787  	xor	ecx, ecx
  7788  .LBB1_705:
  7789  	mov	rsi, rcx
  7790  	not	rsi
  7791  	add	rsi, rax
  7792  	mov	rdi, rax
  7793  	and	rdi, 3
  7794  	je	.LBB1_707
  7795  .LBB1_706:                              # =>This Inner Loop Header: Depth=1
  7796  	movsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
  7797  	addsd	xmm1, xmm0
  7798  	movsd	qword ptr [r8 + 8*rcx], xmm1
  7799  	add	rcx, 1
  7800  	add	rdi, -1
  7801  	jne	.LBB1_706
  7802  .LBB1_707:
  7803  	cmp	rsi, 3
  7804  	jb	.LBB1_1069
  7805  .LBB1_708:                              # =>This Inner Loop Header: Depth=1
  7806  	movsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
  7807  	addsd	xmm1, xmm0
  7808  	movsd	qword ptr [r8 + 8*rcx], xmm1
  7809  	movsd	xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
  7810  	addsd	xmm1, xmm0
  7811  	movsd	qword ptr [r8 + 8*rcx + 8], xmm1
  7812  	movsd	xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
  7813  	addsd	xmm1, xmm0
  7814  	movsd	qword ptr [r8 + 8*rcx + 16], xmm1
  7815  	movsd	xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
  7816  	addsd	xmm1, xmm0
  7817  	movsd	qword ptr [r8 + 8*rcx + 24], xmm1
  7818  	add	rcx, 4
  7819  	cmp	rax, rcx
  7820  	jne	.LBB1_708
  7821  	jmp	.LBB1_1069
  7822  .LBB1_90:
  7823  	cmp	edi, 8
  7824  	jle	.LBB1_152
  7825  # %bb.91:
  7826  	cmp	edi, 9
  7827  	je	.LBB1_223
  7828  # %bb.92:
  7829  	cmp	edi, 11
  7830  	je	.LBB1_226
  7831  # %bb.93:
  7832  	cmp	edi, 12
  7833  	jne	.LBB1_1069
  7834  # %bb.94:
  7835  	test	r9d, r9d
  7836  	jle	.LBB1_1069
  7837  # %bb.95:
  7838  	movsd	xmm0, qword ptr [rcx]           # xmm0 = mem[0],zero
  7839  	mov	eax, r9d
  7840  	cmp	r9d, 4
  7841  	jb	.LBB1_96
  7842  # %bb.298:
  7843  	lea	rcx, [rdx + 8*rax]
  7844  	cmp	rcx, r8
  7845  	jbe	.LBB1_486
  7846  # %bb.299:
  7847  	lea	rcx, [r8 + 8*rax]
  7848  	cmp	rcx, rdx
  7849  	jbe	.LBB1_486
  7850  .LBB1_96:
  7851  	xor	ecx, ecx
  7852  .LBB1_713:
  7853  	mov	rsi, rcx
  7854  	not	rsi
  7855  	add	rsi, rax
  7856  	mov	rdi, rax
  7857  	and	rdi, 3
  7858  	je	.LBB1_715
  7859  .LBB1_714:                              # =>This Inner Loop Header: Depth=1
  7860  	movsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
  7861  	subsd	xmm1, xmm0
  7862  	movsd	qword ptr [r8 + 8*rcx], xmm1
  7863  	add	rcx, 1
  7864  	add	rdi, -1
  7865  	jne	.LBB1_714
  7866  .LBB1_715:
  7867  	cmp	rsi, 3
  7868  	jb	.LBB1_1069
  7869  .LBB1_716:                              # =>This Inner Loop Header: Depth=1
  7870  	movsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
  7871  	subsd	xmm1, xmm0
  7872  	movsd	qword ptr [r8 + 8*rcx], xmm1
  7873  	movsd	xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
  7874  	subsd	xmm1, xmm0
  7875  	movsd	qword ptr [r8 + 8*rcx + 8], xmm1
  7876  	movsd	xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
  7877  	subsd	xmm1, xmm0
  7878  	movsd	qword ptr [r8 + 8*rcx + 16], xmm1
  7879  	movsd	xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
  7880  	subsd	xmm1, xmm0
  7881  	movsd	qword ptr [r8 + 8*rcx + 24], xmm1
  7882  	add	rcx, 4
  7883  	cmp	rax, rcx
  7884  	jne	.LBB1_716
  7885  	jmp	.LBB1_1069
  7886  .LBB1_97:
  7887  	cmp	edi, 2
  7888  	je	.LBB1_229
  7889  # %bb.98:
  7890  	cmp	edi, 3
  7891  	jne	.LBB1_1069
  7892  # %bb.99:
  7893  	test	r9d, r9d
  7894  	jle	.LBB1_1069
  7895  # %bb.100:
  7896  	mov	cl, byte ptr [rcx]
  7897  	mov	r10d, r9d
  7898  	cmp	r9d, 32
  7899  	jb	.LBB1_101
  7900  # %bb.301:
  7901  	lea	rax, [rdx + r10]
  7902  	cmp	rax, r8
  7903  	jbe	.LBB1_489
  7904  # %bb.302:
  7905  	lea	rax, [r8 + r10]
  7906  	cmp	rax, rdx
  7907  	jbe	.LBB1_489
  7908  .LBB1_101:
  7909  	xor	edi, edi
  7910  .LBB1_721:
  7911  	mov	r9, rdi
  7912  	not	r9
  7913  	add	r9, r10
  7914  	mov	rsi, r10
  7915  	and	rsi, 3
  7916  	je	.LBB1_723
  7917  .LBB1_722:                              # =>This Inner Loop Header: Depth=1
  7918  	movzx	eax, byte ptr [rdx + rdi]
  7919  	mul	cl
  7920  	mov	byte ptr [r8 + rdi], al
  7921  	add	rdi, 1
  7922  	add	rsi, -1
  7923  	jne	.LBB1_722
  7924  .LBB1_723:
  7925  	cmp	r9, 3
  7926  	jb	.LBB1_1069
  7927  .LBB1_724:                              # =>This Inner Loop Header: Depth=1
  7928  	movzx	eax, byte ptr [rdx + rdi]
  7929  	mul	cl
  7930  	mov	byte ptr [r8 + rdi], al
  7931  	movzx	eax, byte ptr [rdx + rdi + 1]
  7932  	mul	cl
  7933  	mov	byte ptr [r8 + rdi + 1], al
  7934  	movzx	eax, byte ptr [rdx + rdi + 2]
  7935  	mul	cl
  7936  	mov	byte ptr [r8 + rdi + 2], al
  7937  	movzx	eax, byte ptr [rdx + rdi + 3]
  7938  	mul	cl
  7939  	mov	byte ptr [r8 + rdi + 3], al
  7940  	add	rdi, 4
  7941  	cmp	r10, rdi
  7942  	jne	.LBB1_724
  7943  	jmp	.LBB1_1069
  7944  .LBB1_102:
  7945  	cmp	edi, 2
  7946  	je	.LBB1_232
  7947  # %bb.103:
  7948  	cmp	edi, 3
  7949  	jne	.LBB1_1069
  7950  # %bb.104:
  7951  	test	r9d, r9d
  7952  	jle	.LBB1_1069
  7953  # %bb.105:
  7954  	mov	cl, byte ptr [rcx]
  7955  	mov	r10d, r9d
  7956  	cmp	r9d, 32
  7957  	jb	.LBB1_106
  7958  # %bb.304:
  7959  	lea	rax, [rdx + r10]
  7960  	cmp	rax, r8
  7961  	jbe	.LBB1_492
  7962  # %bb.305:
  7963  	lea	rax, [r8 + r10]
  7964  	cmp	rax, rdx
  7965  	jbe	.LBB1_492
  7966  .LBB1_106:
  7967  	xor	edi, edi
  7968  .LBB1_729:
  7969  	mov	r9, rdi
  7970  	not	r9
  7971  	add	r9, r10
  7972  	mov	rsi, r10
  7973  	and	rsi, 3
  7974  	je	.LBB1_731
  7975  .LBB1_730:                              # =>This Inner Loop Header: Depth=1
  7976  	movzx	eax, byte ptr [rdx + rdi]
  7977  	mul	cl
  7978  	mov	byte ptr [r8 + rdi], al
  7979  	add	rdi, 1
  7980  	add	rsi, -1
  7981  	jne	.LBB1_730
  7982  .LBB1_731:
  7983  	cmp	r9, 3
  7984  	jb	.LBB1_1069
  7985  .LBB1_732:                              # =>This Inner Loop Header: Depth=1
  7986  	movzx	eax, byte ptr [rdx + rdi]
  7987  	mul	cl
  7988  	mov	byte ptr [r8 + rdi], al
  7989  	movzx	eax, byte ptr [rdx + rdi + 1]
  7990  	mul	cl
  7991  	mov	byte ptr [r8 + rdi + 1], al
  7992  	movzx	eax, byte ptr [rdx + rdi + 2]
  7993  	mul	cl
  7994  	mov	byte ptr [r8 + rdi + 2], al
  7995  	movzx	eax, byte ptr [rdx + rdi + 3]
  7996  	mul	cl
  7997  	mov	byte ptr [r8 + rdi + 3], al
  7998  	add	rdi, 4
  7999  	cmp	r10, rdi
  8000  	jne	.LBB1_732
  8001  	jmp	.LBB1_1069
  8002  .LBB1_107:
  8003  	cmp	edi, 2
  8004  	je	.LBB1_235
  8005  # %bb.108:
  8006  	cmp	edi, 3
  8007  	jne	.LBB1_1069
  8008  # %bb.109:
  8009  	test	r9d, r9d
  8010  	jle	.LBB1_1069
  8011  # %bb.110:
  8012  	mov	al, byte ptr [rcx]
  8013  	mov	r10d, r9d
  8014  	cmp	r9d, 32
  8015  	jb	.LBB1_111
  8016  # %bb.307:
  8017  	lea	rcx, [rdx + r10]
  8018  	cmp	rcx, r8
  8019  	jbe	.LBB1_495
  8020  # %bb.308:
  8021  	lea	rcx, [r8 + r10]
  8022  	cmp	rcx, rdx
  8023  	jbe	.LBB1_495
  8024  .LBB1_111:
  8025  	xor	esi, esi
  8026  .LBB1_737:
  8027  	mov	r9, rsi
  8028  	not	r9
  8029  	add	r9, r10
  8030  	mov	rdi, r10
  8031  	and	rdi, 3
  8032  	je	.LBB1_739
  8033  .LBB1_738:                              # =>This Inner Loop Header: Depth=1
  8034  	movzx	ecx, byte ptr [rdx + rsi]
  8035  	add	cl, al
  8036  	mov	byte ptr [r8 + rsi], cl
  8037  	add	rsi, 1
  8038  	add	rdi, -1
  8039  	jne	.LBB1_738
  8040  .LBB1_739:
  8041  	cmp	r9, 3
  8042  	jb	.LBB1_1069
  8043  .LBB1_740:                              # =>This Inner Loop Header: Depth=1
  8044  	movzx	ecx, byte ptr [rdx + rsi]
  8045  	add	cl, al
  8046  	mov	byte ptr [r8 + rsi], cl
  8047  	movzx	ecx, byte ptr [rdx + rsi + 1]
  8048  	add	cl, al
  8049  	mov	byte ptr [r8 + rsi + 1], cl
  8050  	movzx	ecx, byte ptr [rdx + rsi + 2]
  8051  	add	cl, al
  8052  	mov	byte ptr [r8 + rsi + 2], cl
  8053  	movzx	ecx, byte ptr [rdx + rsi + 3]
  8054  	add	cl, al
  8055  	mov	byte ptr [r8 + rsi + 3], cl
  8056  	add	rsi, 4
  8057  	cmp	r10, rsi
  8058  	jne	.LBB1_740
  8059  	jmp	.LBB1_1069
  8060  .LBB1_112:
  8061  	cmp	edi, 2
  8062  	je	.LBB1_238
  8063  # %bb.113:
  8064  	cmp	edi, 3
  8065  	jne	.LBB1_1069
  8066  # %bb.114:
  8067  	test	r9d, r9d
  8068  	jle	.LBB1_1069
  8069  # %bb.115:
  8070  	mov	al, byte ptr [rcx]
  8071  	mov	r10d, r9d
  8072  	cmp	r9d, 32
  8073  	jb	.LBB1_116
  8074  # %bb.310:
  8075  	lea	rcx, [rdx + r10]
  8076  	cmp	rcx, r8
  8077  	jbe	.LBB1_498
  8078  # %bb.311:
  8079  	lea	rcx, [r8 + r10]
  8080  	cmp	rcx, rdx
  8081  	jbe	.LBB1_498
  8082  .LBB1_116:
  8083  	xor	esi, esi
  8084  .LBB1_745:
  8085  	mov	r9, rsi
  8086  	not	r9
  8087  	add	r9, r10
  8088  	mov	rdi, r10
  8089  	and	rdi, 3
  8090  	je	.LBB1_747
  8091  .LBB1_746:                              # =>This Inner Loop Header: Depth=1
  8092  	movzx	ecx, byte ptr [rdx + rsi]
  8093  	sub	cl, al
  8094  	mov	byte ptr [r8 + rsi], cl
  8095  	add	rsi, 1
  8096  	add	rdi, -1
  8097  	jne	.LBB1_746
  8098  .LBB1_747:
  8099  	cmp	r9, 3
  8100  	jb	.LBB1_1069
  8101  .LBB1_748:                              # =>This Inner Loop Header: Depth=1
  8102  	movzx	ecx, byte ptr [rdx + rsi]
  8103  	sub	cl, al
  8104  	mov	byte ptr [r8 + rsi], cl
  8105  	movzx	ecx, byte ptr [rdx + rsi + 1]
  8106  	sub	cl, al
  8107  	mov	byte ptr [r8 + rsi + 1], cl
  8108  	movzx	ecx, byte ptr [rdx + rsi + 2]
  8109  	sub	cl, al
  8110  	mov	byte ptr [r8 + rsi + 2], cl
  8111  	movzx	ecx, byte ptr [rdx + rsi + 3]
  8112  	sub	cl, al
  8113  	mov	byte ptr [r8 + rsi + 3], cl
  8114  	add	rsi, 4
  8115  	cmp	r10, rsi
  8116  	jne	.LBB1_748
  8117  	jmp	.LBB1_1069
  8118  .LBB1_117:
  8119  	cmp	edi, 2
  8120  	je	.LBB1_241
  8121  # %bb.118:
  8122  	cmp	edi, 3
  8123  	jne	.LBB1_1069
  8124  # %bb.119:
  8125  	test	r9d, r9d
  8126  	jle	.LBB1_1069
  8127  # %bb.120:
  8128  	mov	al, byte ptr [rcx]
  8129  	mov	r10d, r9d
  8130  	cmp	r9d, 32
  8131  	jb	.LBB1_121
  8132  # %bb.313:
  8133  	lea	rcx, [rdx + r10]
  8134  	cmp	rcx, r8
  8135  	jbe	.LBB1_501
  8136  # %bb.314:
  8137  	lea	rcx, [r8 + r10]
  8138  	cmp	rcx, rdx
  8139  	jbe	.LBB1_501
  8140  .LBB1_121:
  8141  	xor	esi, esi
  8142  .LBB1_753:
  8143  	mov	r9, rsi
  8144  	not	r9
  8145  	add	r9, r10
  8146  	mov	rdi, r10
  8147  	and	rdi, 3
  8148  	je	.LBB1_755
  8149  .LBB1_754:                              # =>This Inner Loop Header: Depth=1
  8150  	movzx	ecx, byte ptr [rdx + rsi]
  8151  	add	cl, al
  8152  	mov	byte ptr [r8 + rsi], cl
  8153  	add	rsi, 1
  8154  	add	rdi, -1
  8155  	jne	.LBB1_754
  8156  .LBB1_755:
  8157  	cmp	r9, 3
  8158  	jb	.LBB1_1069
  8159  .LBB1_756:                              # =>This Inner Loop Header: Depth=1
  8160  	movzx	ecx, byte ptr [rdx + rsi]
  8161  	add	cl, al
  8162  	mov	byte ptr [r8 + rsi], cl
  8163  	movzx	ecx, byte ptr [rdx + rsi + 1]
  8164  	add	cl, al
  8165  	mov	byte ptr [r8 + rsi + 1], cl
  8166  	movzx	ecx, byte ptr [rdx + rsi + 2]
  8167  	add	cl, al
  8168  	mov	byte ptr [r8 + rsi + 2], cl
  8169  	movzx	ecx, byte ptr [rdx + rsi + 3]
  8170  	add	cl, al
  8171  	mov	byte ptr [r8 + rsi + 3], cl
  8172  	add	rsi, 4
  8173  	cmp	r10, rsi
  8174  	jne	.LBB1_756
  8175  	jmp	.LBB1_1069
  8176  .LBB1_122:
  8177  	cmp	edi, 2
  8178  	je	.LBB1_244
  8179  # %bb.123:
  8180  	cmp	edi, 3
  8181  	jne	.LBB1_1069
  8182  # %bb.124:
  8183  	test	r9d, r9d
  8184  	jle	.LBB1_1069
  8185  # %bb.125:
  8186  	mov	al, byte ptr [rcx]
  8187  	mov	r10d, r9d
  8188  	cmp	r9d, 32
  8189  	jb	.LBB1_126
  8190  # %bb.316:
  8191  	lea	rcx, [rdx + r10]
  8192  	cmp	rcx, r8
  8193  	jbe	.LBB1_504
  8194  # %bb.317:
  8195  	lea	rcx, [r8 + r10]
  8196  	cmp	rcx, rdx
  8197  	jbe	.LBB1_504
  8198  .LBB1_126:
  8199  	xor	esi, esi
  8200  .LBB1_761:
  8201  	mov	r9, rsi
  8202  	not	r9
  8203  	add	r9, r10
  8204  	mov	rdi, r10
  8205  	and	rdi, 3
  8206  	je	.LBB1_763
  8207  .LBB1_762:                              # =>This Inner Loop Header: Depth=1
  8208  	movzx	ecx, byte ptr [rdx + rsi]
  8209  	sub	cl, al
  8210  	mov	byte ptr [r8 + rsi], cl
  8211  	add	rsi, 1
  8212  	add	rdi, -1
  8213  	jne	.LBB1_762
  8214  .LBB1_763:
  8215  	cmp	r9, 3
  8216  	jb	.LBB1_1069
  8217  .LBB1_764:                              # =>This Inner Loop Header: Depth=1
  8218  	movzx	ecx, byte ptr [rdx + rsi]
  8219  	sub	cl, al
  8220  	mov	byte ptr [r8 + rsi], cl
  8221  	movzx	ecx, byte ptr [rdx + rsi + 1]
  8222  	sub	cl, al
  8223  	mov	byte ptr [r8 + rsi + 1], cl
  8224  	movzx	ecx, byte ptr [rdx + rsi + 2]
  8225  	sub	cl, al
  8226  	mov	byte ptr [r8 + rsi + 2], cl
  8227  	movzx	ecx, byte ptr [rdx + rsi + 3]
  8228  	sub	cl, al
  8229  	mov	byte ptr [r8 + rsi + 3], cl
  8230  	add	rsi, 4
  8231  	cmp	r10, rsi
  8232  	jne	.LBB1_764
  8233  	jmp	.LBB1_1069
  8234  .LBB1_127:
  8235  	cmp	edi, 7
  8236  	je	.LBB1_247
  8237  # %bb.128:
  8238  	cmp	edi, 8
  8239  	jne	.LBB1_1069
  8240  # %bb.129:
  8241  	test	r9d, r9d
  8242  	jle	.LBB1_1069
  8243  # %bb.130:
  8244  	mov	rax, qword ptr [rcx]
  8245  	mov	esi, r9d
  8246  	lea	rdi, [rsi - 1]
  8247  	mov	r9d, esi
  8248  	and	r9d, 3
  8249  	cmp	rdi, 3
  8250  	jae	.LBB1_319
  8251  # %bb.131:
  8252  	xor	edi, edi
  8253  	jmp	.LBB1_321
  8254  .LBB1_132:
  8255  	cmp	edi, 7
  8256  	je	.LBB1_250
  8257  # %bb.133:
  8258  	cmp	edi, 8
  8259  	jne	.LBB1_1069
  8260  # %bb.134:
  8261  	test	r9d, r9d
  8262  	jle	.LBB1_1069
  8263  # %bb.135:
  8264  	mov	rax, qword ptr [rcx]
  8265  	mov	esi, r9d
  8266  	lea	rdi, [rsi - 1]
  8267  	mov	r9d, esi
  8268  	and	r9d, 3
  8269  	cmp	rdi, 3
  8270  	jae	.LBB1_324
  8271  # %bb.136:
  8272  	xor	edi, edi
  8273  	jmp	.LBB1_326
  8274  .LBB1_137:
  8275  	cmp	edi, 7
  8276  	je	.LBB1_253
  8277  # %bb.138:
  8278  	cmp	edi, 8
  8279  	jne	.LBB1_1069
  8280  # %bb.139:
  8281  	test	r9d, r9d
  8282  	jle	.LBB1_1069
  8283  # %bb.140:
  8284  	mov	rax, qword ptr [rcx]
  8285  	mov	r10d, r9d
  8286  	cmp	r9d, 4
  8287  	jb	.LBB1_141
  8288  # %bb.329:
  8289  	lea	rcx, [rdx + 8*r10]
  8290  	cmp	rcx, r8
  8291  	jbe	.LBB1_507
  8292  # %bb.330:
  8293  	lea	rcx, [r8 + 8*r10]
  8294  	cmp	rcx, rdx
  8295  	jbe	.LBB1_507
  8296  .LBB1_141:
  8297  	xor	esi, esi
  8298  .LBB1_769:
  8299  	mov	r9, rsi
  8300  	not	r9
  8301  	add	r9, r10
  8302  	mov	rdi, r10
  8303  	and	rdi, 3
  8304  	je	.LBB1_771
  8305  .LBB1_770:                              # =>This Inner Loop Header: Depth=1
  8306  	mov	rcx, qword ptr [rdx + 8*rsi]
  8307  	add	rcx, rax
  8308  	mov	qword ptr [r8 + 8*rsi], rcx
  8309  	add	rsi, 1
  8310  	add	rdi, -1
  8311  	jne	.LBB1_770
  8312  .LBB1_771:
  8313  	cmp	r9, 3
  8314  	jb	.LBB1_1069
  8315  .LBB1_772:                              # =>This Inner Loop Header: Depth=1
  8316  	mov	rcx, qword ptr [rdx + 8*rsi]
  8317  	add	rcx, rax
  8318  	mov	qword ptr [r8 + 8*rsi], rcx
  8319  	mov	rcx, qword ptr [rdx + 8*rsi + 8]
  8320  	add	rcx, rax
  8321  	mov	qword ptr [r8 + 8*rsi + 8], rcx
  8322  	mov	rcx, qword ptr [rdx + 8*rsi + 16]
  8323  	add	rcx, rax
  8324  	mov	qword ptr [r8 + 8*rsi + 16], rcx
  8325  	mov	rcx, qword ptr [rdx + 8*rsi + 24]
  8326  	add	rcx, rax
  8327  	mov	qword ptr [r8 + 8*rsi + 24], rcx
  8328  	add	rsi, 4
  8329  	cmp	r10, rsi
  8330  	jne	.LBB1_772
  8331  	jmp	.LBB1_1069
  8332  .LBB1_142:
  8333  	cmp	edi, 7
  8334  	je	.LBB1_256
  8335  # %bb.143:
  8336  	cmp	edi, 8
  8337  	jne	.LBB1_1069
  8338  # %bb.144:
  8339  	test	r9d, r9d
  8340  	jle	.LBB1_1069
  8341  # %bb.145:
  8342  	mov	rax, qword ptr [rcx]
  8343  	mov	r10d, r9d
  8344  	cmp	r9d, 4
  8345  	jb	.LBB1_146
  8346  # %bb.332:
  8347  	lea	rcx, [rdx + 8*r10]
  8348  	cmp	rcx, r8
  8349  	jbe	.LBB1_510
  8350  # %bb.333:
  8351  	lea	rcx, [r8 + 8*r10]
  8352  	cmp	rcx, rdx
  8353  	jbe	.LBB1_510
  8354  .LBB1_146:
  8355  	xor	esi, esi
  8356  .LBB1_777:
  8357  	mov	r9, rsi
  8358  	not	r9
  8359  	add	r9, r10
  8360  	mov	rdi, r10
  8361  	and	rdi, 3
  8362  	je	.LBB1_779
  8363  .LBB1_778:                              # =>This Inner Loop Header: Depth=1
  8364  	mov	rcx, qword ptr [rdx + 8*rsi]
  8365  	sub	rcx, rax
  8366  	mov	qword ptr [r8 + 8*rsi], rcx
  8367  	add	rsi, 1
  8368  	add	rdi, -1
  8369  	jne	.LBB1_778
  8370  .LBB1_779:
  8371  	cmp	r9, 3
  8372  	jb	.LBB1_1069
  8373  .LBB1_780:                              # =>This Inner Loop Header: Depth=1
  8374  	mov	rcx, qword ptr [rdx + 8*rsi]
  8375  	sub	rcx, rax
  8376  	mov	qword ptr [r8 + 8*rsi], rcx
  8377  	mov	rcx, qword ptr [rdx + 8*rsi + 8]
  8378  	sub	rcx, rax
  8379  	mov	qword ptr [r8 + 8*rsi + 8], rcx
  8380  	mov	rcx, qword ptr [rdx + 8*rsi + 16]
  8381  	sub	rcx, rax
  8382  	mov	qword ptr [r8 + 8*rsi + 16], rcx
  8383  	mov	rcx, qword ptr [rdx + 8*rsi + 24]
  8384  	sub	rcx, rax
  8385  	mov	qword ptr [r8 + 8*rsi + 24], rcx
  8386  	add	rsi, 4
  8387  	cmp	r10, rsi
  8388  	jne	.LBB1_780
  8389  	jmp	.LBB1_1069
  8390  .LBB1_147:
  8391  	cmp	edi, 7
  8392  	je	.LBB1_259
  8393  # %bb.148:
  8394  	cmp	edi, 8
  8395  	jne	.LBB1_1069
  8396  # %bb.149:
  8397  	test	r9d, r9d
  8398  	jle	.LBB1_1069
  8399  # %bb.150:
  8400  	mov	rax, qword ptr [rcx]
  8401  	mov	r10d, r9d
  8402  	cmp	r9d, 4
  8403  	jb	.LBB1_151
  8404  # %bb.335:
  8405  	lea	rcx, [rdx + 8*r10]
  8406  	cmp	rcx, r8
  8407  	jbe	.LBB1_513
  8408  # %bb.336:
  8409  	lea	rcx, [r8 + 8*r10]
  8410  	cmp	rcx, rdx
  8411  	jbe	.LBB1_513
  8412  .LBB1_151:
  8413  	xor	esi, esi
  8414  .LBB1_785:
  8415  	mov	r9, rsi
  8416  	not	r9
  8417  	add	r9, r10
  8418  	mov	rdi, r10
  8419  	and	rdi, 3
  8420  	je	.LBB1_787
  8421  .LBB1_786:                              # =>This Inner Loop Header: Depth=1
  8422  	mov	rcx, qword ptr [rdx + 8*rsi]
  8423  	add	rcx, rax
  8424  	mov	qword ptr [r8 + 8*rsi], rcx
  8425  	add	rsi, 1
  8426  	add	rdi, -1
  8427  	jne	.LBB1_786
  8428  .LBB1_787:
  8429  	cmp	r9, 3
  8430  	jb	.LBB1_1069
  8431  .LBB1_788:                              # =>This Inner Loop Header: Depth=1
  8432  	mov	rcx, qword ptr [rdx + 8*rsi]
  8433  	add	rcx, rax
  8434  	mov	qword ptr [r8 + 8*rsi], rcx
  8435  	mov	rcx, qword ptr [rdx + 8*rsi + 8]
  8436  	add	rcx, rax
  8437  	mov	qword ptr [r8 + 8*rsi + 8], rcx
  8438  	mov	rcx, qword ptr [rdx + 8*rsi + 16]
  8439  	add	rcx, rax
  8440  	mov	qword ptr [r8 + 8*rsi + 16], rcx
  8441  	mov	rcx, qword ptr [rdx + 8*rsi + 24]
  8442  	add	rcx, rax
  8443  	mov	qword ptr [r8 + 8*rsi + 24], rcx
  8444  	add	rsi, 4
  8445  	cmp	r10, rsi
  8446  	jne	.LBB1_788
  8447  	jmp	.LBB1_1069
  8448  .LBB1_152:
  8449  	cmp	edi, 7
  8450  	je	.LBB1_262
  8451  # %bb.153:
  8452  	cmp	edi, 8
  8453  	jne	.LBB1_1069
  8454  # %bb.154:
  8455  	test	r9d, r9d
  8456  	jle	.LBB1_1069
  8457  # %bb.155:
  8458  	mov	rax, qword ptr [rcx]
  8459  	mov	r10d, r9d
  8460  	cmp	r9d, 4
  8461  	jb	.LBB1_156
  8462  # %bb.338:
  8463  	lea	rcx, [rdx + 8*r10]
  8464  	cmp	rcx, r8
  8465  	jbe	.LBB1_516
  8466  # %bb.339:
  8467  	lea	rcx, [r8 + 8*r10]
  8468  	cmp	rcx, rdx
  8469  	jbe	.LBB1_516
  8470  .LBB1_156:
  8471  	xor	esi, esi
  8472  .LBB1_793:
  8473  	mov	r9, rsi
  8474  	not	r9
  8475  	add	r9, r10
  8476  	mov	rdi, r10
  8477  	and	rdi, 3
  8478  	je	.LBB1_795
  8479  .LBB1_794:                              # =>This Inner Loop Header: Depth=1
  8480  	mov	rcx, qword ptr [rdx + 8*rsi]
  8481  	sub	rcx, rax
  8482  	mov	qword ptr [r8 + 8*rsi], rcx
  8483  	add	rsi, 1
  8484  	add	rdi, -1
  8485  	jne	.LBB1_794
  8486  .LBB1_795:
  8487  	cmp	r9, 3
  8488  	jb	.LBB1_1069
  8489  .LBB1_796:                              # =>This Inner Loop Header: Depth=1
  8490  	mov	rcx, qword ptr [rdx + 8*rsi]
  8491  	sub	rcx, rax
  8492  	mov	qword ptr [r8 + 8*rsi], rcx
  8493  	mov	rcx, qword ptr [rdx + 8*rsi + 8]
  8494  	sub	rcx, rax
  8495  	mov	qword ptr [r8 + 8*rsi + 8], rcx
  8496  	mov	rcx, qword ptr [rdx + 8*rsi + 16]
  8497  	sub	rcx, rax
  8498  	mov	qword ptr [r8 + 8*rsi + 16], rcx
  8499  	mov	rcx, qword ptr [rdx + 8*rsi + 24]
  8500  	sub	rcx, rax
  8501  	mov	qword ptr [r8 + 8*rsi + 24], rcx
  8502  	add	rsi, 4
  8503  	cmp	r10, rsi
  8504  	jne	.LBB1_796
  8505  	jmp	.LBB1_1069
  8506  .LBB1_157:
  8507  	test	r9d, r9d
  8508  	jle	.LBB1_1069
  8509  # %bb.158:
  8510  	movzx	eax, word ptr [rcx]
  8511  	mov	r10d, r9d
  8512  	cmp	r9d, 16
  8513  	jb	.LBB1_159
  8514  # %bb.341:
  8515  	lea	rcx, [rdx + 2*r10]
  8516  	cmp	rcx, r8
  8517  	jbe	.LBB1_519
  8518  # %bb.342:
  8519  	lea	rcx, [r8 + 2*r10]
  8520  	cmp	rcx, rdx
  8521  	jbe	.LBB1_519
  8522  .LBB1_159:
  8523  	xor	esi, esi
  8524  .LBB1_801:
  8525  	mov	r9, rsi
  8526  	not	r9
  8527  	add	r9, r10
  8528  	mov	rdi, r10
  8529  	and	rdi, 3
  8530  	je	.LBB1_803
  8531  .LBB1_802:                              # =>This Inner Loop Header: Depth=1
  8532  	movzx	ecx, word ptr [rdx + 2*rsi]
  8533  	imul	cx, ax
  8534  	mov	word ptr [r8 + 2*rsi], cx
  8535  	add	rsi, 1
  8536  	add	rdi, -1
  8537  	jne	.LBB1_802
  8538  .LBB1_803:
  8539  	cmp	r9, 3
  8540  	jb	.LBB1_1069
  8541  .LBB1_804:                              # =>This Inner Loop Header: Depth=1
  8542  	movzx	ecx, word ptr [rdx + 2*rsi]
  8543  	imul	cx, ax
  8544  	mov	word ptr [r8 + 2*rsi], cx
  8545  	movzx	ecx, word ptr [rdx + 2*rsi + 2]
  8546  	imul	cx, ax
  8547  	mov	word ptr [r8 + 2*rsi + 2], cx
  8548  	movzx	ecx, word ptr [rdx + 2*rsi + 4]
  8549  	imul	cx, ax
  8550  	mov	word ptr [r8 + 2*rsi + 4], cx
  8551  	movzx	ecx, word ptr [rdx + 2*rsi + 6]
  8552  	imul	cx, ax
  8553  	mov	word ptr [r8 + 2*rsi + 6], cx
  8554  	add	rsi, 4
  8555  	cmp	r10, rsi
  8556  	jne	.LBB1_804
  8557  	jmp	.LBB1_1069
  8558  .LBB1_160:
  8559  	test	r9d, r9d
  8560  	jle	.LBB1_1069
  8561  # %bb.161:
  8562  	movzx	eax, word ptr [rcx]
  8563  	mov	r10d, r9d
  8564  	cmp	r9d, 16
  8565  	jb	.LBB1_162
  8566  # %bb.344:
  8567  	lea	rcx, [rdx + 2*r10]
  8568  	cmp	rcx, r8
  8569  	jbe	.LBB1_522
  8570  # %bb.345:
  8571  	lea	rcx, [r8 + 2*r10]
  8572  	cmp	rcx, rdx
  8573  	jbe	.LBB1_522
  8574  .LBB1_162:
  8575  	xor	esi, esi
  8576  .LBB1_809:
  8577  	mov	r9, rsi
  8578  	not	r9
  8579  	add	r9, r10
  8580  	mov	rdi, r10
  8581  	and	rdi, 3
  8582  	je	.LBB1_811
  8583  .LBB1_810:                              # =>This Inner Loop Header: Depth=1
  8584  	movzx	ecx, word ptr [rdx + 2*rsi]
  8585  	imul	cx, ax
  8586  	mov	word ptr [r8 + 2*rsi], cx
  8587  	add	rsi, 1
  8588  	add	rdi, -1
  8589  	jne	.LBB1_810
  8590  .LBB1_811:
  8591  	cmp	r9, 3
  8592  	jb	.LBB1_1069
  8593  .LBB1_812:                              # =>This Inner Loop Header: Depth=1
  8594  	movzx	ecx, word ptr [rdx + 2*rsi]
  8595  	imul	cx, ax
  8596  	mov	word ptr [r8 + 2*rsi], cx
  8597  	movzx	ecx, word ptr [rdx + 2*rsi + 2]
  8598  	imul	cx, ax
  8599  	mov	word ptr [r8 + 2*rsi + 2], cx
  8600  	movzx	ecx, word ptr [rdx + 2*rsi + 4]
  8601  	imul	cx, ax
  8602  	mov	word ptr [r8 + 2*rsi + 4], cx
  8603  	movzx	ecx, word ptr [rdx + 2*rsi + 6]
  8604  	imul	cx, ax
  8605  	mov	word ptr [r8 + 2*rsi + 6], cx
  8606  	add	rsi, 4
  8607  	cmp	r10, rsi
  8608  	jne	.LBB1_812
  8609  	jmp	.LBB1_1069
  8610  .LBB1_163:
  8611  	test	r9d, r9d
  8612  	jle	.LBB1_1069
  8613  # %bb.164:
  8614  	movzx	eax, word ptr [rcx]
  8615  	mov	r10d, r9d
  8616  	cmp	r9d, 16
  8617  	jb	.LBB1_165
  8618  # %bb.347:
  8619  	lea	rcx, [rdx + 2*r10]
  8620  	cmp	rcx, r8
  8621  	jbe	.LBB1_525
  8622  # %bb.348:
  8623  	lea	rcx, [r8 + 2*r10]
  8624  	cmp	rcx, rdx
  8625  	jbe	.LBB1_525
  8626  .LBB1_165:
  8627  	xor	esi, esi
  8628  .LBB1_817:
  8629  	mov	r9, rsi
  8630  	not	r9
  8631  	add	r9, r10
  8632  	mov	rdi, r10
  8633  	and	rdi, 3
  8634  	je	.LBB1_819
  8635  .LBB1_818:                              # =>This Inner Loop Header: Depth=1
  8636  	movzx	ecx, word ptr [rdx + 2*rsi]
  8637  	imul	cx, ax
  8638  	mov	word ptr [r8 + 2*rsi], cx
  8639  	add	rsi, 1
  8640  	add	rdi, -1
  8641  	jne	.LBB1_818
  8642  .LBB1_819:
  8643  	cmp	r9, 3
  8644  	jb	.LBB1_1069
  8645  .LBB1_820:                              # =>This Inner Loop Header: Depth=1
  8646  	movzx	ecx, word ptr [rdx + 2*rsi]
  8647  	imul	cx, ax
  8648  	mov	word ptr [r8 + 2*rsi], cx
  8649  	movzx	ecx, word ptr [rdx + 2*rsi + 2]
  8650  	imul	cx, ax
  8651  	mov	word ptr [r8 + 2*rsi + 2], cx
  8652  	movzx	ecx, word ptr [rdx + 2*rsi + 4]
  8653  	imul	cx, ax
  8654  	mov	word ptr [r8 + 2*rsi + 4], cx
  8655  	movzx	ecx, word ptr [rdx + 2*rsi + 6]
  8656  	imul	cx, ax
  8657  	mov	word ptr [r8 + 2*rsi + 6], cx
  8658  	add	rsi, 4
  8659  	cmp	r10, rsi
  8660  	jne	.LBB1_820
  8661  	jmp	.LBB1_1069
  8662  .LBB1_166:
  8663  	test	r9d, r9d
  8664  	jle	.LBB1_1069
  8665  # %bb.167:
  8666  	movzx	eax, word ptr [rcx]
  8667  	mov	r10d, r9d
  8668  	cmp	r9d, 16
  8669  	jb	.LBB1_168
  8670  # %bb.350:
  8671  	lea	rcx, [rdx + 2*r10]
  8672  	cmp	rcx, r8
  8673  	jbe	.LBB1_528
  8674  # %bb.351:
  8675  	lea	rcx, [r8 + 2*r10]
  8676  	cmp	rcx, rdx
  8677  	jbe	.LBB1_528
  8678  .LBB1_168:
  8679  	xor	esi, esi
  8680  .LBB1_825:
  8681  	mov	r9, rsi
  8682  	not	r9
  8683  	add	r9, r10
  8684  	mov	rdi, r10
  8685  	and	rdi, 3
  8686  	je	.LBB1_827
  8687  .LBB1_826:                              # =>This Inner Loop Header: Depth=1
  8688  	movzx	ecx, word ptr [rdx + 2*rsi]
  8689  	imul	cx, ax
  8690  	mov	word ptr [r8 + 2*rsi], cx
  8691  	add	rsi, 1
  8692  	add	rdi, -1
  8693  	jne	.LBB1_826
  8694  .LBB1_827:
  8695  	cmp	r9, 3
  8696  	jb	.LBB1_1069
  8697  .LBB1_828:                              # =>This Inner Loop Header: Depth=1
  8698  	movzx	ecx, word ptr [rdx + 2*rsi]
  8699  	imul	cx, ax
  8700  	mov	word ptr [r8 + 2*rsi], cx
  8701  	movzx	ecx, word ptr [rdx + 2*rsi + 2]
  8702  	imul	cx, ax
  8703  	mov	word ptr [r8 + 2*rsi + 2], cx
  8704  	movzx	ecx, word ptr [rdx + 2*rsi + 4]
  8705  	imul	cx, ax
  8706  	mov	word ptr [r8 + 2*rsi + 4], cx
  8707  	movzx	ecx, word ptr [rdx + 2*rsi + 6]
  8708  	imul	cx, ax
  8709  	mov	word ptr [r8 + 2*rsi + 6], cx
  8710  	add	rsi, 4
  8711  	cmp	r10, rsi
  8712  	jne	.LBB1_828
  8713  	jmp	.LBB1_1069
  8714  .LBB1_169:
  8715  	test	r9d, r9d
  8716  	jle	.LBB1_1069
  8717  # %bb.170:
  8718  	movzx	eax, word ptr [rcx]
  8719  	mov	r10d, r9d
  8720  	cmp	r9d, 16
  8721  	jb	.LBB1_171
  8722  # %bb.353:
  8723  	lea	rcx, [rdx + 2*r10]
  8724  	cmp	rcx, r8
  8725  	jbe	.LBB1_531
  8726  # %bb.354:
  8727  	lea	rcx, [r8 + 2*r10]
  8728  	cmp	rcx, rdx
  8729  	jbe	.LBB1_531
  8730  .LBB1_171:
  8731  	xor	esi, esi
  8732  .LBB1_833:
  8733  	mov	r9, rsi
  8734  	not	r9
  8735  	add	r9, r10
  8736  	mov	rdi, r10
  8737  	and	rdi, 3
  8738  	je	.LBB1_835
  8739  .LBB1_834:                              # =>This Inner Loop Header: Depth=1
  8740  	movzx	ecx, word ptr [rdx + 2*rsi]
  8741  	add	cx, ax
  8742  	mov	word ptr [r8 + 2*rsi], cx
  8743  	add	rsi, 1
  8744  	add	rdi, -1
  8745  	jne	.LBB1_834
  8746  .LBB1_835:
  8747  	cmp	r9, 3
  8748  	jb	.LBB1_1069
  8749  .LBB1_836:                              # =>This Inner Loop Header: Depth=1
  8750  	movzx	ecx, word ptr [rdx + 2*rsi]
  8751  	add	cx, ax
  8752  	mov	word ptr [r8 + 2*rsi], cx
  8753  	movzx	ecx, word ptr [rdx + 2*rsi + 2]
  8754  	add	cx, ax
  8755  	mov	word ptr [r8 + 2*rsi + 2], cx
  8756  	movzx	ecx, word ptr [rdx + 2*rsi + 4]
  8757  	add	cx, ax
  8758  	mov	word ptr [r8 + 2*rsi + 4], cx
  8759  	movzx	ecx, word ptr [rdx + 2*rsi + 6]
  8760  	add	cx, ax
  8761  	mov	word ptr [r8 + 2*rsi + 6], cx
  8762  	add	rsi, 4
  8763  	cmp	r10, rsi
  8764  	jne	.LBB1_836
  8765  	jmp	.LBB1_1069
  8766  .LBB1_172:
  8767  	test	r9d, r9d
  8768  	jle	.LBB1_1069
  8769  # %bb.173:
  8770  	movzx	eax, word ptr [rcx]
  8771  	mov	r10d, r9d
  8772  	cmp	r9d, 16
  8773  	jb	.LBB1_174
  8774  # %bb.356:
  8775  	lea	rcx, [rdx + 2*r10]
  8776  	cmp	rcx, r8
  8777  	jbe	.LBB1_534
  8778  # %bb.357:
  8779  	lea	rcx, [r8 + 2*r10]
  8780  	cmp	rcx, rdx
  8781  	jbe	.LBB1_534
  8782  .LBB1_174:
  8783  	xor	esi, esi
  8784  .LBB1_841:
  8785  	mov	r9, rsi
  8786  	not	r9
  8787  	add	r9, r10
  8788  	mov	rdi, r10
  8789  	and	rdi, 3
  8790  	je	.LBB1_843
  8791  .LBB1_842:                              # =>This Inner Loop Header: Depth=1
  8792  	movzx	ecx, word ptr [rdx + 2*rsi]
  8793  	add	cx, ax
  8794  	mov	word ptr [r8 + 2*rsi], cx
  8795  	add	rsi, 1
  8796  	add	rdi, -1
  8797  	jne	.LBB1_842
  8798  .LBB1_843:
  8799  	cmp	r9, 3
  8800  	jb	.LBB1_1069
  8801  .LBB1_844:                              # =>This Inner Loop Header: Depth=1
  8802  	movzx	ecx, word ptr [rdx + 2*rsi]
  8803  	add	cx, ax
  8804  	mov	word ptr [r8 + 2*rsi], cx
  8805  	movzx	ecx, word ptr [rdx + 2*rsi + 2]
  8806  	add	cx, ax
  8807  	mov	word ptr [r8 + 2*rsi + 2], cx
  8808  	movzx	ecx, word ptr [rdx + 2*rsi + 4]
  8809  	add	cx, ax
  8810  	mov	word ptr [r8 + 2*rsi + 4], cx
  8811  	movzx	ecx, word ptr [rdx + 2*rsi + 6]
  8812  	add	cx, ax
  8813  	mov	word ptr [r8 + 2*rsi + 6], cx
  8814  	add	rsi, 4
  8815  	cmp	r10, rsi
  8816  	jne	.LBB1_844
  8817  	jmp	.LBB1_1069
  8818  .LBB1_175:
  8819  	test	r9d, r9d
  8820  	jle	.LBB1_1069
  8821  # %bb.176:
  8822  	movzx	eax, word ptr [rcx]
  8823  	mov	r10d, r9d
  8824  	cmp	r9d, 16
  8825  	jb	.LBB1_177
  8826  # %bb.359:
  8827  	lea	rcx, [rdx + 2*r10]
  8828  	cmp	rcx, r8
  8829  	jbe	.LBB1_537
  8830  # %bb.360:
  8831  	lea	rcx, [r8 + 2*r10]
  8832  	cmp	rcx, rdx
  8833  	jbe	.LBB1_537
  8834  .LBB1_177:
  8835  	xor	esi, esi
  8836  .LBB1_849:
  8837  	mov	r9, rsi
  8838  	not	r9
  8839  	add	r9, r10
  8840  	mov	rdi, r10
  8841  	and	rdi, 3
  8842  	je	.LBB1_851
  8843  .LBB1_850:                              # =>This Inner Loop Header: Depth=1
  8844  	movzx	ecx, word ptr [rdx + 2*rsi]
  8845  	sub	ecx, eax
  8846  	mov	word ptr [r8 + 2*rsi], cx
  8847  	add	rsi, 1
  8848  	add	rdi, -1
  8849  	jne	.LBB1_850
  8850  .LBB1_851:
  8851  	cmp	r9, 3
  8852  	jb	.LBB1_1069
  8853  .LBB1_852:                              # =>This Inner Loop Header: Depth=1
  8854  	movzx	ecx, word ptr [rdx + 2*rsi]
  8855  	sub	ecx, eax
  8856  	mov	word ptr [r8 + 2*rsi], cx
  8857  	movzx	ecx, word ptr [rdx + 2*rsi + 2]
  8858  	sub	ecx, eax
  8859  	mov	word ptr [r8 + 2*rsi + 2], cx
  8860  	movzx	ecx, word ptr [rdx + 2*rsi + 4]
  8861  	sub	ecx, eax
  8862  	mov	word ptr [r8 + 2*rsi + 4], cx
  8863  	movzx	ecx, word ptr [rdx + 2*rsi + 6]
  8864  	sub	ecx, eax
  8865  	mov	word ptr [r8 + 2*rsi + 6], cx
  8866  	add	rsi, 4
  8867  	cmp	r10, rsi
  8868  	jne	.LBB1_852
  8869  	jmp	.LBB1_1069
  8870  .LBB1_178:
  8871  	test	r9d, r9d
  8872  	jle	.LBB1_1069
  8873  # %bb.179:
  8874  	movzx	eax, word ptr [rcx]
  8875  	mov	r10d, r9d
  8876  	cmp	r9d, 16
  8877  	jb	.LBB1_180
  8878  # %bb.362:
  8879  	lea	rcx, [rdx + 2*r10]
  8880  	cmp	rcx, r8
  8881  	jbe	.LBB1_540
  8882  # %bb.363:
  8883  	lea	rcx, [r8 + 2*r10]
  8884  	cmp	rcx, rdx
  8885  	jbe	.LBB1_540
  8886  .LBB1_180:
  8887  	xor	esi, esi
  8888  .LBB1_857:
  8889  	mov	r9, rsi
  8890  	not	r9
  8891  	add	r9, r10
  8892  	mov	rdi, r10
  8893  	and	rdi, 3
  8894  	je	.LBB1_859
  8895  .LBB1_858:                              # =>This Inner Loop Header: Depth=1
  8896  	movzx	ecx, word ptr [rdx + 2*rsi]
  8897  	sub	ecx, eax
  8898  	mov	word ptr [r8 + 2*rsi], cx
  8899  	add	rsi, 1
  8900  	add	rdi, -1
  8901  	jne	.LBB1_858
  8902  .LBB1_859:
  8903  	cmp	r9, 3
  8904  	jb	.LBB1_1069
  8905  .LBB1_860:                              # =>This Inner Loop Header: Depth=1
  8906  	movzx	ecx, word ptr [rdx + 2*rsi]
  8907  	sub	ecx, eax
  8908  	mov	word ptr [r8 + 2*rsi], cx
  8909  	movzx	ecx, word ptr [rdx + 2*rsi + 2]
  8910  	sub	ecx, eax
  8911  	mov	word ptr [r8 + 2*rsi + 2], cx
  8912  	movzx	ecx, word ptr [rdx + 2*rsi + 4]
  8913  	sub	ecx, eax
  8914  	mov	word ptr [r8 + 2*rsi + 4], cx
  8915  	movzx	ecx, word ptr [rdx + 2*rsi + 6]
  8916  	sub	ecx, eax
  8917  	mov	word ptr [r8 + 2*rsi + 6], cx
  8918  	add	rsi, 4
  8919  	cmp	r10, rsi
  8920  	jne	.LBB1_860
  8921  	jmp	.LBB1_1069
  8922  .LBB1_181:
  8923  	test	r9d, r9d
  8924  	jle	.LBB1_1069
  8925  # %bb.182:
  8926  	movzx	eax, word ptr [rcx]
  8927  	mov	r10d, r9d
  8928  	cmp	r9d, 16
  8929  	jb	.LBB1_183
  8930  # %bb.365:
  8931  	lea	rcx, [rdx + 2*r10]
  8932  	cmp	rcx, r8
  8933  	jbe	.LBB1_543
  8934  # %bb.366:
  8935  	lea	rcx, [r8 + 2*r10]
  8936  	cmp	rcx, rdx
  8937  	jbe	.LBB1_543
  8938  .LBB1_183:
  8939  	xor	esi, esi
  8940  .LBB1_865:
  8941  	mov	r9, rsi
  8942  	not	r9
  8943  	add	r9, r10
  8944  	mov	rdi, r10
  8945  	and	rdi, 3
  8946  	je	.LBB1_867
  8947  .LBB1_866:                              # =>This Inner Loop Header: Depth=1
  8948  	movzx	ecx, word ptr [rdx + 2*rsi]
  8949  	add	cx, ax
  8950  	mov	word ptr [r8 + 2*rsi], cx
  8951  	add	rsi, 1
  8952  	add	rdi, -1
  8953  	jne	.LBB1_866
  8954  .LBB1_867:
  8955  	cmp	r9, 3
  8956  	jb	.LBB1_1069
  8957  .LBB1_868:                              # =>This Inner Loop Header: Depth=1
  8958  	movzx	ecx, word ptr [rdx + 2*rsi]
  8959  	add	cx, ax
  8960  	mov	word ptr [r8 + 2*rsi], cx
  8961  	movzx	ecx, word ptr [rdx + 2*rsi + 2]
  8962  	add	cx, ax
  8963  	mov	word ptr [r8 + 2*rsi + 2], cx
  8964  	movzx	ecx, word ptr [rdx + 2*rsi + 4]
  8965  	add	cx, ax
  8966  	mov	word ptr [r8 + 2*rsi + 4], cx
  8967  	movzx	ecx, word ptr [rdx + 2*rsi + 6]
  8968  	add	cx, ax
  8969  	mov	word ptr [r8 + 2*rsi + 6], cx
  8970  	add	rsi, 4
  8971  	cmp	r10, rsi
  8972  	jne	.LBB1_868
  8973  	jmp	.LBB1_1069
  8974  .LBB1_184:
  8975  	test	r9d, r9d
  8976  	jle	.LBB1_1069
  8977  # %bb.185:
  8978  	movzx	eax, word ptr [rcx]
  8979  	mov	r10d, r9d
  8980  	cmp	r9d, 16
  8981  	jb	.LBB1_186
  8982  # %bb.368:
  8983  	lea	rcx, [rdx + 2*r10]
  8984  	cmp	rcx, r8
  8985  	jbe	.LBB1_546
  8986  # %bb.369:
  8987  	lea	rcx, [r8 + 2*r10]
  8988  	cmp	rcx, rdx
  8989  	jbe	.LBB1_546
  8990  .LBB1_186:
  8991  	xor	esi, esi
  8992  .LBB1_873:
  8993  	mov	r9, rsi
  8994  	not	r9
  8995  	add	r9, r10
  8996  	mov	rdi, r10
  8997  	and	rdi, 3
  8998  	je	.LBB1_875
  8999  .LBB1_874:                              # =>This Inner Loop Header: Depth=1
  9000  	movzx	ecx, word ptr [rdx + 2*rsi]
  9001  	add	cx, ax
  9002  	mov	word ptr [r8 + 2*rsi], cx
  9003  	add	rsi, 1
  9004  	add	rdi, -1
  9005  	jne	.LBB1_874
  9006  .LBB1_875:
  9007  	cmp	r9, 3
  9008  	jb	.LBB1_1069
  9009  .LBB1_876:                              # =>This Inner Loop Header: Depth=1
  9010  	movzx	ecx, word ptr [rdx + 2*rsi]
  9011  	add	cx, ax
  9012  	mov	word ptr [r8 + 2*rsi], cx
  9013  	movzx	ecx, word ptr [rdx + 2*rsi + 2]
  9014  	add	cx, ax
  9015  	mov	word ptr [r8 + 2*rsi + 2], cx
  9016  	movzx	ecx, word ptr [rdx + 2*rsi + 4]
  9017  	add	cx, ax
  9018  	mov	word ptr [r8 + 2*rsi + 4], cx
  9019  	movzx	ecx, word ptr [rdx + 2*rsi + 6]
  9020  	add	cx, ax
  9021  	mov	word ptr [r8 + 2*rsi + 6], cx
  9022  	add	rsi, 4
  9023  	cmp	r10, rsi
  9024  	jne	.LBB1_876
  9025  	jmp	.LBB1_1069
  9026  .LBB1_187:
  9027  	test	r9d, r9d
  9028  	jle	.LBB1_1069
  9029  # %bb.188:
  9030  	movzx	eax, word ptr [rcx]
  9031  	mov	r10d, r9d
  9032  	cmp	r9d, 16
  9033  	jb	.LBB1_189
  9034  # %bb.371:
  9035  	lea	rcx, [rdx + 2*r10]
  9036  	cmp	rcx, r8
  9037  	jbe	.LBB1_549
  9038  # %bb.372:
  9039  	lea	rcx, [r8 + 2*r10]
  9040  	cmp	rcx, rdx
  9041  	jbe	.LBB1_549
  9042  .LBB1_189:
  9043  	xor	esi, esi
  9044  .LBB1_881:
  9045  	mov	r9, rsi
  9046  	not	r9
  9047  	add	r9, r10
  9048  	mov	rdi, r10
  9049  	and	rdi, 3
  9050  	je	.LBB1_883
  9051  .LBB1_882:                              # =>This Inner Loop Header: Depth=1
  9052  	movzx	ecx, word ptr [rdx + 2*rsi]
  9053  	sub	ecx, eax
  9054  	mov	word ptr [r8 + 2*rsi], cx
  9055  	add	rsi, 1
  9056  	add	rdi, -1
  9057  	jne	.LBB1_882
  9058  .LBB1_883:
  9059  	cmp	r9, 3
  9060  	jb	.LBB1_1069
  9061  .LBB1_884:                              # =>This Inner Loop Header: Depth=1
  9062  	movzx	ecx, word ptr [rdx + 2*rsi]
  9063  	sub	ecx, eax
  9064  	mov	word ptr [r8 + 2*rsi], cx
  9065  	movzx	ecx, word ptr [rdx + 2*rsi + 2]
  9066  	sub	ecx, eax
  9067  	mov	word ptr [r8 + 2*rsi + 2], cx
  9068  	movzx	ecx, word ptr [rdx + 2*rsi + 4]
  9069  	sub	ecx, eax
  9070  	mov	word ptr [r8 + 2*rsi + 4], cx
  9071  	movzx	ecx, word ptr [rdx + 2*rsi + 6]
  9072  	sub	ecx, eax
  9073  	mov	word ptr [r8 + 2*rsi + 6], cx
  9074  	add	rsi, 4
  9075  	cmp	r10, rsi
  9076  	jne	.LBB1_884
  9077  	jmp	.LBB1_1069
  9078  .LBB1_190:
  9079  	test	r9d, r9d
  9080  	jle	.LBB1_1069
  9081  # %bb.191:
  9082  	movzx	eax, word ptr [rcx]
  9083  	mov	r10d, r9d
  9084  	cmp	r9d, 16
  9085  	jb	.LBB1_192
  9086  # %bb.374:
  9087  	lea	rcx, [rdx + 2*r10]
  9088  	cmp	rcx, r8
  9089  	jbe	.LBB1_552
  9090  # %bb.375:
  9091  	lea	rcx, [r8 + 2*r10]
  9092  	cmp	rcx, rdx
  9093  	jbe	.LBB1_552
  9094  .LBB1_192:
  9095  	xor	esi, esi
  9096  .LBB1_889:
  9097  	mov	r9, rsi
  9098  	not	r9
  9099  	add	r9, r10
  9100  	mov	rdi, r10
  9101  	and	rdi, 3
  9102  	je	.LBB1_891
  9103  .LBB1_890:                              # =>This Inner Loop Header: Depth=1
  9104  	movzx	ecx, word ptr [rdx + 2*rsi]
  9105  	sub	ecx, eax
  9106  	mov	word ptr [r8 + 2*rsi], cx
  9107  	add	rsi, 1
  9108  	add	rdi, -1
  9109  	jne	.LBB1_890
  9110  .LBB1_891:
  9111  	cmp	r9, 3
  9112  	jb	.LBB1_1069
  9113  .LBB1_892:                              # =>This Inner Loop Header: Depth=1
  9114  	movzx	ecx, word ptr [rdx + 2*rsi]
  9115  	sub	ecx, eax
  9116  	mov	word ptr [r8 + 2*rsi], cx
  9117  	movzx	ecx, word ptr [rdx + 2*rsi + 2]
  9118  	sub	ecx, eax
  9119  	mov	word ptr [r8 + 2*rsi + 2], cx
  9120  	movzx	ecx, word ptr [rdx + 2*rsi + 4]
  9121  	sub	ecx, eax
  9122  	mov	word ptr [r8 + 2*rsi + 4], cx
  9123  	movzx	ecx, word ptr [rdx + 2*rsi + 6]
  9124  	sub	ecx, eax
  9125  	mov	word ptr [r8 + 2*rsi + 6], cx
  9126  	add	rsi, 4
  9127  	cmp	r10, rsi
  9128  	jne	.LBB1_892
  9129  	jmp	.LBB1_1069
  9130  .LBB1_193:
  9131  	test	r9d, r9d
  9132  	jle	.LBB1_1069
  9133  # %bb.194:
  9134  	mov	rax, qword ptr [rcx]
  9135  	mov	esi, r9d
  9136  	lea	rdi, [rsi - 1]
  9137  	mov	r9d, esi
  9138  	and	r9d, 3
  9139  	cmp	rdi, 3
  9140  	jae	.LBB1_377
  9141  # %bb.195:
  9142  	xor	edi, edi
  9143  	jmp	.LBB1_379
  9144  .LBB1_196:
  9145  	test	r9d, r9d
  9146  	jle	.LBB1_1069
  9147  # %bb.197:
  9148  	movss	xmm0, dword ptr [rcx]           # xmm0 = mem[0],zero,zero,zero
  9149  	mov	eax, r9d
  9150  	cmp	r9d, 8
  9151  	jb	.LBB1_198
  9152  # %bb.382:
  9153  	lea	rcx, [rdx + 4*rax]
  9154  	cmp	rcx, r8
  9155  	jbe	.LBB1_555
  9156  # %bb.383:
  9157  	lea	rcx, [r8 + 4*rax]
  9158  	cmp	rcx, rdx
  9159  	jbe	.LBB1_555
  9160  .LBB1_198:
  9161  	xor	ecx, ecx
  9162  .LBB1_897:
  9163  	mov	rsi, rcx
  9164  	not	rsi
  9165  	add	rsi, rax
  9166  	mov	rdi, rax
  9167  	and	rdi, 3
  9168  	je	.LBB1_899
  9169  .LBB1_898:                              # =>This Inner Loop Header: Depth=1
  9170  	movss	xmm1, dword ptr [rdx + 4*rcx]   # xmm1 = mem[0],zero,zero,zero
  9171  	mulss	xmm1, xmm0
  9172  	movss	dword ptr [r8 + 4*rcx], xmm1
  9173  	add	rcx, 1
  9174  	add	rdi, -1
  9175  	jne	.LBB1_898
  9176  .LBB1_899:
  9177  	cmp	rsi, 3
  9178  	jb	.LBB1_1069
  9179  .LBB1_900:                              # =>This Inner Loop Header: Depth=1
  9180  	movss	xmm1, dword ptr [rdx + 4*rcx]   # xmm1 = mem[0],zero,zero,zero
  9181  	mulss	xmm1, xmm0
  9182  	movss	dword ptr [r8 + 4*rcx], xmm1
  9183  	movss	xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero
  9184  	mulss	xmm1, xmm0
  9185  	movss	dword ptr [r8 + 4*rcx + 4], xmm1
  9186  	movss	xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero
  9187  	mulss	xmm1, xmm0
  9188  	movss	dword ptr [r8 + 4*rcx + 8], xmm1
  9189  	movss	xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero
  9190  	mulss	xmm1, xmm0
  9191  	movss	dword ptr [r8 + 4*rcx + 12], xmm1
  9192  	add	rcx, 4
  9193  	cmp	rax, rcx
  9194  	jne	.LBB1_900
  9195  	jmp	.LBB1_1069
  9196  .LBB1_199:
  9197  	test	r9d, r9d
  9198  	jle	.LBB1_1069
  9199  # %bb.200:
  9200  	mov	rax, qword ptr [rcx]
  9201  	mov	esi, r9d
  9202  	lea	rdi, [rsi - 1]
  9203  	mov	r9d, esi
  9204  	and	r9d, 3
  9205  	cmp	rdi, 3
  9206  	jae	.LBB1_385
  9207  # %bb.201:
  9208  	xor	edi, edi
  9209  	jmp	.LBB1_387
  9210  .LBB1_202:                              # f32 multiply: [r8][i] = [rdx][i] * scalar at [rcx]
  9211  	test	r9d, r9d
  9212  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9213  # %bb.203:
  9214  	movss	xmm0, dword ptr [rcx]           # xmm0 = mem[0],zero,zero,zero
  9215  	mov	eax, r9d                        # rax = element count, zero-extended
  9216  	cmp	r9d, 8
  9217  	jb	.LBB1_204                       # fewer than 8 elements: skip the overlap test
  9218  # %bb.390:
  9219  	lea	rcx, [rdx + 4*rax]              # rcx = end of the source range
  9220  	cmp	rcx, r8
  9221  	jbe	.LBB1_558                       # source ends at/before destination start -> .LBB1_558 (outside this chunk)
  9222  # %bb.391:
  9223  	lea	rcx, [r8 + 4*rax]               # rcx = end of the destination range
  9224  	cmp	rcx, rdx
  9225  	jbe	.LBB1_558                       # destination ends at/before source start -> .LBB1_558
  9226  .LBB1_204:
  9227  	xor	ecx, ecx                        # rcx = element index, starting at 0
  9228  .LBB1_905:
  9229  	mov	rsi, rcx
  9230  	not	rsi
  9231  	add	rsi, rax                        # rsi = count - index - 1
  9232  	mov	rdi, rax
  9233  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9234  	je	.LBB1_907
  9235  .LBB1_906:                              # =>This Inner Loop Header: Depth=1
  9236  	movss	xmm1, dword ptr [rdx + 4*rcx]   # xmm1 = mem[0],zero,zero,zero
  9237  	mulss	xmm1, xmm0
  9238  	movss	dword ptr [r8 + 4*rcx], xmm1
  9239  	add	rcx, 1
  9240  	add	rdi, -1
  9241  	jne	.LBB1_906
  9242  .LBB1_907:
  9243  	cmp	rsi, 3
  9244  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_905: skip the 4x loop
  9245  .LBB1_908:                              # =>This Inner Loop Header: Depth=1
  9246  	movss	xmm1, dword ptr [rdx + 4*rcx]   # xmm1 = mem[0],zero,zero,zero
  9247  	mulss	xmm1, xmm0
  9248  	movss	dword ptr [r8 + 4*rcx], xmm1
  9249  	movss	xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero
  9250  	mulss	xmm1, xmm0
  9251  	movss	dword ptr [r8 + 4*rcx + 4], xmm1
  9252  	movss	xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero
  9253  	mulss	xmm1, xmm0
  9254  	movss	dword ptr [r8 + 4*rcx + 8], xmm1
  9255  	movss	xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero
  9256  	mulss	xmm1, xmm0
  9257  	movss	dword ptr [r8 + 4*rcx + 12], xmm1
  9258  	add	rcx, 4
  9259  	cmp	rax, rcx
  9260  	jne	.LBB1_908                       # repeat until index == count
  9261  	jmp	.LBB1_1069
  9262  .LBB1_205:                              # 64-bit integer add: [r8][i] = [rdx][i] + scalar at [rcx]
  9263  	test	r9d, r9d
  9264  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9265  # %bb.206:
  9266  	mov	rax, qword ptr [rcx]            # rax = 64-bit scalar operand
  9267  	mov	r10d, r9d                       # r10 = element count, zero-extended
  9268  	cmp	r9d, 4
  9269  	jb	.LBB1_207                       # fewer than 4 elements: skip the overlap test
  9270  # %bb.393:
  9271  	lea	rcx, [rdx + 8*r10]              # rcx = end of the source range
  9272  	cmp	rcx, r8
  9273  	jbe	.LBB1_561                       # source ends at/before destination start -> .LBB1_561 (outside this chunk)
  9274  # %bb.394:
  9275  	lea	rcx, [r8 + 8*r10]               # rcx = end of the destination range
  9276  	cmp	rcx, rdx
  9277  	jbe	.LBB1_561                       # destination ends at/before source start -> .LBB1_561
  9278  .LBB1_207:
  9279  	xor	esi, esi                        # rsi = element index, starting at 0
  9280  .LBB1_913:
  9281  	mov	r9, rsi
  9282  	not	r9
  9283  	add	r9, r10                         # r9 = count - index - 1
  9284  	mov	rdi, r10
  9285  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9286  	je	.LBB1_915
  9287  .LBB1_914:                              # =>This Inner Loop Header: Depth=1
  9288  	mov	rcx, qword ptr [rdx + 8*rsi]
  9289  	add	rcx, rax
  9290  	mov	qword ptr [r8 + 8*rsi], rcx
  9291  	add	rsi, 1
  9292  	add	rdi, -1
  9293  	jne	.LBB1_914
  9294  .LBB1_915:
  9295  	cmp	r9, 3
  9296  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_913: skip the 4x loop
  9297  .LBB1_916:                              # =>This Inner Loop Header: Depth=1
  9298  	mov	rcx, qword ptr [rdx + 8*rsi]
  9299  	add	rcx, rax
  9300  	mov	qword ptr [r8 + 8*rsi], rcx
  9301  	mov	rcx, qword ptr [rdx + 8*rsi + 8]
  9302  	add	rcx, rax
  9303  	mov	qword ptr [r8 + 8*rsi + 8], rcx
  9304  	mov	rcx, qword ptr [rdx + 8*rsi + 16]
  9305  	add	rcx, rax
  9306  	mov	qword ptr [r8 + 8*rsi + 16], rcx
  9307  	mov	rcx, qword ptr [rdx + 8*rsi + 24]
  9308  	add	rcx, rax
  9309  	mov	qword ptr [r8 + 8*rsi + 24], rcx
  9310  	add	rsi, 4
  9311  	cmp	r10, rsi
  9312  	jne	.LBB1_916                       # repeat until index == count
  9313  	jmp	.LBB1_1069
  9314  .LBB1_208:                              # f32 add: [r8][i] = [rdx][i] + scalar at [rcx]
  9315  	test	r9d, r9d
  9316  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9317  # %bb.209:
  9318  	movss	xmm0, dword ptr [rcx]           # xmm0 = mem[0],zero,zero,zero
  9319  	mov	eax, r9d                        # rax = element count, zero-extended
  9320  	cmp	r9d, 8
  9321  	jb	.LBB1_210                       # fewer than 8 elements: skip the overlap test
  9322  # %bb.396:
  9323  	lea	rcx, [rdx + 4*rax]              # rcx = end of the source range
  9324  	cmp	rcx, r8
  9325  	jbe	.LBB1_564                       # source ends at/before destination start -> .LBB1_564 (outside this chunk)
  9326  # %bb.397:
  9327  	lea	rcx, [r8 + 4*rax]               # rcx = end of the destination range
  9328  	cmp	rcx, rdx
  9329  	jbe	.LBB1_564                       # destination ends at/before source start -> .LBB1_564
  9330  .LBB1_210:
  9331  	xor	ecx, ecx                        # rcx = element index, starting at 0
  9332  .LBB1_921:
  9333  	mov	rsi, rcx
  9334  	not	rsi
  9335  	add	rsi, rax                        # rsi = count - index - 1
  9336  	mov	rdi, rax
  9337  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9338  	je	.LBB1_923
  9339  .LBB1_922:                              # =>This Inner Loop Header: Depth=1
  9340  	movss	xmm1, dword ptr [rdx + 4*rcx]   # xmm1 = mem[0],zero,zero,zero
  9341  	addss	xmm1, xmm0
  9342  	movss	dword ptr [r8 + 4*rcx], xmm1
  9343  	add	rcx, 1
  9344  	add	rdi, -1
  9345  	jne	.LBB1_922
  9346  .LBB1_923:
  9347  	cmp	rsi, 3
  9348  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_921: skip the 4x loop
  9349  .LBB1_924:                              # =>This Inner Loop Header: Depth=1
  9350  	movss	xmm1, dword ptr [rdx + 4*rcx]   # xmm1 = mem[0],zero,zero,zero
  9351  	addss	xmm1, xmm0
  9352  	movss	dword ptr [r8 + 4*rcx], xmm1
  9353  	movss	xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero
  9354  	addss	xmm1, xmm0
  9355  	movss	dword ptr [r8 + 4*rcx + 4], xmm1
  9356  	movss	xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero
  9357  	addss	xmm1, xmm0
  9358  	movss	dword ptr [r8 + 4*rcx + 8], xmm1
  9359  	movss	xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero
  9360  	addss	xmm1, xmm0
  9361  	movss	dword ptr [r8 + 4*rcx + 12], xmm1
  9362  	add	rcx, 4
  9363  	cmp	rax, rcx
  9364  	jne	.LBB1_924                       # repeat until index == count
  9365  	jmp	.LBB1_1069
  9366  .LBB1_211:                              # 64-bit integer subtract: [r8][i] = [rdx][i] - scalar at [rcx]
  9367  	test	r9d, r9d
  9368  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9369  # %bb.212:
  9370  	mov	rax, qword ptr [rcx]            # rax = 64-bit scalar operand
  9371  	mov	r10d, r9d                       # r10 = element count, zero-extended
  9372  	cmp	r9d, 4
  9373  	jb	.LBB1_213                       # fewer than 4 elements: skip the overlap test
  9374  # %bb.399:
  9375  	lea	rcx, [rdx + 8*r10]              # rcx = end of the source range
  9376  	cmp	rcx, r8
  9377  	jbe	.LBB1_567                       # source ends at/before destination start -> .LBB1_567 (outside this chunk)
  9378  # %bb.400:
  9379  	lea	rcx, [r8 + 8*r10]               # rcx = end of the destination range
  9380  	cmp	rcx, rdx
  9381  	jbe	.LBB1_567                       # destination ends at/before source start -> .LBB1_567
  9382  .LBB1_213:
  9383  	xor	esi, esi                        # rsi = element index, starting at 0
  9384  .LBB1_929:
  9385  	mov	r9, rsi
  9386  	not	r9
  9387  	add	r9, r10                         # r9 = count - index - 1
  9388  	mov	rdi, r10
  9389  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9390  	je	.LBB1_931
  9391  .LBB1_930:                              # =>This Inner Loop Header: Depth=1
  9392  	mov	rcx, qword ptr [rdx + 8*rsi]
  9393  	sub	rcx, rax
  9394  	mov	qword ptr [r8 + 8*rsi], rcx
  9395  	add	rsi, 1
  9396  	add	rdi, -1
  9397  	jne	.LBB1_930
  9398  .LBB1_931:
  9399  	cmp	r9, 3
  9400  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_929: skip the 4x loop
  9401  .LBB1_932:                              # =>This Inner Loop Header: Depth=1
  9402  	mov	rcx, qword ptr [rdx + 8*rsi]
  9403  	sub	rcx, rax
  9404  	mov	qword ptr [r8 + 8*rsi], rcx
  9405  	mov	rcx, qword ptr [rdx + 8*rsi + 8]
  9406  	sub	rcx, rax
  9407  	mov	qword ptr [r8 + 8*rsi + 8], rcx
  9408  	mov	rcx, qword ptr [rdx + 8*rsi + 16]
  9409  	sub	rcx, rax
  9410  	mov	qword ptr [r8 + 8*rsi + 16], rcx
  9411  	mov	rcx, qword ptr [rdx + 8*rsi + 24]
  9412  	sub	rcx, rax
  9413  	mov	qword ptr [r8 + 8*rsi + 24], rcx
  9414  	add	rsi, 4
  9415  	cmp	r10, rsi
  9416  	jne	.LBB1_932                       # repeat until index == count
  9417  	jmp	.LBB1_1069
  9418  .LBB1_214:                              # f32 subtract: [r8][i] = [rdx][i] - scalar at [rcx]
  9419  	test	r9d, r9d
  9420  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9421  # %bb.215:
  9422  	movss	xmm0, dword ptr [rcx]           # xmm0 = mem[0],zero,zero,zero
  9423  	mov	eax, r9d                        # rax = element count, zero-extended
  9424  	cmp	r9d, 8
  9425  	jb	.LBB1_216                       # fewer than 8 elements: skip the overlap test
  9426  # %bb.402:
  9427  	lea	rcx, [rdx + 4*rax]              # rcx = end of the source range
  9428  	cmp	rcx, r8
  9429  	jbe	.LBB1_570                       # source ends at/before destination start -> .LBB1_570 (outside this chunk)
  9430  # %bb.403:
  9431  	lea	rcx, [r8 + 4*rax]               # rcx = end of the destination range
  9432  	cmp	rcx, rdx
  9433  	jbe	.LBB1_570                       # destination ends at/before source start -> .LBB1_570
  9434  .LBB1_216:
  9435  	xor	ecx, ecx                        # rcx = element index, starting at 0
  9436  .LBB1_937:
  9437  	mov	rsi, rcx
  9438  	not	rsi
  9439  	add	rsi, rax                        # rsi = count - index - 1
  9440  	mov	rdi, rax
  9441  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9442  	je	.LBB1_939
  9443  .LBB1_938:                              # =>This Inner Loop Header: Depth=1
  9444  	movss	xmm1, dword ptr [rdx + 4*rcx]   # xmm1 = mem[0],zero,zero,zero
  9445  	subss	xmm1, xmm0
  9446  	movss	dword ptr [r8 + 4*rcx], xmm1
  9447  	add	rcx, 1
  9448  	add	rdi, -1
  9449  	jne	.LBB1_938
  9450  .LBB1_939:
  9451  	cmp	rsi, 3
  9452  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_937: skip the 4x loop
  9453  .LBB1_940:                              # =>This Inner Loop Header: Depth=1
  9454  	movss	xmm1, dword ptr [rdx + 4*rcx]   # xmm1 = mem[0],zero,zero,zero
  9455  	subss	xmm1, xmm0
  9456  	movss	dword ptr [r8 + 4*rcx], xmm1
  9457  	movss	xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero
  9458  	subss	xmm1, xmm0
  9459  	movss	dword ptr [r8 + 4*rcx + 4], xmm1
  9460  	movss	xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero
  9461  	subss	xmm1, xmm0
  9462  	movss	dword ptr [r8 + 4*rcx + 8], xmm1
  9463  	movss	xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero
  9464  	subss	xmm1, xmm0
  9465  	movss	dword ptr [r8 + 4*rcx + 12], xmm1
  9466  	add	rcx, 4
  9467  	cmp	rax, rcx
  9468  	jne	.LBB1_940                       # repeat until index == count
  9469  	jmp	.LBB1_1069
  9470  .LBB1_217:                              # 64-bit integer add: [r8][i] = [rdx][i] + scalar at [rcx]
  9471  	test	r9d, r9d
  9472  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9473  # %bb.218:
  9474  	mov	rax, qword ptr [rcx]            # rax = 64-bit scalar operand
  9475  	mov	r10d, r9d                       # r10 = element count, zero-extended
  9476  	cmp	r9d, 4
  9477  	jb	.LBB1_219                       # fewer than 4 elements: skip the overlap test
  9478  # %bb.405:
  9479  	lea	rcx, [rdx + 8*r10]              # rcx = end of the source range
  9480  	cmp	rcx, r8
  9481  	jbe	.LBB1_573                       # source ends at/before destination start -> .LBB1_573 (outside this chunk)
  9482  # %bb.406:
  9483  	lea	rcx, [r8 + 8*r10]               # rcx = end of the destination range
  9484  	cmp	rcx, rdx
  9485  	jbe	.LBB1_573                       # destination ends at/before source start -> .LBB1_573
  9486  .LBB1_219:
  9487  	xor	esi, esi                        # rsi = element index, starting at 0
  9488  .LBB1_945:
  9489  	mov	r9, rsi
  9490  	not	r9
  9491  	add	r9, r10                         # r9 = count - index - 1
  9492  	mov	rdi, r10
  9493  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9494  	je	.LBB1_947
  9495  .LBB1_946:                              # =>This Inner Loop Header: Depth=1
  9496  	mov	rcx, qword ptr [rdx + 8*rsi]
  9497  	add	rcx, rax
  9498  	mov	qword ptr [r8 + 8*rsi], rcx
  9499  	add	rsi, 1
  9500  	add	rdi, -1
  9501  	jne	.LBB1_946
  9502  .LBB1_947:
  9503  	cmp	r9, 3
  9504  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_945: skip the 4x loop
  9505  .LBB1_948:                              # =>This Inner Loop Header: Depth=1
  9506  	mov	rcx, qword ptr [rdx + 8*rsi]
  9507  	add	rcx, rax
  9508  	mov	qword ptr [r8 + 8*rsi], rcx
  9509  	mov	rcx, qword ptr [rdx + 8*rsi + 8]
  9510  	add	rcx, rax
  9511  	mov	qword ptr [r8 + 8*rsi + 8], rcx
  9512  	mov	rcx, qword ptr [rdx + 8*rsi + 16]
  9513  	add	rcx, rax
  9514  	mov	qword ptr [r8 + 8*rsi + 16], rcx
  9515  	mov	rcx, qword ptr [rdx + 8*rsi + 24]
  9516  	add	rcx, rax
  9517  	mov	qword ptr [r8 + 8*rsi + 24], rcx
  9518  	add	rsi, 4
  9519  	cmp	r10, rsi
  9520  	jne	.LBB1_948                       # repeat until index == count
  9521  	jmp	.LBB1_1069
  9522  .LBB1_220:                              # f32 add: [r8][i] = [rdx][i] + scalar at [rcx]
  9523  	test	r9d, r9d
  9524  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9525  # %bb.221:
  9526  	movss	xmm0, dword ptr [rcx]           # xmm0 = mem[0],zero,zero,zero
  9527  	mov	eax, r9d                        # rax = element count, zero-extended
  9528  	cmp	r9d, 8
  9529  	jb	.LBB1_222                       # fewer than 8 elements: skip the overlap test
  9530  # %bb.408:
  9531  	lea	rcx, [rdx + 4*rax]              # rcx = end of the source range
  9532  	cmp	rcx, r8
  9533  	jbe	.LBB1_576                       # source ends at/before destination start -> .LBB1_576 (outside this chunk)
  9534  # %bb.409:
  9535  	lea	rcx, [r8 + 4*rax]               # rcx = end of the destination range
  9536  	cmp	rcx, rdx
  9537  	jbe	.LBB1_576                       # destination ends at/before source start -> .LBB1_576
  9538  .LBB1_222:
  9539  	xor	ecx, ecx                        # rcx = element index, starting at 0
  9540  .LBB1_953:
  9541  	mov	rsi, rcx
  9542  	not	rsi
  9543  	add	rsi, rax                        # rsi = count - index - 1
  9544  	mov	rdi, rax
  9545  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9546  	je	.LBB1_955
  9547  .LBB1_954:                              # =>This Inner Loop Header: Depth=1
  9548  	movss	xmm1, dword ptr [rdx + 4*rcx]   # xmm1 = mem[0],zero,zero,zero
  9549  	addss	xmm1, xmm0
  9550  	movss	dword ptr [r8 + 4*rcx], xmm1
  9551  	add	rcx, 1
  9552  	add	rdi, -1
  9553  	jne	.LBB1_954
  9554  .LBB1_955:
  9555  	cmp	rsi, 3
  9556  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_953: skip the 4x loop
  9557  .LBB1_956:                              # =>This Inner Loop Header: Depth=1
  9558  	movss	xmm1, dword ptr [rdx + 4*rcx]   # xmm1 = mem[0],zero,zero,zero
  9559  	addss	xmm1, xmm0
  9560  	movss	dword ptr [r8 + 4*rcx], xmm1
  9561  	movss	xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero
  9562  	addss	xmm1, xmm0
  9563  	movss	dword ptr [r8 + 4*rcx + 4], xmm1
  9564  	movss	xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero
  9565  	addss	xmm1, xmm0
  9566  	movss	dword ptr [r8 + 4*rcx + 8], xmm1
  9567  	movss	xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero
  9568  	addss	xmm1, xmm0
  9569  	movss	dword ptr [r8 + 4*rcx + 12], xmm1
  9570  	add	rcx, 4
  9571  	cmp	rax, rcx
  9572  	jne	.LBB1_956                       # repeat until index == count
  9573  	jmp	.LBB1_1069
  9574  .LBB1_223:                              # 64-bit integer subtract: [r8][i] = [rdx][i] - scalar at [rcx]
  9575  	test	r9d, r9d
  9576  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9577  # %bb.224:
  9578  	mov	rax, qword ptr [rcx]            # rax = 64-bit scalar operand
  9579  	mov	r10d, r9d                       # r10 = element count, zero-extended
  9580  	cmp	r9d, 4
  9581  	jb	.LBB1_225                       # fewer than 4 elements: skip the overlap test
  9582  # %bb.411:
  9583  	lea	rcx, [rdx + 8*r10]              # rcx = end of the source range
  9584  	cmp	rcx, r8
  9585  	jbe	.LBB1_579                       # source ends at/before destination start -> .LBB1_579 (outside this chunk)
  9586  # %bb.412:
  9587  	lea	rcx, [r8 + 8*r10]               # rcx = end of the destination range
  9588  	cmp	rcx, rdx
  9589  	jbe	.LBB1_579                       # destination ends at/before source start -> .LBB1_579
  9590  .LBB1_225:
  9591  	xor	esi, esi                        # rsi = element index, starting at 0
  9592  .LBB1_961:
  9593  	mov	r9, rsi
  9594  	not	r9
  9595  	add	r9, r10                         # r9 = count - index - 1
  9596  	mov	rdi, r10
  9597  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9598  	je	.LBB1_963
  9599  .LBB1_962:                              # =>This Inner Loop Header: Depth=1
  9600  	mov	rcx, qword ptr [rdx + 8*rsi]
  9601  	sub	rcx, rax
  9602  	mov	qword ptr [r8 + 8*rsi], rcx
  9603  	add	rsi, 1
  9604  	add	rdi, -1
  9605  	jne	.LBB1_962
  9606  .LBB1_963:
  9607  	cmp	r9, 3
  9608  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_961: skip the 4x loop
  9609  .LBB1_964:                              # =>This Inner Loop Header: Depth=1
  9610  	mov	rcx, qword ptr [rdx + 8*rsi]
  9611  	sub	rcx, rax
  9612  	mov	qword ptr [r8 + 8*rsi], rcx
  9613  	mov	rcx, qword ptr [rdx + 8*rsi + 8]
  9614  	sub	rcx, rax
  9615  	mov	qword ptr [r8 + 8*rsi + 8], rcx
  9616  	mov	rcx, qword ptr [rdx + 8*rsi + 16]
  9617  	sub	rcx, rax
  9618  	mov	qword ptr [r8 + 8*rsi + 16], rcx
  9619  	mov	rcx, qword ptr [rdx + 8*rsi + 24]
  9620  	sub	rcx, rax
  9621  	mov	qword ptr [r8 + 8*rsi + 24], rcx
  9622  	add	rsi, 4
  9623  	cmp	r10, rsi
  9624  	jne	.LBB1_964                       # repeat until index == count
  9625  	jmp	.LBB1_1069
  9626  .LBB1_226:                              # f32 subtract: [r8][i] = [rdx][i] - scalar at [rcx]
  9627  	test	r9d, r9d
  9628  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9629  # %bb.227:
  9630  	movss	xmm0, dword ptr [rcx]           # xmm0 = mem[0],zero,zero,zero
  9631  	mov	eax, r9d                        # rax = element count, zero-extended
  9632  	cmp	r9d, 8
  9633  	jb	.LBB1_228                       # fewer than 8 elements: skip the overlap test
  9634  # %bb.414:
  9635  	lea	rcx, [rdx + 4*rax]              # rcx = end of the source range
  9636  	cmp	rcx, r8
  9637  	jbe	.LBB1_582                       # source ends at/before destination start -> .LBB1_582 (outside this chunk)
  9638  # %bb.415:
  9639  	lea	rcx, [r8 + 4*rax]               # rcx = end of the destination range
  9640  	cmp	rcx, rdx
  9641  	jbe	.LBB1_582                       # destination ends at/before source start -> .LBB1_582
  9642  .LBB1_228:
  9643  	xor	ecx, ecx                        # rcx = element index, starting at 0
  9644  .LBB1_969:
  9645  	mov	rsi, rcx
  9646  	not	rsi
  9647  	add	rsi, rax                        # rsi = count - index - 1
  9648  	mov	rdi, rax
  9649  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9650  	je	.LBB1_971
  9651  .LBB1_970:                              # =>This Inner Loop Header: Depth=1
  9652  	movss	xmm1, dword ptr [rdx + 4*rcx]   # xmm1 = mem[0],zero,zero,zero
  9653  	subss	xmm1, xmm0
  9654  	movss	dword ptr [r8 + 4*rcx], xmm1
  9655  	add	rcx, 1
  9656  	add	rdi, -1
  9657  	jne	.LBB1_970
  9658  .LBB1_971:
  9659  	cmp	rsi, 3
  9660  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_969: skip the 4x loop
  9661  .LBB1_972:                              # =>This Inner Loop Header: Depth=1
  9662  	movss	xmm1, dword ptr [rdx + 4*rcx]   # xmm1 = mem[0],zero,zero,zero
  9663  	subss	xmm1, xmm0
  9664  	movss	dword ptr [r8 + 4*rcx], xmm1
  9665  	movss	xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero
  9666  	subss	xmm1, xmm0
  9667  	movss	dword ptr [r8 + 4*rcx + 4], xmm1
  9668  	movss	xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero
  9669  	subss	xmm1, xmm0
  9670  	movss	dword ptr [r8 + 4*rcx + 8], xmm1
  9671  	movss	xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero
  9672  	subss	xmm1, xmm0
  9673  	movss	dword ptr [r8 + 4*rcx + 12], xmm1
  9674  	add	rcx, 4
  9675  	cmp	rax, rcx
  9676  	jne	.LBB1_972                       # repeat until index == count
  9677  	jmp	.LBB1_1069
  9678  .LBB1_229:                              # 8-bit multiply: [r8][i] = low byte of [rdx][i] * scalar at [rcx]
  9679  	test	r9d, r9d
  9680  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9681  # %bb.230:
  9682  	mov	cl, byte ptr [rcx]              # cl = 8-bit scalar operand
  9683  	mov	r10d, r9d                       # r10 = element count, zero-extended
  9684  	cmp	r9d, 32
  9685  	jb	.LBB1_231                       # fewer than 32 elements: skip the overlap test
  9686  # %bb.417:
  9687  	lea	rax, [rdx + r10]                # rax = end of the source range
  9688  	cmp	rax, r8
  9689  	jbe	.LBB1_585                       # source ends at/before destination start -> .LBB1_585 (outside this chunk)
  9690  # %bb.418:
  9691  	lea	rax, [r8 + r10]                 # rax = end of the destination range
  9692  	cmp	rax, rdx
  9693  	jbe	.LBB1_585                       # destination ends at/before source start -> .LBB1_585
  9694  .LBB1_231:
  9695  	xor	edi, edi                        # rdi = element index, starting at 0
  9696  .LBB1_977:
  9697  	mov	r9, rdi
  9698  	not	r9
  9699  	add	r9, r10                         # r9 = count - index - 1
  9700  	mov	rsi, r10
  9701  	and	rsi, 3                          # rsi = count mod 4: iterations of the one-element loop
  9702  	je	.LBB1_979
  9703  .LBB1_978:                              # =>This Inner Loop Header: Depth=1
  9704  	movzx	eax, byte ptr [rdx + rdi]
  9705  	mul	cl                              # ax = al * cl; only al is stored below
  9706  	mov	byte ptr [r8 + rdi], al
  9707  	add	rdi, 1
  9708  	add	rsi, -1
  9709  	jne	.LBB1_978
  9710  .LBB1_979:
  9711  	cmp	r9, 3
  9712  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_977: skip the 4x loop
  9713  .LBB1_980:                              # =>This Inner Loop Header: Depth=1
  9714  	movzx	eax, byte ptr [rdx + rdi]
  9715  	mul	cl
  9716  	mov	byte ptr [r8 + rdi], al
  9717  	movzx	eax, byte ptr [rdx + rdi + 1]
  9718  	mul	cl
  9719  	mov	byte ptr [r8 + rdi + 1], al
  9720  	movzx	eax, byte ptr [rdx + rdi + 2]
  9721  	mul	cl
  9722  	mov	byte ptr [r8 + rdi + 2], al
  9723  	movzx	eax, byte ptr [rdx + rdi + 3]
  9724  	mul	cl
  9725  	mov	byte ptr [r8 + rdi + 3], al
  9726  	add	rdi, 4
  9727  	cmp	r10, rdi
  9728  	jne	.LBB1_980                       # repeat until index == count
  9729  	jmp	.LBB1_1069
  9730  .LBB1_232:                              # 8-bit multiply: [r8][i] = low byte of [rdx][i] * scalar at [rcx]
  9731  	test	r9d, r9d
  9732  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9733  # %bb.233:
  9734  	mov	cl, byte ptr [rcx]              # cl = 8-bit scalar operand
  9735  	mov	r10d, r9d                       # r10 = element count, zero-extended
  9736  	cmp	r9d, 32
  9737  	jb	.LBB1_234                       # fewer than 32 elements: skip the overlap test
  9738  # %bb.420:
  9739  	lea	rax, [rdx + r10]                # rax = end of the source range
  9740  	cmp	rax, r8
  9741  	jbe	.LBB1_588                       # source ends at/before destination start -> .LBB1_588 (outside this chunk)
  9742  # %bb.421:
  9743  	lea	rax, [r8 + r10]                 # rax = end of the destination range
  9744  	cmp	rax, rdx
  9745  	jbe	.LBB1_588                       # destination ends at/before source start -> .LBB1_588
  9746  .LBB1_234:
  9747  	xor	edi, edi                        # rdi = element index, starting at 0
  9748  .LBB1_985:
  9749  	mov	r9, rdi
  9750  	not	r9
  9751  	add	r9, r10                         # r9 = count - index - 1
  9752  	mov	rsi, r10
  9753  	and	rsi, 3                          # rsi = count mod 4: iterations of the one-element loop
  9754  	je	.LBB1_987
  9755  .LBB1_986:                              # =>This Inner Loop Header: Depth=1
  9756  	movzx	eax, byte ptr [rdx + rdi]
  9757  	mul	cl                              # ax = al * cl; only al is stored below
  9758  	mov	byte ptr [r8 + rdi], al
  9759  	add	rdi, 1
  9760  	add	rsi, -1
  9761  	jne	.LBB1_986
  9762  .LBB1_987:
  9763  	cmp	r9, 3
  9764  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_985: skip the 4x loop
  9765  .LBB1_988:                              # =>This Inner Loop Header: Depth=1
  9766  	movzx	eax, byte ptr [rdx + rdi]
  9767  	mul	cl
  9768  	mov	byte ptr [r8 + rdi], al
  9769  	movzx	eax, byte ptr [rdx + rdi + 1]
  9770  	mul	cl
  9771  	mov	byte ptr [r8 + rdi + 1], al
  9772  	movzx	eax, byte ptr [rdx + rdi + 2]
  9773  	mul	cl
  9774  	mov	byte ptr [r8 + rdi + 2], al
  9775  	movzx	eax, byte ptr [rdx + rdi + 3]
  9776  	mul	cl
  9777  	mov	byte ptr [r8 + rdi + 3], al
  9778  	add	rdi, 4
  9779  	cmp	r10, rdi
  9780  	jne	.LBB1_988                       # repeat until index == count
  9781  	jmp	.LBB1_1069
  9782  .LBB1_235:                              # 8-bit add: [r8][i] = [rdx][i] + scalar at [rcx]
  9783  	test	r9d, r9d
  9784  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9785  # %bb.236:
  9786  	mov	al, byte ptr [rcx]              # al = 8-bit scalar operand
  9787  	mov	r10d, r9d                       # r10 = element count, zero-extended
  9788  	cmp	r9d, 32
  9789  	jb	.LBB1_237                       # fewer than 32 elements: skip the overlap test
  9790  # %bb.423:
  9791  	lea	rcx, [rdx + r10]                # rcx = end of the source range
  9792  	cmp	rcx, r8
  9793  	jbe	.LBB1_591                       # source ends at/before destination start -> .LBB1_591 (outside this chunk)
  9794  # %bb.424:
  9795  	lea	rcx, [r8 + r10]                 # rcx = end of the destination range
  9796  	cmp	rcx, rdx
  9797  	jbe	.LBB1_591                       # destination ends at/before source start -> .LBB1_591
  9798  .LBB1_237:
  9799  	xor	esi, esi                        # rsi = element index, starting at 0
  9800  .LBB1_993:
  9801  	mov	r9, rsi
  9802  	not	r9
  9803  	add	r9, r10                         # r9 = count - index - 1
  9804  	mov	rdi, r10
  9805  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9806  	je	.LBB1_995
  9807  .LBB1_994:                              # =>This Inner Loop Header: Depth=1
  9808  	movzx	ecx, byte ptr [rdx + rsi]
  9809  	add	cl, al
  9810  	mov	byte ptr [r8 + rsi], cl
  9811  	add	rsi, 1
  9812  	add	rdi, -1
  9813  	jne	.LBB1_994
  9814  .LBB1_995:
  9815  	cmp	r9, 3
  9816  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_993: skip the 4x loop
  9817  .LBB1_996:                              # =>This Inner Loop Header: Depth=1
  9818  	movzx	ecx, byte ptr [rdx + rsi]
  9819  	add	cl, al
  9820  	mov	byte ptr [r8 + rsi], cl
  9821  	movzx	ecx, byte ptr [rdx + rsi + 1]
  9822  	add	cl, al
  9823  	mov	byte ptr [r8 + rsi + 1], cl
  9824  	movzx	ecx, byte ptr [rdx + rsi + 2]
  9825  	add	cl, al
  9826  	mov	byte ptr [r8 + rsi + 2], cl
  9827  	movzx	ecx, byte ptr [rdx + rsi + 3]
  9828  	add	cl, al
  9829  	mov	byte ptr [r8 + rsi + 3], cl
  9830  	add	rsi, 4
  9831  	cmp	r10, rsi
  9832  	jne	.LBB1_996                       # repeat until index == count
  9833  	jmp	.LBB1_1069
  9834  .LBB1_238:                              # 8-bit subtract: [r8][i] = [rdx][i] - scalar at [rcx]
  9835  	test	r9d, r9d
  9836  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9837  # %bb.239:
  9838  	mov	al, byte ptr [rcx]              # al = 8-bit scalar operand
  9839  	mov	r10d, r9d                       # r10 = element count, zero-extended
  9840  	cmp	r9d, 32
  9841  	jb	.LBB1_240                       # fewer than 32 elements: skip the overlap test
  9842  # %bb.426:
  9843  	lea	rcx, [rdx + r10]                # rcx = end of the source range
  9844  	cmp	rcx, r8
  9845  	jbe	.LBB1_594                       # source ends at/before destination start -> .LBB1_594 (outside this chunk)
  9846  # %bb.427:
  9847  	lea	rcx, [r8 + r10]                 # rcx = end of the destination range
  9848  	cmp	rcx, rdx
  9849  	jbe	.LBB1_594                       # destination ends at/before source start -> .LBB1_594
  9850  .LBB1_240:
  9851  	xor	esi, esi                        # rsi = element index, starting at 0
  9852  .LBB1_1001:
  9853  	mov	r9, rsi
  9854  	not	r9
  9855  	add	r9, r10                         # r9 = count - index - 1
  9856  	mov	rdi, r10
  9857  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9858  	je	.LBB1_1003
  9859  .LBB1_1002:                             # =>This Inner Loop Header: Depth=1
  9860  	movzx	ecx, byte ptr [rdx + rsi]
  9861  	sub	cl, al
  9862  	mov	byte ptr [r8 + rsi], cl
  9863  	add	rsi, 1
  9864  	add	rdi, -1
  9865  	jne	.LBB1_1002
  9866  .LBB1_1003:
  9867  	cmp	r9, 3
  9868  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_1001: skip the 4x loop
  9869  .LBB1_1004:                             # =>This Inner Loop Header: Depth=1
  9870  	movzx	ecx, byte ptr [rdx + rsi]
  9871  	sub	cl, al
  9872  	mov	byte ptr [r8 + rsi], cl
  9873  	movzx	ecx, byte ptr [rdx + rsi + 1]
  9874  	sub	cl, al
  9875  	mov	byte ptr [r8 + rsi + 1], cl
  9876  	movzx	ecx, byte ptr [rdx + rsi + 2]
  9877  	sub	cl, al
  9878  	mov	byte ptr [r8 + rsi + 2], cl
  9879  	movzx	ecx, byte ptr [rdx + rsi + 3]
  9880  	sub	cl, al
  9881  	mov	byte ptr [r8 + rsi + 3], cl
  9882  	add	rsi, 4
  9883  	cmp	r10, rsi
  9884  	jne	.LBB1_1004                      # repeat until index == count
  9885  	jmp	.LBB1_1069
  9886  .LBB1_241:                              # 8-bit add: [r8][i] = [rdx][i] + scalar at [rcx]
  9887  	test	r9d, r9d
  9888  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9889  # %bb.242:
  9890  	mov	al, byte ptr [rcx]              # al = 8-bit scalar operand
  9891  	mov	r10d, r9d                       # r10 = element count, zero-extended
  9892  	cmp	r9d, 32
  9893  	jb	.LBB1_243                       # fewer than 32 elements: skip the overlap test
  9894  # %bb.429:
  9895  	lea	rcx, [rdx + r10]                # rcx = end of the source range
  9896  	cmp	rcx, r8
  9897  	jbe	.LBB1_597                       # source ends at/before destination start -> .LBB1_597 (outside this chunk)
  9898  # %bb.430:
  9899  	lea	rcx, [r8 + r10]                 # rcx = end of the destination range
  9900  	cmp	rcx, rdx
  9901  	jbe	.LBB1_597                       # destination ends at/before source start -> .LBB1_597
  9902  .LBB1_243:
  9903  	xor	esi, esi                        # rsi = element index, starting at 0
  9904  .LBB1_1009:
  9905  	mov	r9, rsi
  9906  	not	r9
  9907  	add	r9, r10                         # r9 = count - index - 1
  9908  	mov	rdi, r10
  9909  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9910  	je	.LBB1_1011
  9911  .LBB1_1010:                             # =>This Inner Loop Header: Depth=1
  9912  	movzx	ecx, byte ptr [rdx + rsi]
  9913  	add	cl, al
  9914  	mov	byte ptr [r8 + rsi], cl
  9915  	add	rsi, 1
  9916  	add	rdi, -1
  9917  	jne	.LBB1_1010
  9918  .LBB1_1011:
  9919  	cmp	r9, 3
  9920  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_1009: skip the 4x loop
  9921  .LBB1_1012:                             # =>This Inner Loop Header: Depth=1
  9922  	movzx	ecx, byte ptr [rdx + rsi]
  9923  	add	cl, al
  9924  	mov	byte ptr [r8 + rsi], cl
  9925  	movzx	ecx, byte ptr [rdx + rsi + 1]
  9926  	add	cl, al
  9927  	mov	byte ptr [r8 + rsi + 1], cl
  9928  	movzx	ecx, byte ptr [rdx + rsi + 2]
  9929  	add	cl, al
  9930  	mov	byte ptr [r8 + rsi + 2], cl
  9931  	movzx	ecx, byte ptr [rdx + rsi + 3]
  9932  	add	cl, al
  9933  	mov	byte ptr [r8 + rsi + 3], cl
  9934  	add	rsi, 4
  9935  	cmp	r10, rsi
  9936  	jne	.LBB1_1012                      # repeat until index == count
  9937  	jmp	.LBB1_1069
  9938  .LBB1_244:                              # 8-bit subtract: [r8][i] = [rdx][i] - scalar at [rcx]
  9939  	test	r9d, r9d
  9940  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9941  # %bb.245:
  9942  	mov	al, byte ptr [rcx]              # al = 8-bit scalar operand
  9943  	mov	r10d, r9d                       # r10 = element count, zero-extended
  9944  	cmp	r9d, 32
  9945  	jb	.LBB1_246                       # fewer than 32 elements: skip the overlap test
  9946  # %bb.432:
  9947  	lea	rcx, [rdx + r10]                # rcx = end of the source range
  9948  	cmp	rcx, r8
  9949  	jbe	.LBB1_600                       # source ends at/before destination start -> .LBB1_600 (outside this chunk)
  9950  # %bb.433:
  9951  	lea	rcx, [r8 + r10]                 # rcx = end of the destination range
  9952  	cmp	rcx, rdx
  9953  	jbe	.LBB1_600                       # destination ends at/before source start -> .LBB1_600
  9954  .LBB1_246:
  9955  	xor	esi, esi                        # rsi = element index, starting at 0
  9956  .LBB1_1017:
  9957  	mov	r9, rsi
  9958  	not	r9
  9959  	add	r9, r10                         # r9 = count - index - 1
  9960  	mov	rdi, r10
  9961  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
  9962  	je	.LBB1_1019
  9963  .LBB1_1018:                             # =>This Inner Loop Header: Depth=1
  9964  	movzx	ecx, byte ptr [rdx + rsi]
  9965  	sub	cl, al
  9966  	mov	byte ptr [r8 + rsi], cl
  9967  	add	rsi, 1
  9968  	add	rdi, -1
  9969  	jne	.LBB1_1018
  9970  .LBB1_1019:
  9971  	cmp	r9, 3
  9972  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_1017: skip the 4x loop
  9973  .LBB1_1020:                             # =>This Inner Loop Header: Depth=1
  9974  	movzx	ecx, byte ptr [rdx + rsi]
  9975  	sub	cl, al
  9976  	mov	byte ptr [r8 + rsi], cl
  9977  	movzx	ecx, byte ptr [rdx + rsi + 1]
  9978  	sub	cl, al
  9979  	mov	byte ptr [r8 + rsi + 1], cl
  9980  	movzx	ecx, byte ptr [rdx + rsi + 2]
  9981  	sub	cl, al
  9982  	mov	byte ptr [r8 + rsi + 2], cl
  9983  	movzx	ecx, byte ptr [rdx + rsi + 3]
  9984  	sub	cl, al
  9985  	mov	byte ptr [r8 + rsi + 3], cl
  9986  	add	rsi, 4
  9987  	cmp	r10, rsi
  9988  	jne	.LBB1_1020                      # repeat until index == count
  9989  	jmp	.LBB1_1069
  9990  .LBB1_247:                              # 32-bit integer multiply: [r8][i] = [rdx][i] * scalar at [rcx]
  9991  	test	r9d, r9d
  9992  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
  9993  # %bb.248:
  9994  	mov	eax, dword ptr [rcx]            # eax = 32-bit scalar operand
  9995  	mov	r10d, r9d                       # r10 = element count, zero-extended
  9996  	cmp	r9d, 8
  9997  	jb	.LBB1_249                       # fewer than 8 elements: skip the overlap test
  9998  # %bb.435:
  9999  	lea	rcx, [rdx + 4*r10]              # rcx = end of the source range
 10000  	cmp	rcx, r8
 10001  	jbe	.LBB1_603                       # source ends at/before destination start -> .LBB1_603 (outside this chunk)
 10002  # %bb.436:
 10003  	lea	rcx, [r8 + 4*r10]               # rcx = end of the destination range
 10004  	cmp	rcx, rdx
 10005  	jbe	.LBB1_603                       # destination ends at/before source start -> .LBB1_603
 10006  .LBB1_249:
 10007  	xor	esi, esi                        # rsi = element index, starting at 0
 10008  .LBB1_1025:
 10009  	mov	r9, rsi
 10010  	not	r9
 10011  	add	r9, r10                         # r9 = count - index - 1
 10012  	mov	rdi, r10
 10013  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
 10014  	je	.LBB1_1027
 10015  .LBB1_1026:                             # =>This Inner Loop Header: Depth=1
 10016  	mov	ecx, dword ptr [rdx + 4*rsi]
 10017  	imul	ecx, eax
 10018  	mov	dword ptr [r8 + 4*rsi], ecx
 10019  	add	rsi, 1
 10020  	add	rdi, -1
 10021  	jne	.LBB1_1026
 10022  .LBB1_1027:
 10023  	cmp	r9, 3
 10024  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_1025: skip the 4x loop
 10025  .LBB1_1028:                             # =>This Inner Loop Header: Depth=1
 10026  	mov	ecx, dword ptr [rdx + 4*rsi]
 10027  	imul	ecx, eax
 10028  	mov	dword ptr [r8 + 4*rsi], ecx
 10029  	mov	ecx, dword ptr [rdx + 4*rsi + 4]
 10030  	imul	ecx, eax
 10031  	mov	dword ptr [r8 + 4*rsi + 4], ecx
 10032  	mov	ecx, dword ptr [rdx + 4*rsi + 8]
 10033  	imul	ecx, eax
 10034  	mov	dword ptr [r8 + 4*rsi + 8], ecx
 10035  	mov	ecx, dword ptr [rdx + 4*rsi + 12]
 10036  	imul	ecx, eax
 10037  	mov	dword ptr [r8 + 4*rsi + 12], ecx
 10038  	add	rsi, 4
 10039  	cmp	r10, rsi
 10040  	jne	.LBB1_1028                      # repeat until index == count
 10041  	jmp	.LBB1_1069
 10042  .LBB1_250:                              # 32-bit integer multiply: [r8][i] = [rdx][i] * scalar at [rcx]
 10043  	test	r9d, r9d
 10044  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
 10045  # %bb.251:
 10046  	mov	eax, dword ptr [rcx]            # eax = 32-bit scalar operand
 10047  	mov	r10d, r9d                       # r10 = element count, zero-extended
 10048  	cmp	r9d, 8
 10049  	jb	.LBB1_252                       # fewer than 8 elements: skip the overlap test
 10050  # %bb.438:
 10051  	lea	rcx, [rdx + 4*r10]              # rcx = end of the source range
 10052  	cmp	rcx, r8
 10053  	jbe	.LBB1_606                       # source ends at/before destination start -> .LBB1_606 (outside this chunk)
 10054  # %bb.439:
 10055  	lea	rcx, [r8 + 4*r10]               # rcx = end of the destination range
 10056  	cmp	rcx, rdx
 10057  	jbe	.LBB1_606                       # destination ends at/before source start -> .LBB1_606
 10058  .LBB1_252:
 10059  	xor	esi, esi                        # rsi = element index, starting at 0
 10060  .LBB1_1033:
 10061  	mov	r9, rsi
 10062  	not	r9
 10063  	add	r9, r10                         # r9 = count - index - 1
 10064  	mov	rdi, r10
 10065  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
 10066  	je	.LBB1_1035
 10067  .LBB1_1034:                             # =>This Inner Loop Header: Depth=1
 10068  	mov	ecx, dword ptr [rdx + 4*rsi]
 10069  	imul	ecx, eax
 10070  	mov	dword ptr [r8 + 4*rsi], ecx
 10071  	add	rsi, 1
 10072  	add	rdi, -1
 10073  	jne	.LBB1_1034
 10074  .LBB1_1035:
 10075  	cmp	r9, 3
 10076  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_1033: skip the 4x loop
 10077  .LBB1_1036:                             # =>This Inner Loop Header: Depth=1
 10078  	mov	ecx, dword ptr [rdx + 4*rsi]
 10079  	imul	ecx, eax
 10080  	mov	dword ptr [r8 + 4*rsi], ecx
 10081  	mov	ecx, dword ptr [rdx + 4*rsi + 4]
 10082  	imul	ecx, eax
 10083  	mov	dword ptr [r8 + 4*rsi + 4], ecx
 10084  	mov	ecx, dword ptr [rdx + 4*rsi + 8]
 10085  	imul	ecx, eax
 10086  	mov	dword ptr [r8 + 4*rsi + 8], ecx
 10087  	mov	ecx, dword ptr [rdx + 4*rsi + 12]
 10088  	imul	ecx, eax
 10089  	mov	dword ptr [r8 + 4*rsi + 12], ecx
 10090  	add	rsi, 4
 10091  	cmp	r10, rsi
 10092  	jne	.LBB1_1036                      # repeat until index == count
 10093  	jmp	.LBB1_1069
 10094  .LBB1_253:                              # 32-bit integer add: [r8][i] = [rdx][i] + scalar at [rcx]
 10095  	test	r9d, r9d
 10096  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
 10097  # %bb.254:
 10098  	mov	eax, dword ptr [rcx]            # eax = 32-bit scalar operand
 10099  	mov	r10d, r9d                       # r10 = element count, zero-extended
 10100  	cmp	r9d, 8
 10101  	jb	.LBB1_255                       # fewer than 8 elements: skip the overlap test
 10102  # %bb.441:
 10103  	lea	rcx, [rdx + 4*r10]              # rcx = end of the source range
 10104  	cmp	rcx, r8
 10105  	jbe	.LBB1_609                       # source ends at/before destination start -> .LBB1_609 (outside this chunk)
 10106  # %bb.442:
 10107  	lea	rcx, [r8 + 4*r10]               # rcx = end of the destination range
 10108  	cmp	rcx, rdx
 10109  	jbe	.LBB1_609                       # destination ends at/before source start -> .LBB1_609
 10110  .LBB1_255:
 10111  	xor	esi, esi                        # rsi = element index, starting at 0
 10112  .LBB1_1041:
 10113  	mov	r9, rsi
 10114  	not	r9
 10115  	add	r9, r10                         # r9 = count - index - 1
 10116  	mov	rdi, r10
 10117  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
 10118  	je	.LBB1_1043
 10119  .LBB1_1042:                             # =>This Inner Loop Header: Depth=1
 10120  	mov	ecx, dword ptr [rdx + 4*rsi]
 10121  	add	ecx, eax
 10122  	mov	dword ptr [r8 + 4*rsi], ecx
 10123  	add	rsi, 1
 10124  	add	rdi, -1
 10125  	jne	.LBB1_1042
 10126  .LBB1_1043:
 10127  	cmp	r9, 3
 10128  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_1041: skip the 4x loop
 10129  .LBB1_1044:                             # =>This Inner Loop Header: Depth=1
 10130  	mov	ecx, dword ptr [rdx + 4*rsi]
 10131  	add	ecx, eax
 10132  	mov	dword ptr [r8 + 4*rsi], ecx
 10133  	mov	ecx, dword ptr [rdx + 4*rsi + 4]
 10134  	add	ecx, eax
 10135  	mov	dword ptr [r8 + 4*rsi + 4], ecx
 10136  	mov	ecx, dword ptr [rdx + 4*rsi + 8]
 10137  	add	ecx, eax
 10138  	mov	dword ptr [r8 + 4*rsi + 8], ecx
 10139  	mov	ecx, dword ptr [rdx + 4*rsi + 12]
 10140  	add	ecx, eax
 10141  	mov	dword ptr [r8 + 4*rsi + 12], ecx
 10142  	add	rsi, 4
 10143  	cmp	r10, rsi
 10144  	jne	.LBB1_1044                      # repeat until index == count
 10145  	jmp	.LBB1_1069
 10146  .LBB1_256:                              # 32-bit integer subtract: [r8][i] = [rdx][i] - scalar at [rcx]
 10147  	test	r9d, r9d
 10148  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
 10149  # %bb.257:
 10150  	mov	eax, dword ptr [rcx]            # eax = 32-bit scalar operand
 10151  	mov	r10d, r9d                       # r10 = element count, zero-extended
 10152  	cmp	r9d, 8
 10153  	jb	.LBB1_258                       # fewer than 8 elements: skip the overlap test
 10154  # %bb.444:
 10155  	lea	rcx, [rdx + 4*r10]              # rcx = end of the source range
 10156  	cmp	rcx, r8
 10157  	jbe	.LBB1_612                       # source ends at/before destination start -> .LBB1_612 (outside this chunk)
 10158  # %bb.445:
 10159  	lea	rcx, [r8 + 4*r10]               # rcx = end of the destination range
 10160  	cmp	rcx, rdx
 10161  	jbe	.LBB1_612                       # destination ends at/before source start -> .LBB1_612
 10162  .LBB1_258:
 10163  	xor	esi, esi                        # rsi = element index, starting at 0
 10164  .LBB1_1049:
 10165  	mov	r9, rsi
 10166  	not	r9
 10167  	add	r9, r10                         # r9 = count - index - 1
 10168  	mov	rdi, r10
 10169  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
 10170  	je	.LBB1_1051
 10171  .LBB1_1050:                             # =>This Inner Loop Header: Depth=1
 10172  	mov	ecx, dword ptr [rdx + 4*rsi]
 10173  	sub	ecx, eax
 10174  	mov	dword ptr [r8 + 4*rsi], ecx
 10175  	add	rsi, 1
 10176  	add	rdi, -1
 10177  	jne	.LBB1_1050
 10178  .LBB1_1051:
 10179  	cmp	r9, 3
 10180  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_1049: skip the 4x loop
 10181  .LBB1_1052:                             # =>This Inner Loop Header: Depth=1
 10182  	mov	ecx, dword ptr [rdx + 4*rsi]
 10183  	sub	ecx, eax
 10184  	mov	dword ptr [r8 + 4*rsi], ecx
 10185  	mov	ecx, dword ptr [rdx + 4*rsi + 4]
 10186  	sub	ecx, eax
 10187  	mov	dword ptr [r8 + 4*rsi + 4], ecx
 10188  	mov	ecx, dword ptr [rdx + 4*rsi + 8]
 10189  	sub	ecx, eax
 10190  	mov	dword ptr [r8 + 4*rsi + 8], ecx
 10191  	mov	ecx, dword ptr [rdx + 4*rsi + 12]
 10192  	sub	ecx, eax
 10193  	mov	dword ptr [r8 + 4*rsi + 12], ecx
 10194  	add	rsi, 4
 10195  	cmp	r10, rsi
 10196  	jne	.LBB1_1052                      # repeat until index == count
 10197  	jmp	.LBB1_1069
 10198  .LBB1_259:                              # 32-bit integer add: [r8][i] = [rdx][i] + scalar at [rcx]
 10199  	test	r9d, r9d
 10200  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
 10201  # %bb.260:
 10202  	mov	eax, dword ptr [rcx]            # eax = 32-bit scalar operand
 10203  	mov	r10d, r9d                       # r10 = element count, zero-extended
 10204  	cmp	r9d, 8
 10205  	jb	.LBB1_261                       # fewer than 8 elements: skip the overlap test
 10206  # %bb.447:
 10207  	lea	rcx, [rdx + 4*r10]              # rcx = end of the source range
 10208  	cmp	rcx, r8
 10209  	jbe	.LBB1_615                       # source ends at/before destination start -> .LBB1_615 (outside this chunk)
 10210  # %bb.448:
 10211  	lea	rcx, [r8 + 4*r10]               # rcx = end of the destination range
 10212  	cmp	rcx, rdx
 10213  	jbe	.LBB1_615                       # destination ends at/before source start -> .LBB1_615
 10214  .LBB1_261:
 10215  	xor	esi, esi                        # rsi = element index, starting at 0
 10216  .LBB1_1057:
 10217  	mov	r9, rsi
 10218  	not	r9
 10219  	add	r9, r10                         # r9 = count - index - 1
 10220  	mov	rdi, r10
 10221  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
 10222  	je	.LBB1_1059
 10223  .LBB1_1058:                             # =>This Inner Loop Header: Depth=1
 10224  	mov	ecx, dword ptr [rdx + 4*rsi]
 10225  	add	ecx, eax
 10226  	mov	dword ptr [r8 + 4*rsi], ecx
 10227  	add	rsi, 1
 10228  	add	rdi, -1
 10229  	jne	.LBB1_1058
 10230  .LBB1_1059:
 10231  	cmp	r9, 3
 10232  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_1057: skip the 4x loop
 10233  .LBB1_1060:                             # =>This Inner Loop Header: Depth=1
 10234  	mov	ecx, dword ptr [rdx + 4*rsi]
 10235  	add	ecx, eax
 10236  	mov	dword ptr [r8 + 4*rsi], ecx
 10237  	mov	ecx, dword ptr [rdx + 4*rsi + 4]
 10238  	add	ecx, eax
 10239  	mov	dword ptr [r8 + 4*rsi + 4], ecx
 10240  	mov	ecx, dword ptr [rdx + 4*rsi + 8]
 10241  	add	ecx, eax
 10242  	mov	dword ptr [r8 + 4*rsi + 8], ecx
 10243  	mov	ecx, dword ptr [rdx + 4*rsi + 12]
 10244  	add	ecx, eax
 10245  	mov	dword ptr [r8 + 4*rsi + 12], ecx
 10246  	add	rsi, 4
 10247  	cmp	r10, rsi
 10248  	jne	.LBB1_1060                      # repeat until index == count
 10249  	jmp	.LBB1_1069
 10250  .LBB1_262:                              # 32-bit integer subtract: [r8][i] = [rdx][i] - scalar at [rcx]
 10251  	test	r9d, r9d
 10252  	jle	.LBB1_1069                      # count in r9d <= 0: nothing to do
 10253  # %bb.263:
 10254  	mov	eax, dword ptr [rcx]            # eax = 32-bit scalar operand
 10255  	mov	r10d, r9d                       # r10 = element count, zero-extended
 10256  	cmp	r9d, 8
 10257  	jb	.LBB1_264                       # fewer than 8 elements: skip the overlap test
 10258  # %bb.450:
 10259  	lea	rcx, [rdx + 4*r10]              # rcx = end of the source range
 10260  	cmp	rcx, r8
 10261  	jbe	.LBB1_618                       # source ends at/before destination start -> .LBB1_618 (outside this chunk)
 10262  # %bb.451:
 10263  	lea	rcx, [r8 + 4*r10]               # rcx = end of the destination range
 10264  	cmp	rcx, rdx
 10265  	jbe	.LBB1_618                       # destination ends at/before source start -> .LBB1_618
 10266  .LBB1_264:
 10267  	xor	esi, esi                        # rsi = element index, starting at 0
 10268  .LBB1_1065:
 10269  	mov	r9, rsi
 10270  	not	r9
 10271  	add	r9, r10                         # r9 = count - index - 1
 10272  	mov	rdi, r10
 10273  	and	rdi, 3                          # rdi = count mod 4: iterations of the one-element loop
 10274  	je	.LBB1_1067
 10275  .LBB1_1066:                             # =>This Inner Loop Header: Depth=1
 10276  	mov	ecx, dword ptr [rdx + 4*rsi]
 10277  	sub	ecx, eax
 10278  	mov	dword ptr [r8 + 4*rsi], ecx
 10279  	add	rsi, 1
 10280  	add	rdi, -1
 10281  	jne	.LBB1_1066
 10282  .LBB1_1067:
 10283  	cmp	r9, 3
 10284  	jb	.LBB1_1069                      # fewer than 4 elements remained at .LBB1_1065: skip the 4x loop
 10285  .LBB1_1068:                             # =>This Inner Loop Header: Depth=1
 10286  	mov	ecx, dword ptr [rdx + 4*rsi]
 10287  	sub	ecx, eax
 10288  	mov	dword ptr [r8 + 4*rsi], ecx
 10289  	mov	ecx, dword ptr [rdx + 4*rsi + 4]
 10290  	sub	ecx, eax
 10291  	mov	dword ptr [r8 + 4*rsi + 4], ecx
 10292  	mov	ecx, dword ptr [rdx + 4*rsi + 8]
 10293  	sub	ecx, eax
 10294  	mov	dword ptr [r8 + 4*rsi + 8], ecx
 10295  	mov	ecx, dword ptr [rdx + 4*rsi + 12]
 10296  	sub	ecx, eax
 10297  	mov	dword ptr [r8 + 4*rsi + 12], ecx
 10298  	add	rsi, 4
 10299  	cmp	r10, rsi
 10300  	jne	.LBB1_1068                      # repeat until index == count
 10301  	jmp	.LBB1_1069
 10302  .LBB1_319:	# 64-bit multiply, scalar code: [r8 + 8*i] = [rdx + 8*i] * rax (rax, rsi and r9 are set before this excerpt)
 10303  	and	esi, -4	# round esi down to a multiple of 4 (the 32-bit write zeroes the upper half of rsi): bound of the 4x-unrolled loop
 10304  	xor	edi, edi	# rdi = element index, starting at 0
 10305  .LBB1_320:                              # =>This Inner Loop Header: Depth=1
 10306  	mov	rcx, qword ptr [rdx + 8*rdi]
 10307  	imul	rcx, rax	# two-operand imul keeps the low 64 bits of the product
 10308  	mov	qword ptr [r8 + 8*rdi], rcx
 10309  	mov	rcx, qword ptr [rdx + 8*rdi + 8]
 10310  	imul	rcx, rax
 10311  	mov	qword ptr [r8 + 8*rdi + 8], rcx
 10312  	mov	rcx, qword ptr [rdx + 8*rdi + 16]
 10313  	imul	rcx, rax
 10314  	mov	qword ptr [r8 + 8*rdi + 16], rcx
 10315  	mov	rcx, qword ptr [rdx + 8*rdi + 24]
 10316  	imul	rcx, rax
 10317  	mov	qword ptr [r8 + 8*rdi + 24], rcx
 10318  	add	rdi, 4	# four elements per iteration
 10319  	cmp	rsi, rdi
 10320  	jne	.LBB1_320
 10321  .LBB1_321:
 10322  	test	r9, r9	# r9 is the trip count of the remainder loop below; zero means done
 10323  	je	.LBB1_1069
 10324  # %bb.322:
 10325  	lea	rsi, [r8 + 8*rdi]	# rebase the destination at the first unprocessed element
 10326  	lea	rdx, [rdx + 8*rdi]	# rebase the source the same way
 10327  	xor	edi, edi	# restart the index at 0 relative to the rebased pointers
 10328  .LBB1_323:                              # =>This Inner Loop Header: Depth=1
 10329  	mov	rcx, qword ptr [rdx + 8*rdi]	# one element per iteration
 10330  	imul	rcx, rax
 10331  	mov	qword ptr [rsi + 8*rdi], rcx
 10332  	add	rdi, 1
 10333  	cmp	r9, rdi	# run exactly r9 iterations
 10334  	jne	.LBB1_323
 10335  	jmp	.LBB1_1069	# finished: shared function exit
 10336  .LBB1_324:	# 64-bit multiply, scalar code: [r8 + 8*i] = [rdx + 8*i] * rax (rax, rsi and r9 are set before this excerpt)
 10337  	and	esi, -4	# round esi down to a multiple of 4 (upper half of rsi is zeroed): bound of the 4x-unrolled loop
 10338  	xor	edi, edi	# rdi = element index, starting at 0
 10339  .LBB1_325:                              # =>This Inner Loop Header: Depth=1
 10340  	mov	rcx, qword ptr [rdx + 8*rdi]
 10341  	imul	rcx, rax	# low 64 bits of the product
 10342  	mov	qword ptr [r8 + 8*rdi], rcx
 10343  	mov	rcx, qword ptr [rdx + 8*rdi + 8]
 10344  	imul	rcx, rax
 10345  	mov	qword ptr [r8 + 8*rdi + 8], rcx
 10346  	mov	rcx, qword ptr [rdx + 8*rdi + 16]
 10347  	imul	rcx, rax
 10348  	mov	qword ptr [r8 + 8*rdi + 16], rcx
 10349  	mov	rcx, qword ptr [rdx + 8*rdi + 24]
 10350  	imul	rcx, rax
 10351  	mov	qword ptr [r8 + 8*rdi + 24], rcx
 10352  	add	rdi, 4	# four elements per iteration
 10353  	cmp	rsi, rdi
 10354  	jne	.LBB1_325
 10355  .LBB1_326:
 10356  	test	r9, r9	# r9 is the trip count of the remainder loop below; zero means done
 10357  	je	.LBB1_1069
 10358  # %bb.327:
 10359  	lea	rsi, [r8 + 8*rdi]	# rebase the destination at the first unprocessed element
 10360  	lea	rdx, [rdx + 8*rdi]	# rebase the source the same way
 10361  	xor	edi, edi	# restart the index at 0 relative to the rebased pointers
 10362  .LBB1_328:                              # =>This Inner Loop Header: Depth=1
 10363  	mov	rcx, qword ptr [rdx + 8*rdi]	# one element per iteration
 10364  	imul	rcx, rax
 10365  	mov	qword ptr [rsi + 8*rdi], rcx
 10366  	add	rdi, 1
 10367  	cmp	r9, rdi	# run exactly r9 iterations
 10368  	jne	.LBB1_328
 10369  	jmp	.LBB1_1069	# finished: shared function exit
 10370  .LBB1_377:	# 64-bit multiply, scalar code: [r8 + 8*i] = [rdx + 8*i] * rax (rax, rsi and r9 are set before this excerpt)
 10371  	and	esi, -4	# round esi down to a multiple of 4 (upper half of rsi is zeroed): bound of the 4x-unrolled loop
 10372  	xor	edi, edi	# rdi = element index, starting at 0
 10373  .LBB1_378:                              # =>This Inner Loop Header: Depth=1
 10374  	mov	rcx, qword ptr [rdx + 8*rdi]
 10375  	imul	rcx, rax	# low 64 bits of the product
 10376  	mov	qword ptr [r8 + 8*rdi], rcx
 10377  	mov	rcx, qword ptr [rdx + 8*rdi + 8]
 10378  	imul	rcx, rax
 10379  	mov	qword ptr [r8 + 8*rdi + 8], rcx
 10380  	mov	rcx, qword ptr [rdx + 8*rdi + 16]
 10381  	imul	rcx, rax
 10382  	mov	qword ptr [r8 + 8*rdi + 16], rcx
 10383  	mov	rcx, qword ptr [rdx + 8*rdi + 24]
 10384  	imul	rcx, rax
 10385  	mov	qword ptr [r8 + 8*rdi + 24], rcx
 10386  	add	rdi, 4	# four elements per iteration
 10387  	cmp	rsi, rdi
 10388  	jne	.LBB1_378
 10389  .LBB1_379:
 10390  	test	r9, r9	# r9 is the trip count of the remainder loop below; zero means done
 10391  	je	.LBB1_1069
 10392  # %bb.380:
 10393  	lea	rsi, [r8 + 8*rdi]	# rebase the destination at the first unprocessed element
 10394  	lea	rdx, [rdx + 8*rdi]	# rebase the source the same way
 10395  	xor	edi, edi	# restart the index at 0 relative to the rebased pointers
 10396  .LBB1_381:                              # =>This Inner Loop Header: Depth=1
 10397  	mov	rcx, qword ptr [rdx + 8*rdi]	# one element per iteration
 10398  	imul	rcx, rax
 10399  	mov	qword ptr [rsi + 8*rdi], rcx
 10400  	add	rdi, 1
 10401  	cmp	r9, rdi	# run exactly r9 iterations
 10402  	jne	.LBB1_381
 10403  	jmp	.LBB1_1069	# finished: shared function exit
 10404  .LBB1_385:	# 64-bit multiply, scalar code: [r8 + 8*i] = [rdx + 8*i] * rax (rax, rsi and r9 are set before this excerpt)
 10405  	and	esi, -4	# round esi down to a multiple of 4 (upper half of rsi is zeroed): bound of the 4x-unrolled loop
 10406  	xor	edi, edi	# rdi = element index, starting at 0
 10407  .LBB1_386:                              # =>This Inner Loop Header: Depth=1
 10408  	mov	rcx, qword ptr [rdx + 8*rdi]
 10409  	imul	rcx, rax	# low 64 bits of the product
 10410  	mov	qword ptr [r8 + 8*rdi], rcx
 10411  	mov	rcx, qword ptr [rdx + 8*rdi + 8]
 10412  	imul	rcx, rax
 10413  	mov	qword ptr [r8 + 8*rdi + 8], rcx
 10414  	mov	rcx, qword ptr [rdx + 8*rdi + 16]
 10415  	imul	rcx, rax
 10416  	mov	qword ptr [r8 + 8*rdi + 16], rcx
 10417  	mov	rcx, qword ptr [rdx + 8*rdi + 24]
 10418  	imul	rcx, rax
 10419  	mov	qword ptr [r8 + 8*rdi + 24], rcx
 10420  	add	rdi, 4	# four elements per iteration
 10421  	cmp	rsi, rdi
 10422  	jne	.LBB1_386
 10423  .LBB1_387:
 10424  	test	r9, r9	# r9 is the trip count of the remainder loop below; zero means done
 10425  	je	.LBB1_1069
 10426  # %bb.388:
 10427  	lea	rsi, [r8 + 8*rdi]	# rebase the destination at the first unprocessed element
 10428  	lea	rdx, [rdx + 8*rdi]	# rebase the source the same way
 10429  	xor	edi, edi	# restart the index at 0 relative to the rebased pointers
 10430  .LBB1_389:                              # =>This Inner Loop Header: Depth=1
 10431  	mov	rcx, qword ptr [rdx + 8*rdi]	# one element per iteration
 10432  	imul	rcx, rax
 10433  	mov	qword ptr [rsi + 8*rdi], rcx
 10434  	add	rdi, 1
 10435  	cmp	r9, rdi	# run exactly r9 iterations
 10436  	jne	.LBB1_389	# on exit, execution falls through to the label that follows this block
 10437  .LBB1_1069:	# common exit reached by the code paths in this excerpt
 10438  	mov	rsp, rbp	# restore the stack pointer from rbp (the matching prologue is outside this excerpt)
 10439  	pop	rbp	# restore the caller's rbp
 10440  	ret
 10441  .LBB1_453:	# 32-bit multiply by a broadcast scalar: [r8 + 4*i] = [rdx + 4*i] * eax, in chunks of 8 elements
 10442  	mov	esi, r10d
 10443  	and	esi, -8	# rsi = r10d rounded down to a multiple of 8 (r10d is set before this excerpt; presumably the element count)
 10444  	movd	xmm0, eax	# scalar into lane 0; the pshufd below copies it to all four lanes
 10445  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 10446  	lea	rcx, [rsi - 8]
 10447  	mov	r9, rcx
 10448  	shr	r9, 3
 10449  	add	r9, 1	# r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
 10450  	test	rcx, rcx
 10451  	je	.LBB1_621	# exactly one chunk: bypass the paired loop (.LBB1_621 is outside this excerpt)
 10452  # %bb.454:
 10453  	mov	rcx, r9
 10454  	and	rcx, -2
 10455  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10456  	xor	edi, edi	# rdi = element index
 10457  .LBB1_455:                              # =>This Inner Loop Header: Depth=1
 10458  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]	# first chunk of the pair: two unaligned 16-byte loads
 10459  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 10460  	pmulld	xmm1, xmm0	# per-lane 32-bit multiply keeping the low 32 bits
 10461  	pmulld	xmm2, xmm0
 10462  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 10463  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 10464  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 32]	# second chunk of the pair
 10465  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 10466  	pmulld	xmm1, xmm0
 10467  	pmulld	xmm2, xmm0
 10468  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 10469  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 10470  	add	rdi, 16	# 16 elements (two chunks) per pass
 10471  	add	rcx, 2
 10472  	jne	.LBB1_455
 10473  	jmp	.LBB1_622	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10474  .LBB1_456:	# 32-bit multiply by a broadcast scalar: [r8 + 4*i] = [rdx + 4*i] * eax, in chunks of 8 elements
 10475  	mov	esi, r10d
 10476  	and	esi, -8	# rsi = r10d rounded down to a multiple of 8 (r10d is set before this excerpt; presumably the element count)
 10477  	movd	xmm0, eax	# scalar into lane 0; the pshufd below copies it to all four lanes
 10478  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 10479  	lea	rcx, [rsi - 8]
 10480  	mov	r9, rcx
 10481  	shr	r9, 3
 10482  	add	r9, 1	# r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
 10483  	test	rcx, rcx
 10484  	je	.LBB1_629	# exactly one chunk: bypass the paired loop (.LBB1_629 is outside this excerpt)
 10485  # %bb.457:
 10486  	mov	rcx, r9
 10487  	and	rcx, -2
 10488  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10489  	xor	edi, edi	# rdi = element index
 10490  .LBB1_458:                              # =>This Inner Loop Header: Depth=1
 10491  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]	# first chunk of the pair: two unaligned 16-byte loads
 10492  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 10493  	pmulld	xmm1, xmm0	# per-lane 32-bit multiply keeping the low 32 bits
 10494  	pmulld	xmm2, xmm0
 10495  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 10496  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 10497  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 32]	# second chunk of the pair
 10498  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 10499  	pmulld	xmm1, xmm0
 10500  	pmulld	xmm2, xmm0
 10501  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 10502  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 10503  	add	rdi, 16	# 16 elements (two chunks) per pass
 10504  	add	rcx, 2
 10505  	jne	.LBB1_458
 10506  	jmp	.LBB1_630	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10507  .LBB1_459:	# 32-bit add of a broadcast scalar: [r8 + 4*i] = [rdx + 4*i] + eax, in chunks of 8 elements
 10508  	mov	esi, r10d
 10509  	and	esi, -8	# rsi = r10d rounded down to a multiple of 8 (r10d is set before this excerpt; presumably the element count)
 10510  	movd	xmm0, eax	# scalar into lane 0; the pshufd below copies it to all four lanes
 10511  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 10512  	lea	rcx, [rsi - 8]
 10513  	mov	r9, rcx
 10514  	shr	r9, 3
 10515  	add	r9, 1	# r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
 10516  	test	rcx, rcx
 10517  	je	.LBB1_637	# exactly one chunk: bypass the paired loop (.LBB1_637 is outside this excerpt)
 10518  # %bb.460:
 10519  	mov	rcx, r9
 10520  	and	rcx, -2
 10521  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10522  	xor	edi, edi	# rdi = element index
 10523  .LBB1_461:                              # =>This Inner Loop Header: Depth=1
 10524  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]	# first chunk of the pair: two unaligned 16-byte loads
 10525  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 10526  	paddd	xmm1, xmm0	# per-lane 32-bit wrapping add
 10527  	paddd	xmm2, xmm0
 10528  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 10529  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 10530  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 32]	# second chunk of the pair
 10531  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 10532  	paddd	xmm1, xmm0
 10533  	paddd	xmm2, xmm0
 10534  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 10535  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 10536  	add	rdi, 16	# 16 elements (two chunks) per pass
 10537  	add	rcx, 2
 10538  	jne	.LBB1_461
 10539  	jmp	.LBB1_638	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10540  .LBB1_462:	# 32-bit subtract of a broadcast scalar: [r8 + 4*i] = [rdx + 4*i] - eax, in chunks of 8 elements
 10541  	mov	esi, r10d
 10542  	and	esi, -8	# rsi = r10d rounded down to a multiple of 8 (r10d is set before this excerpt; presumably the element count)
 10543  	movd	xmm0, eax	# scalar into lane 0; the pshufd below copies it to all four lanes
 10544  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 10545  	lea	rcx, [rsi - 8]
 10546  	mov	r9, rcx
 10547  	shr	r9, 3
 10548  	add	r9, 1	# r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
 10549  	test	rcx, rcx
 10550  	je	.LBB1_645	# exactly one chunk: bypass the paired loop (.LBB1_645 is outside this excerpt)
 10551  # %bb.463:
 10552  	mov	rcx, r9
 10553  	and	rcx, -2
 10554  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10555  	xor	edi, edi	# rdi = element index
 10556  .LBB1_464:                              # =>This Inner Loop Header: Depth=1
 10557  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]	# first chunk of the pair: two unaligned 16-byte loads
 10558  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 10559  	psubd	xmm1, xmm0	# per-lane 32-bit wrapping subtract
 10560  	psubd	xmm2, xmm0
 10561  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 10562  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 10563  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 32]	# second chunk of the pair
 10564  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 10565  	psubd	xmm1, xmm0
 10566  	psubd	xmm2, xmm0
 10567  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 10568  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 10569  	add	rdi, 16	# 16 elements (two chunks) per pass
 10570  	add	rcx, 2
 10571  	jne	.LBB1_464
 10572  	jmp	.LBB1_646	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10573  .LBB1_465:	# 32-bit add of a broadcast scalar: [r8 + 4*i] = [rdx + 4*i] + eax, in chunks of 8 elements
 10574  	mov	esi, r10d
 10575  	and	esi, -8	# rsi = r10d rounded down to a multiple of 8 (r10d is set before this excerpt; presumably the element count)
 10576  	movd	xmm0, eax	# scalar into lane 0; the pshufd below copies it to all four lanes
 10577  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 10578  	lea	rcx, [rsi - 8]
 10579  	mov	r9, rcx
 10580  	shr	r9, 3
 10581  	add	r9, 1	# r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
 10582  	test	rcx, rcx
 10583  	je	.LBB1_653	# exactly one chunk: bypass the paired loop (.LBB1_653 is outside this excerpt)
 10584  # %bb.466:
 10585  	mov	rcx, r9
 10586  	and	rcx, -2
 10587  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10588  	xor	edi, edi	# rdi = element index
 10589  .LBB1_467:                              # =>This Inner Loop Header: Depth=1
 10590  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]	# first chunk of the pair: two unaligned 16-byte loads
 10591  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 10592  	paddd	xmm1, xmm0	# per-lane 32-bit wrapping add
 10593  	paddd	xmm2, xmm0
 10594  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 10595  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 10596  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 32]	# second chunk of the pair
 10597  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 10598  	paddd	xmm1, xmm0
 10599  	paddd	xmm2, xmm0
 10600  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 10601  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 10602  	add	rdi, 16	# 16 elements (two chunks) per pass
 10603  	add	rcx, 2
 10604  	jne	.LBB1_467
 10605  	jmp	.LBB1_654	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10606  .LBB1_468:	# 32-bit subtract of a broadcast scalar: [r8 + 4*i] = [rdx + 4*i] - eax, in chunks of 8 elements
 10607  	mov	esi, r10d
 10608  	and	esi, -8	# rsi = r10d rounded down to a multiple of 8 (r10d is set before this excerpt; presumably the element count)
 10609  	movd	xmm0, eax	# scalar into lane 0; the pshufd below copies it to all four lanes
 10610  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 10611  	lea	rcx, [rsi - 8]
 10612  	mov	r9, rcx
 10613  	shr	r9, 3
 10614  	add	r9, 1	# r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
 10615  	test	rcx, rcx
 10616  	je	.LBB1_661	# exactly one chunk: bypass the paired loop (.LBB1_661 is outside this excerpt)
 10617  # %bb.469:
 10618  	mov	rcx, r9
 10619  	and	rcx, -2
 10620  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10621  	xor	edi, edi	# rdi = element index
 10622  .LBB1_470:                              # =>This Inner Loop Header: Depth=1
 10623  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]	# first chunk of the pair: two unaligned 16-byte loads
 10624  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 10625  	psubd	xmm1, xmm0	# per-lane 32-bit wrapping subtract
 10626  	psubd	xmm2, xmm0
 10627  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 10628  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 10629  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 32]	# second chunk of the pair
 10630  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 10631  	psubd	xmm1, xmm0
 10632  	psubd	xmm2, xmm0
 10633  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 10634  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 10635  	add	rdi, 16	# 16 elements (two chunks) per pass
 10636  	add	rcx, 2
 10637  	jne	.LBB1_470
 10638  	jmp	.LBB1_662	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10639  .LBB1_471:	# packed-double multiply by a broadcast scalar: [r8 + 8*i] = [rdx + 8*i] * xmm0[0], in chunks of 4 elements
 10640  	mov	ecx, eax
 10641  	and	ecx, -4	# rcx = eax rounded down to a multiple of 4 (eax is set before this excerpt; presumably the element count)
 10642  	movddup	xmm1, xmm0                      # xmm1 = xmm0[0,0]
 10643  	lea	rsi, [rcx - 4]
 10644  	mov	r9, rsi
 10645  	shr	r9, 2
 10646  	add	r9, 1	# r9 = (rcx - 4) / 4 + 1 = number of 4-element chunks
 10647  	test	rsi, rsi
 10648  	je	.LBB1_669	# exactly one chunk: bypass the paired loop (.LBB1_669 is outside this excerpt)
 10649  # %bb.472:
 10650  	mov	rsi, r9
 10651  	and	rsi, -2
 10652  	neg	rsi	# rsi = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10653  	xor	edi, edi	# rdi = element index
 10654  .LBB1_473:                              # =>This Inner Loop Header: Depth=1
 10655  	movupd	xmm2, xmmword ptr [rdx + 8*rdi]	# first chunk of the pair: two unaligned loads of 2 doubles each
 10656  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 10657  	mulpd	xmm2, xmm1	# per-lane double-precision multiply
 10658  	mulpd	xmm3, xmm1
 10659  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 10660  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 10661  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 32]	# second chunk of the pair
 10662  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 48]
 10663  	mulpd	xmm2, xmm1
 10664  	mulpd	xmm3, xmm1
 10665  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
 10666  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm3
 10667  	add	rdi, 8	# 8 elements (two chunks) per pass
 10668  	add	rsi, 2
 10669  	jne	.LBB1_473
 10670  	jmp	.LBB1_670	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10671  .LBB1_474:	# packed-double multiply by a broadcast scalar: [r8 + 8*i] = [rdx + 8*i] * xmm0[0], in chunks of 4 elements
 10672  	mov	ecx, eax
 10673  	and	ecx, -4	# rcx = eax rounded down to a multiple of 4 (eax is set before this excerpt; presumably the element count)
 10674  	movddup	xmm1, xmm0                      # xmm1 = xmm0[0,0]
 10675  	lea	rsi, [rcx - 4]
 10676  	mov	r9, rsi
 10677  	shr	r9, 2
 10678  	add	r9, 1	# r9 = (rcx - 4) / 4 + 1 = number of 4-element chunks
 10679  	test	rsi, rsi
 10680  	je	.LBB1_677	# exactly one chunk: bypass the paired loop (.LBB1_677 is outside this excerpt)
 10681  # %bb.475:
 10682  	mov	rsi, r9
 10683  	and	rsi, -2
 10684  	neg	rsi	# rsi = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10685  	xor	edi, edi	# rdi = element index
 10686  .LBB1_476:                              # =>This Inner Loop Header: Depth=1
 10687  	movupd	xmm2, xmmword ptr [rdx + 8*rdi]	# first chunk of the pair: two unaligned loads of 2 doubles each
 10688  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 10689  	mulpd	xmm2, xmm1	# per-lane double-precision multiply
 10690  	mulpd	xmm3, xmm1
 10691  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 10692  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 10693  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 32]	# second chunk of the pair
 10694  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 48]
 10695  	mulpd	xmm2, xmm1
 10696  	mulpd	xmm3, xmm1
 10697  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
 10698  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm3
 10699  	add	rdi, 8	# 8 elements (two chunks) per pass
 10700  	add	rsi, 2
 10701  	jne	.LBB1_476
 10702  	jmp	.LBB1_678	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10703  .LBB1_477:	# packed-double add of a broadcast scalar: [r8 + 8*i] = [rdx + 8*i] + xmm0[0], in chunks of 4 elements
 10704  	mov	ecx, eax
 10705  	and	ecx, -4	# rcx = eax rounded down to a multiple of 4 (eax is set before this excerpt; presumably the element count)
 10706  	movddup	xmm1, xmm0                      # xmm1 = xmm0[0,0]
 10707  	lea	rsi, [rcx - 4]
 10708  	mov	r9, rsi
 10709  	shr	r9, 2
 10710  	add	r9, 1	# r9 = (rcx - 4) / 4 + 1 = number of 4-element chunks
 10711  	test	rsi, rsi
 10712  	je	.LBB1_685	# exactly one chunk: bypass the paired loop (.LBB1_685 is outside this excerpt)
 10713  # %bb.478:
 10714  	mov	rsi, r9
 10715  	and	rsi, -2
 10716  	neg	rsi	# rsi = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10717  	xor	edi, edi	# rdi = element index
 10718  .LBB1_479:                              # =>This Inner Loop Header: Depth=1
 10719  	movupd	xmm2, xmmword ptr [rdx + 8*rdi]	# first chunk of the pair: two unaligned loads of 2 doubles each
 10720  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 10721  	addpd	xmm2, xmm1	# per-lane double-precision add
 10722  	addpd	xmm3, xmm1
 10723  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 10724  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 10725  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 32]	# second chunk of the pair
 10726  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 48]
 10727  	addpd	xmm2, xmm1
 10728  	addpd	xmm3, xmm1
 10729  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
 10730  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm3
 10731  	add	rdi, 8	# 8 elements (two chunks) per pass
 10732  	add	rsi, 2
 10733  	jne	.LBB1_479
 10734  	jmp	.LBB1_686	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10735  .LBB1_480:	# packed-double subtract of a broadcast scalar: [r8 + 8*i] = [rdx + 8*i] - xmm0[0], in chunks of 4 elements
 10736  	mov	ecx, eax
 10737  	and	ecx, -4	# rcx = eax rounded down to a multiple of 4 (eax is set before this excerpt; presumably the element count)
 10738  	movddup	xmm1, xmm0                      # xmm1 = xmm0[0,0]
 10739  	lea	rsi, [rcx - 4]
 10740  	mov	r9, rsi
 10741  	shr	r9, 2
 10742  	add	r9, 1	# r9 = (rcx - 4) / 4 + 1 = number of 4-element chunks
 10743  	test	rsi, rsi
 10744  	je	.LBB1_693	# exactly one chunk: bypass the paired loop (.LBB1_693 is outside this excerpt)
 10745  # %bb.481:
 10746  	mov	rsi, r9
 10747  	and	rsi, -2
 10748  	neg	rsi	# rsi = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10749  	xor	edi, edi	# rdi = element index
 10750  .LBB1_482:                              # =>This Inner Loop Header: Depth=1
 10751  	movupd	xmm2, xmmword ptr [rdx + 8*rdi]	# first chunk of the pair: two unaligned loads of 2 doubles each
 10752  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 10753  	subpd	xmm2, xmm1	# per-lane double-precision subtract
 10754  	subpd	xmm3, xmm1
 10755  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 10756  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 10757  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 32]	# second chunk of the pair
 10758  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 48]
 10759  	subpd	xmm2, xmm1
 10760  	subpd	xmm3, xmm1
 10761  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
 10762  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm3
 10763  	add	rdi, 8	# 8 elements (two chunks) per pass
 10764  	add	rsi, 2
 10765  	jne	.LBB1_482
 10766  	jmp	.LBB1_694	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10767  .LBB1_483:	# packed-double add of a broadcast scalar: [r8 + 8*i] = [rdx + 8*i] + xmm0[0], in chunks of 4 elements
 10768  	mov	ecx, eax
 10769  	and	ecx, -4	# rcx = eax rounded down to a multiple of 4 (eax is set before this excerpt; presumably the element count)
 10770  	movddup	xmm1, xmm0                      # xmm1 = xmm0[0,0]
 10771  	lea	rsi, [rcx - 4]
 10772  	mov	r9, rsi
 10773  	shr	r9, 2
 10774  	add	r9, 1	# r9 = (rcx - 4) / 4 + 1 = number of 4-element chunks
 10775  	test	rsi, rsi
 10776  	je	.LBB1_701	# exactly one chunk: bypass the paired loop (.LBB1_701 is outside this excerpt)
 10777  # %bb.484:
 10778  	mov	rsi, r9
 10779  	and	rsi, -2
 10780  	neg	rsi	# rsi = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10781  	xor	edi, edi	# rdi = element index
 10782  .LBB1_485:                              # =>This Inner Loop Header: Depth=1
 10783  	movupd	xmm2, xmmword ptr [rdx + 8*rdi]	# first chunk of the pair: two unaligned loads of 2 doubles each
 10784  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 10785  	addpd	xmm2, xmm1	# per-lane double-precision add
 10786  	addpd	xmm3, xmm1
 10787  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 10788  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 10789  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 32]	# second chunk of the pair
 10790  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 48]
 10791  	addpd	xmm2, xmm1
 10792  	addpd	xmm3, xmm1
 10793  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
 10794  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm3
 10795  	add	rdi, 8	# 8 elements (two chunks) per pass
 10796  	add	rsi, 2
 10797  	jne	.LBB1_485
 10798  	jmp	.LBB1_702	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10799  .LBB1_486:	# packed-double subtract of a broadcast scalar: [r8 + 8*i] = [rdx + 8*i] - xmm0[0], in chunks of 4 elements
 10800  	mov	ecx, eax
 10801  	and	ecx, -4	# rcx = eax rounded down to a multiple of 4 (eax is set before this excerpt; presumably the element count)
 10802  	movddup	xmm1, xmm0                      # xmm1 = xmm0[0,0]
 10803  	lea	rsi, [rcx - 4]
 10804  	mov	r9, rsi
 10805  	shr	r9, 2
 10806  	add	r9, 1	# r9 = (rcx - 4) / 4 + 1 = number of 4-element chunks
 10807  	test	rsi, rsi
 10808  	je	.LBB1_709	# exactly one chunk: bypass the paired loop (.LBB1_709 is outside this excerpt)
 10809  # %bb.487:
 10810  	mov	rsi, r9
 10811  	and	rsi, -2
 10812  	neg	rsi	# rsi = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10813  	xor	edi, edi	# rdi = element index
 10814  .LBB1_488:                              # =>This Inner Loop Header: Depth=1
 10815  	movupd	xmm2, xmmword ptr [rdx + 8*rdi]	# first chunk of the pair: two unaligned loads of 2 doubles each
 10816  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 10817  	subpd	xmm2, xmm1	# per-lane double-precision subtract
 10818  	subpd	xmm3, xmm1
 10819  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 10820  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 10821  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 32]	# second chunk of the pair
 10822  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 48]
 10823  	subpd	xmm2, xmm1
 10824  	subpd	xmm3, xmm1
 10825  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
 10826  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm3
 10827  	add	rdi, 8	# 8 elements (two chunks) per pass
 10828  	add	rsi, 2
 10829  	jne	.LBB1_488
 10830  	jmp	.LBB1_710	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10831  .LBB1_489:	# 8-bit multiply by a broadcast scalar: [r8 + i] = low byte of [rdx + i] * cl, in chunks of 32 bytes
 10832  	mov	edi, r10d
 10833  	and	edi, -32	# rdi = r10d rounded down to a multiple of 32 (r10d is set before this excerpt; presumably the element count)
 10834  	movzx	eax, cl
 10835  	movd	xmm0, eax
 10836  	pxor	xmm1, xmm1
 10837  	pshufb	xmm0, xmm1	# an all-zero shuffle mask selects byte 0 for every lane: xmm0 = cl in all 16 bytes
 10838  	lea	rax, [rdi - 32]
 10839  	mov	r9, rax
 10840  	shr	r9, 5
 10841  	add	r9, 1	# r9 = (rdi - 32) / 32 + 1 = number of 32-byte chunks
 10842  	pmovzxbw	xmm1, xmm0                      # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 10843  	test	rax, rax
 10844  	je	.LBB1_717	# exactly one chunk: bypass the paired loop (.LBB1_717 is outside this excerpt)
 10845  # %bb.490:
 10846  	mov	rsi, r9
 10847  	and	rsi, -2
 10848  	neg	rsi	# rsi = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10849  	xor	eax, eax	# rax = byte offset into both buffers
 10850  	movdqa	xmm2, xmm0	# xmm2 and xmm4 are built identically: the scalar byte doubled into each 16-bit word
 10851  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 10852  	movdqa	xmm3, xmmword ptr [rip + .LCPI1_0] # xmm3 = [255,255,255,255,255,255,255,255]
 10853  	movdqa	xmm4, xmm0
 10854  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 10855  .LBB1_491:                              # =>This Inner Loop Header: Depth=1
 10856  	movdqu	xmm5, xmmword ptr [rdx + rax]	# first chunk of the pair: 2 x 16 source bytes
 10857  	movdqu	xmm6, xmmword ptr [rdx + rax + 16]
 10858  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 10859  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 10860  	pmullw	xmm5, xmm2	# upper 8 bytes: both operands hold their byte twice per word (b * 0x0101); the low byte of the product still equals the low byte of a * b
 10861  	pand	xmm5, xmm3	# keep only the low byte of each 16-bit product (mask is 255 per word according to the compiler annotation above)
 10862  	pmullw	xmm7, xmm1	# lower 8 bytes: zero-extended element times zero-extended scalar
 10863  	pand	xmm7, xmm3
 10864  	packuswb	xmm7, xmm5	# repack 16 masked words into 16 bytes; every word is at most 255, so the saturation never changes a value
 10865  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 10866  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 10867  	pmullw	xmm6, xmm4	# same treatment for the second 16 bytes
 10868  	pand	xmm6, xmm3
 10869  	pmullw	xmm5, xmm1
 10870  	pand	xmm5, xmm3
 10871  	packuswb	xmm5, xmm6
 10872  	movdqu	xmmword ptr [r8 + rax], xmm7
 10873  	movdqu	xmmword ptr [r8 + rax + 16], xmm5
 10874  	movdqu	xmm5, xmmword ptr [rdx + rax + 32]	# second chunk of the pair
 10875  	movdqu	xmm6, xmmword ptr [rdx + rax + 48]
 10876  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 10877  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 10878  	pmullw	xmm5, xmm2
 10879  	pand	xmm5, xmm3
 10880  	pmullw	xmm7, xmm1
 10881  	pand	xmm7, xmm3
 10882  	packuswb	xmm7, xmm5
 10883  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 10884  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 10885  	pmullw	xmm6, xmm4
 10886  	pand	xmm6, xmm3
 10887  	pmullw	xmm5, xmm1
 10888  	pand	xmm5, xmm3
 10889  	packuswb	xmm5, xmm6
 10890  	movdqu	xmmword ptr [r8 + rax + 32], xmm7
 10891  	movdqu	xmmword ptr [r8 + rax + 48], xmm5
 10892  	add	rax, 64	# 64 bytes (two chunks) per pass
 10893  	add	rsi, 2
 10894  	jne	.LBB1_491
 10895  	jmp	.LBB1_718	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10896  .LBB1_492:	# 8-bit multiply by a broadcast scalar: [r8 + i] = low byte of [rdx + i] * cl, in chunks of 32 bytes
 10897  	mov	edi, r10d
 10898  	and	edi, -32	# rdi = r10d rounded down to a multiple of 32 (r10d is set before this excerpt; presumably the element count)
 10899  	movzx	eax, cl
 10900  	movd	xmm0, eax
 10901  	pxor	xmm1, xmm1
 10902  	pshufb	xmm0, xmm1	# an all-zero shuffle mask selects byte 0 for every lane: xmm0 = cl in all 16 bytes
 10903  	lea	rax, [rdi - 32]
 10904  	mov	r9, rax
 10905  	shr	r9, 5
 10906  	add	r9, 1	# r9 = (rdi - 32) / 32 + 1 = number of 32-byte chunks
 10907  	pmovzxbw	xmm1, xmm0                      # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 10908  	test	rax, rax
 10909  	je	.LBB1_725	# exactly one chunk: bypass the paired loop (.LBB1_725 is outside this excerpt)
 10910  # %bb.493:
 10911  	mov	rsi, r9
 10912  	and	rsi, -2
 10913  	neg	rsi	# rsi = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10914  	xor	eax, eax	# rax = byte offset into both buffers
 10915  	movdqa	xmm2, xmm0	# xmm2 and xmm4 are built identically: the scalar byte doubled into each 16-bit word
 10916  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 10917  	movdqa	xmm3, xmmword ptr [rip + .LCPI1_0] # xmm3 = [255,255,255,255,255,255,255,255]
 10918  	movdqa	xmm4, xmm0
 10919  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 10920  .LBB1_494:                              # =>This Inner Loop Header: Depth=1
 10921  	movdqu	xmm5, xmmword ptr [rdx + rax]	# first chunk of the pair: 2 x 16 source bytes
 10922  	movdqu	xmm6, xmmword ptr [rdx + rax + 16]
 10923  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 10924  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 10925  	pmullw	xmm5, xmm2	# upper 8 bytes: both operands hold their byte twice per word (b * 0x0101); the low byte of the product still equals the low byte of a * b
 10926  	pand	xmm5, xmm3	# keep only the low byte of each 16-bit product (mask is 255 per word according to the compiler annotation above)
 10927  	pmullw	xmm7, xmm1	# lower 8 bytes: zero-extended element times zero-extended scalar
 10928  	pand	xmm7, xmm3
 10929  	packuswb	xmm7, xmm5	# repack 16 masked words into 16 bytes; every word is at most 255, so the saturation never changes a value
 10930  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 10931  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 10932  	pmullw	xmm6, xmm4	# same treatment for the second 16 bytes
 10933  	pand	xmm6, xmm3
 10934  	pmullw	xmm5, xmm1
 10935  	pand	xmm5, xmm3
 10936  	packuswb	xmm5, xmm6
 10937  	movdqu	xmmword ptr [r8 + rax], xmm7
 10938  	movdqu	xmmword ptr [r8 + rax + 16], xmm5
 10939  	movdqu	xmm5, xmmword ptr [rdx + rax + 32]	# second chunk of the pair
 10940  	movdqu	xmm6, xmmword ptr [rdx + rax + 48]
 10941  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 10942  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 10943  	pmullw	xmm5, xmm2
 10944  	pand	xmm5, xmm3
 10945  	pmullw	xmm7, xmm1
 10946  	pand	xmm7, xmm3
 10947  	packuswb	xmm7, xmm5
 10948  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 10949  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 10950  	pmullw	xmm6, xmm4
 10951  	pand	xmm6, xmm3
 10952  	pmullw	xmm5, xmm1
 10953  	pand	xmm5, xmm3
 10954  	packuswb	xmm5, xmm6
 10955  	movdqu	xmmword ptr [r8 + rax + 32], xmm7
 10956  	movdqu	xmmword ptr [r8 + rax + 48], xmm5
 10957  	add	rax, 64	# 64 bytes (two chunks) per pass
 10958  	add	rsi, 2
 10959  	jne	.LBB1_494
 10960  	jmp	.LBB1_726	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10961  .LBB1_495:	# 8-bit add of a broadcast scalar: [r8 + i] = [rdx + i] + al, in chunks of 32 bytes
 10962  	mov	esi, r10d
 10963  	and	esi, -32	# rsi = r10d rounded down to a multiple of 32 (r10d is set before this excerpt; presumably the element count)
 10964  	movzx	ecx, al
 10965  	movd	xmm0, ecx
 10966  	pxor	xmm1, xmm1
 10967  	pshufb	xmm0, xmm1	# an all-zero shuffle mask selects byte 0 for every lane: xmm0 = al in all 16 bytes
 10968  	lea	rcx, [rsi - 32]
 10969  	mov	r9, rcx
 10970  	shr	r9, 5
 10971  	add	r9, 1	# r9 = (rsi - 32) / 32 + 1 = number of 32-byte chunks
 10972  	test	rcx, rcx
 10973  	je	.LBB1_733	# exactly one chunk: bypass the paired loop (.LBB1_733 is outside this excerpt)
 10974  # %bb.496:
 10975  	mov	rcx, r9
 10976  	and	rcx, -2
 10977  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 10978  	xor	edi, edi	# rdi = byte offset (one byte per element)
 10979  .LBB1_497:                              # =>This Inner Loop Header: Depth=1
 10980  	movdqu	xmm1, xmmword ptr [rdx + rdi]	# first chunk of the pair: two unaligned 16-byte loads
 10981  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 10982  	paddb	xmm1, xmm0	# per-byte wrapping add
 10983  	paddb	xmm2, xmm0
 10984  	movdqu	xmmword ptr [r8 + rdi], xmm1
 10985  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 10986  	movdqu	xmm1, xmmword ptr [rdx + rdi + 32]	# second chunk of the pair
 10987  	movdqu	xmm2, xmmword ptr [rdx + rdi + 48]
 10988  	paddb	xmm1, xmm0
 10989  	paddb	xmm2, xmm0
 10990  	movdqu	xmmword ptr [r8 + rdi + 32], xmm1
 10991  	movdqu	xmmword ptr [r8 + rdi + 48], xmm2
 10992  	add	rdi, 64	# 64 bytes (two chunks) per pass
 10993  	add	rcx, 2
 10994  	jne	.LBB1_497
 10995  	jmp	.LBB1_734	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 10996  .LBB1_498:	# 8-bit subtract of a broadcast scalar: [r8 + i] = [rdx + i] - al, in chunks of 32 bytes
 10997  	mov	esi, r10d
 10998  	and	esi, -32	# rsi = r10d rounded down to a multiple of 32 (r10d is set before this excerpt; presumably the element count)
 10999  	movzx	ecx, al
 11000  	movd	xmm0, ecx
 11001  	pxor	xmm1, xmm1
 11002  	pshufb	xmm0, xmm1	# an all-zero shuffle mask selects byte 0 for every lane: xmm0 = al in all 16 bytes
 11003  	lea	rcx, [rsi - 32]
 11004  	mov	r9, rcx
 11005  	shr	r9, 5
 11006  	add	r9, 1	# r9 = (rsi - 32) / 32 + 1 = number of 32-byte chunks
 11007  	test	rcx, rcx
 11008  	je	.LBB1_741	# exactly one chunk: bypass the paired loop (.LBB1_741 is outside this excerpt)
 11009  # %bb.499:
 11010  	mov	rcx, r9
 11011  	and	rcx, -2
 11012  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 11013  	xor	edi, edi	# rdi = byte offset (one byte per element)
 11014  .LBB1_500:                              # =>This Inner Loop Header: Depth=1
 11015  	movdqu	xmm1, xmmword ptr [rdx + rdi]	# first chunk of the pair: two unaligned 16-byte loads
 11016  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 11017  	psubb	xmm1, xmm0	# per-byte wrapping subtract
 11018  	psubb	xmm2, xmm0
 11019  	movdqu	xmmword ptr [r8 + rdi], xmm1
 11020  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 11021  	movdqu	xmm1, xmmword ptr [rdx + rdi + 32]	# second chunk of the pair
 11022  	movdqu	xmm2, xmmword ptr [rdx + rdi + 48]
 11023  	psubb	xmm1, xmm0
 11024  	psubb	xmm2, xmm0
 11025  	movdqu	xmmword ptr [r8 + rdi + 32], xmm1
 11026  	movdqu	xmmword ptr [r8 + rdi + 48], xmm2
 11027  	add	rdi, 64	# 64 bytes (two chunks) per pass
 11028  	add	rcx, 2
 11029  	jne	.LBB1_500
 11030  	jmp	.LBB1_742	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 11031  .LBB1_501:	# 8-bit add of a broadcast scalar: [r8 + i] = [rdx + i] + al, in chunks of 32 bytes
 11032  	mov	esi, r10d
 11033  	and	esi, -32	# rsi = r10d rounded down to a multiple of 32 (r10d is set before this excerpt; presumably the element count)
 11034  	movzx	ecx, al
 11035  	movd	xmm0, ecx
 11036  	pxor	xmm1, xmm1
 11037  	pshufb	xmm0, xmm1	# an all-zero shuffle mask selects byte 0 for every lane: xmm0 = al in all 16 bytes
 11038  	lea	rcx, [rsi - 32]
 11039  	mov	r9, rcx
 11040  	shr	r9, 5
 11041  	add	r9, 1	# r9 = (rsi - 32) / 32 + 1 = number of 32-byte chunks
 11042  	test	rcx, rcx
 11043  	je	.LBB1_749	# exactly one chunk: bypass the paired loop (.LBB1_749 is outside this excerpt)
 11044  # %bb.502:
 11045  	mov	rcx, r9
 11046  	and	rcx, -2
 11047  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 11048  	xor	edi, edi	# rdi = byte offset (one byte per element)
 11049  .LBB1_503:                              # =>This Inner Loop Header: Depth=1
 11050  	movdqu	xmm1, xmmword ptr [rdx + rdi]	# first chunk of the pair: two unaligned 16-byte loads
 11051  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 11052  	paddb	xmm1, xmm0	# per-byte wrapping add
 11053  	paddb	xmm2, xmm0
 11054  	movdqu	xmmword ptr [r8 + rdi], xmm1
 11055  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 11056  	movdqu	xmm1, xmmword ptr [rdx + rdi + 32]	# second chunk of the pair
 11057  	movdqu	xmm2, xmmword ptr [rdx + rdi + 48]
 11058  	paddb	xmm1, xmm0
 11059  	paddb	xmm2, xmm0
 11060  	movdqu	xmmword ptr [r8 + rdi + 32], xmm1
 11061  	movdqu	xmmword ptr [r8 + rdi + 48], xmm2
 11062  	add	rdi, 64	# 64 bytes (two chunks) per pass
 11063  	add	rcx, 2
 11064  	jne	.LBB1_503
 11065  	jmp	.LBB1_750	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 11066  .LBB1_504:	# 8-bit subtract of a broadcast scalar: [r8 + i] = [rdx + i] - al, in chunks of 32 bytes
 11067  	mov	esi, r10d
 11068  	and	esi, -32	# rsi = r10d rounded down to a multiple of 32 (r10d is set before this excerpt; presumably the element count)
 11069  	movzx	ecx, al
 11070  	movd	xmm0, ecx
 11071  	pxor	xmm1, xmm1
 11072  	pshufb	xmm0, xmm1	# an all-zero shuffle mask selects byte 0 for every lane: xmm0 = al in all 16 bytes
 11073  	lea	rcx, [rsi - 32]
 11074  	mov	r9, rcx
 11075  	shr	r9, 5
 11076  	add	r9, 1	# r9 = (rsi - 32) / 32 + 1 = number of 32-byte chunks
 11077  	test	rcx, rcx
 11078  	je	.LBB1_757	# exactly one chunk: bypass the paired loop (.LBB1_757 is outside this excerpt)
 11079  # %bb.505:
 11080  	mov	rcx, r9
 11081  	and	rcx, -2
 11082  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 11083  	xor	edi, edi	# rdi = byte offset (one byte per element)
 11084  .LBB1_506:                              # =>This Inner Loop Header: Depth=1
 11085  	movdqu	xmm1, xmmword ptr [rdx + rdi]	# first chunk of the pair: two unaligned 16-byte loads
 11086  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 11087  	psubb	xmm1, xmm0	# per-byte wrapping subtract
 11088  	psubb	xmm2, xmm0
 11089  	movdqu	xmmword ptr [r8 + rdi], xmm1
 11090  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 11091  	movdqu	xmm1, xmmword ptr [rdx + rdi + 32]	# second chunk of the pair
 11092  	movdqu	xmm2, xmmword ptr [rdx + rdi + 48]
 11093  	psubb	xmm1, xmm0
 11094  	psubb	xmm2, xmm0
 11095  	movdqu	xmmword ptr [r8 + rdi + 32], xmm1
 11096  	movdqu	xmmword ptr [r8 + rdi + 48], xmm2
 11097  	add	rdi, 64	# 64 bytes (two chunks) per pass
 11098  	add	rcx, 2
 11099  	jne	.LBB1_506
 11100  	jmp	.LBB1_758	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 11101  .LBB1_507:	# 64-bit add of a broadcast scalar: [r8 + 8*i] = [rdx + 8*i] + rax, in chunks of 4 elements
 11102  	mov	esi, r10d
 11103  	and	esi, -4	# rsi = r10d rounded down to a multiple of 4 (r10d is set before this excerpt; presumably the element count)
 11104  	movq	xmm0, rax	# scalar into the low quadword; the pshufd below copies it to both quadwords
 11105  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 11106  	lea	rcx, [rsi - 4]
 11107  	mov	r9, rcx
 11108  	shr	r9, 2
 11109  	add	r9, 1	# r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
 11110  	test	rcx, rcx
 11111  	je	.LBB1_765	# exactly one chunk: bypass the paired loop (.LBB1_765 is outside this excerpt)
 11112  # %bb.508:
 11113  	mov	rcx, r9
 11114  	and	rcx, -2
 11115  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 11116  	xor	edi, edi	# rdi = element index
 11117  .LBB1_509:                              # =>This Inner Loop Header: Depth=1
 11118  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]	# first chunk of the pair: two unaligned 16-byte loads
 11119  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 11120  	paddq	xmm1, xmm0	# per-lane 64-bit wrapping add
 11121  	paddq	xmm2, xmm0
 11122  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 11123  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 11124  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 32]	# second chunk of the pair
 11125  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 11126  	paddq	xmm1, xmm0
 11127  	paddq	xmm2, xmm0
 11128  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm1
 11129  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm2
 11130  	add	rdi, 8	# 8 elements (two chunks) per pass
 11131  	add	rcx, 2
 11132  	jne	.LBB1_509
 11133  	jmp	.LBB1_766	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 11134  .LBB1_510:	# 64-bit subtract of a broadcast scalar: [r8 + 8*i] = [rdx + 8*i] - rax, in chunks of 4 elements
 11135  	mov	esi, r10d
 11136  	and	esi, -4	# rsi = r10d rounded down to a multiple of 4 (r10d is set before this excerpt; presumably the element count)
 11137  	movq	xmm0, rax	# scalar into the low quadword; the pshufd below copies it to both quadwords
 11138  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 11139  	lea	rcx, [rsi - 4]
 11140  	mov	r9, rcx
 11141  	shr	r9, 2
 11142  	add	r9, 1	# r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
 11143  	test	rcx, rcx
 11144  	je	.LBB1_773	# exactly one chunk: bypass the paired loop (.LBB1_773 is outside this excerpt)
 11145  # %bb.511:
 11146  	mov	rcx, r9
 11147  	and	rcx, -2
 11148  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 11149  	xor	edi, edi	# rdi = element index
 11150  .LBB1_512:                              # =>This Inner Loop Header: Depth=1
 11151  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]	# first chunk of the pair: two unaligned 16-byte loads
 11152  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 11153  	psubq	xmm1, xmm0	# per-lane 64-bit wrapping subtract
 11154  	psubq	xmm2, xmm0
 11155  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 11156  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 11157  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 32]	# second chunk of the pair
 11158  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 11159  	psubq	xmm1, xmm0
 11160  	psubq	xmm2, xmm0
 11161  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm1
 11162  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm2
 11163  	add	rdi, 8	# 8 elements (two chunks) per pass
 11164  	add	rcx, 2
 11165  	jne	.LBB1_512
 11166  	jmp	.LBB1_774	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 11167  .LBB1_513:	# 64-bit add of a broadcast scalar: [r8 + 8*i] = [rdx + 8*i] + rax, in chunks of 4 elements
 11168  	mov	esi, r10d
 11169  	and	esi, -4	# rsi = r10d rounded down to a multiple of 4 (r10d is set before this excerpt; presumably the element count)
 11170  	movq	xmm0, rax	# scalar into the low quadword; the pshufd below copies it to both quadwords
 11171  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 11172  	lea	rcx, [rsi - 4]
 11173  	mov	r9, rcx
 11174  	shr	r9, 2
 11175  	add	r9, 1	# r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
 11176  	test	rcx, rcx
 11177  	je	.LBB1_781	# exactly one chunk: bypass the paired loop (.LBB1_781 is outside this excerpt)
 11178  # %bb.514:
 11179  	mov	rcx, r9
 11180  	and	rcx, -2
 11181  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 11182  	xor	edi, edi	# rdi = element index
 11183  .LBB1_515:                              # =>This Inner Loop Header: Depth=1
 11184  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]	# first chunk of the pair: two unaligned 16-byte loads
 11185  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 11186  	paddq	xmm1, xmm0	# per-lane 64-bit wrapping add
 11187  	paddq	xmm2, xmm0
 11188  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 11189  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 11190  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 32]	# second chunk of the pair
 11191  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 11192  	paddq	xmm1, xmm0
 11193  	paddq	xmm2, xmm0
 11194  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm1
 11195  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm2
 11196  	add	rdi, 8	# 8 elements (two chunks) per pass
 11197  	add	rcx, 2
 11198  	jne	.LBB1_515
 11199  	jmp	.LBB1_782	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 11200  .LBB1_516:	# 64-bit subtract of a broadcast scalar: [r8 + 8*i] = [rdx + 8*i] - rax, in chunks of 4 elements
 11201  	mov	esi, r10d
 11202  	and	esi, -4	# rsi = r10d rounded down to a multiple of 4 (r10d is set before this excerpt; presumably the element count)
 11203  	movq	xmm0, rax	# scalar into the low quadword; the pshufd below copies it to both quadwords
 11204  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 11205  	lea	rcx, [rsi - 4]
 11206  	mov	r9, rcx
 11207  	shr	r9, 2
 11208  	add	r9, 1	# r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
 11209  	test	rcx, rcx
 11210  	je	.LBB1_789	# exactly one chunk: bypass the paired loop (.LBB1_789 is outside this excerpt)
 11211  # %bb.517:
 11212  	mov	rcx, r9
 11213  	and	rcx, -2
 11214  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 11215  	xor	edi, edi	# rdi = element index
 11216  .LBB1_518:                              # =>This Inner Loop Header: Depth=1
 11217  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]	# first chunk of the pair: two unaligned 16-byte loads
 11218  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 11219  	psubq	xmm1, xmm0	# per-lane 64-bit wrapping subtract
 11220  	psubq	xmm2, xmm0
 11221  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 11222  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 11223  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 32]	# second chunk of the pair
 11224  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 11225  	psubq	xmm1, xmm0
 11226  	psubq	xmm2, xmm0
 11227  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm1
 11228  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm2
 11229  	add	rdi, 8	# 8 elements (two chunks) per pass
 11230  	add	rcx, 2
 11231  	jne	.LBB1_518
 11232  	jmp	.LBB1_790	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 11233  .LBB1_519:	# 16-bit multiply by a broadcast scalar: [r8 + 2*i] = low 16 bits of [rdx + 2*i] * ax, in chunks of 16 elements
 11234  	mov	esi, r10d
 11235  	and	esi, -16	# rsi = r10d rounded down to a multiple of 16 (r10d is set before this excerpt; presumably the element count)
 11236  	movd	xmm0, eax	# the two shuffles below copy the low word of eax into all eight 16-bit lanes
 11237  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 11238  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 11239  	lea	rcx, [rsi - 16]
 11240  	mov	r9, rcx
 11241  	shr	r9, 4
 11242  	add	r9, 1	# r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
 11243  	test	rcx, rcx
 11244  	je	.LBB1_797	# exactly one chunk: bypass the paired loop (.LBB1_797 is outside this excerpt)
 11245  # %bb.520:
 11246  	mov	rcx, r9
 11247  	and	rcx, -2
 11248  	neg	rcx	# rcx = -(r9 rounded down to even); the loop adds 2 per pass until it reaches zero
 11249  	xor	edi, edi	# rdi = element index
 11250  .LBB1_521:                              # =>This Inner Loop Header: Depth=1
 11251  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]	# first chunk of the pair: two unaligned 16-byte loads
 11252  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 11253  	pmullw	xmm1, xmm0	# per-lane 16-bit multiply keeping the low 16 bits
 11254  	pmullw	xmm2, xmm0
 11255  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 11256  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 11257  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 32]	# second chunk of the pair
 11258  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 48]
 11259  	pmullw	xmm1, xmm0
 11260  	pmullw	xmm2, xmm0
 11261  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 11262  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 11263  	add	rdi, 32	# 32 elements (two chunks) per pass
 11264  	add	rcx, 2
 11265  	jne	.LBB1_521
 11266  	jmp	.LBB1_798	# continues outside this excerpt (presumably the leftover chunk and the scalar tail)
 11267  .LBB1_522:                              # Vector path: [r8 + 2*i] = [rdx + 2*i] * ax on 16-bit lanes (low 16 bits kept), 32 elements per loop pass
 11268  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11269  	and	esi, -16                        # rounded down to a whole number of 16-element groups
 11270  	movd	xmm0, eax                       # low dword = eax (ax presumably holds the scalar operand; confirm upstream)
 11271  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 11272  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 11273  	lea	rcx, [rsi - 16]                 # zero when exactly one group exists
 11274  	mov	r9, rcx
 11275  	shr	r9, 4
 11276  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1, the number of 16-element groups
 11277  	test	rcx, rcx
 11278  	je	.LBB1_805                       # one group only: bypass the loop (target outside this excerpt)
 11279  # %bb.523:
 11280  	mov	rcx, r9
 11281  	and	rcx, -2                         # even part of the group count
 11282  	neg	rcx                             # negative counter, stepped up to zero below
 11283  	xor	edi, edi                        # rdi = element index
 11284  .LBB1_524:                              # =>This Inner Loop Header: Depth=1
 11285  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 11286  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 11287  	pmullw	xmm1, xmm0                      # per-lane 16-bit multiply by the broadcast value, low half of each product
 11288  	pmullw	xmm2, xmm0
 11289  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 11290  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 11291  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 32]
 11292  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 48]
 11293  	pmullw	xmm1, xmm0
 11294  	pmullw	xmm2, xmm0
 11295  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 11296  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 11297  	add	rdi, 32                         # 32 elements handled per pass
 11298  	add	rcx, 2                          # two groups done; ZF set when none remain
 11299  	jne	.LBB1_524
 11300  	jmp	.LBB1_806                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11301  .LBB1_525:                              # Vector path: [r8 + 2*i] = [rdx + 2*i] * ax on 16-bit lanes (low 16 bits kept), 32 elements per loop pass
 11302  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11303  	and	esi, -16                        # rounded down to a whole number of 16-element groups
 11304  	movd	xmm0, eax                       # low dword = eax (ax presumably holds the scalar operand; confirm upstream)
 11305  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 11306  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 11307  	lea	rcx, [rsi - 16]                 # zero when exactly one group exists
 11308  	mov	r9, rcx
 11309  	shr	r9, 4
 11310  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1, the number of 16-element groups
 11311  	test	rcx, rcx
 11312  	je	.LBB1_813                       # one group only: bypass the loop (target outside this excerpt)
 11313  # %bb.526:
 11314  	mov	rcx, r9
 11315  	and	rcx, -2                         # even part of the group count
 11316  	neg	rcx                             # negative counter, stepped up to zero below
 11317  	xor	edi, edi                        # rdi = element index
 11318  .LBB1_527:                              # =>This Inner Loop Header: Depth=1
 11319  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 11320  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 11321  	pmullw	xmm1, xmm0                      # per-lane 16-bit multiply by the broadcast value, low half of each product
 11322  	pmullw	xmm2, xmm0
 11323  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 11324  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 11325  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 32]
 11326  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 48]
 11327  	pmullw	xmm1, xmm0
 11328  	pmullw	xmm2, xmm0
 11329  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 11330  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 11331  	add	rdi, 32                         # 32 elements handled per pass
 11332  	add	rcx, 2                          # two groups done; ZF set when none remain
 11333  	jne	.LBB1_527
 11334  	jmp	.LBB1_814                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11335  .LBB1_528:                              # Vector path: [r8 + 2*i] = [rdx + 2*i] * ax on 16-bit lanes (low 16 bits kept), 32 elements per loop pass
 11336  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11337  	and	esi, -16                        # rounded down to a whole number of 16-element groups
 11338  	movd	xmm0, eax                       # low dword = eax (ax presumably holds the scalar operand; confirm upstream)
 11339  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 11340  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 11341  	lea	rcx, [rsi - 16]                 # zero when exactly one group exists
 11342  	mov	r9, rcx
 11343  	shr	r9, 4
 11344  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1, the number of 16-element groups
 11345  	test	rcx, rcx
 11346  	je	.LBB1_821                       # one group only: bypass the loop (target outside this excerpt)
 11347  # %bb.529:
 11348  	mov	rcx, r9
 11349  	and	rcx, -2                         # even part of the group count
 11350  	neg	rcx                             # negative counter, stepped up to zero below
 11351  	xor	edi, edi                        # rdi = element index
 11352  .LBB1_530:                              # =>This Inner Loop Header: Depth=1
 11353  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 11354  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 11355  	pmullw	xmm1, xmm0                      # per-lane 16-bit multiply by the broadcast value, low half of each product
 11356  	pmullw	xmm2, xmm0
 11357  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 11358  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 11359  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 32]
 11360  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 48]
 11361  	pmullw	xmm1, xmm0
 11362  	pmullw	xmm2, xmm0
 11363  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 11364  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 11365  	add	rdi, 32                         # 32 elements handled per pass
 11366  	add	rcx, 2                          # two groups done; ZF set when none remain
 11367  	jne	.LBB1_530
 11368  	jmp	.LBB1_822                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11369  .LBB1_531:                              # Vector path: [r8 + 2*i] = [rdx + 2*i] + ax on 16-bit lanes, 32 elements per loop pass
 11370  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11371  	and	esi, -16                        # rounded down to a whole number of 16-element groups
 11372  	movd	xmm0, eax                       # low dword = eax (ax presumably holds the scalar operand; confirm upstream)
 11373  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 11374  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 11375  	lea	rcx, [rsi - 16]                 # zero when exactly one group exists
 11376  	mov	r9, rcx
 11377  	shr	r9, 4
 11378  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1, the number of 16-element groups
 11379  	test	rcx, rcx
 11380  	je	.LBB1_829                       # one group only: bypass the loop (target outside this excerpt)
 11381  # %bb.532:
 11382  	mov	rcx, r9
 11383  	and	rcx, -2                         # even part of the group count
 11384  	neg	rcx                             # negative counter, stepped up to zero below
 11385  	xor	edi, edi                        # rdi = element index
 11386  .LBB1_533:                              # =>This Inner Loop Header: Depth=1
 11387  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 11388  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 11389  	paddw	xmm1, xmm0                      # per-lane 16-bit add of the broadcast value
 11390  	paddw	xmm2, xmm0
 11391  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 11392  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 11393  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 32]
 11394  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 48]
 11395  	paddw	xmm1, xmm0
 11396  	paddw	xmm2, xmm0
 11397  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 11398  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 11399  	add	rdi, 32                         # 32 elements handled per pass
 11400  	add	rcx, 2                          # two groups done; ZF set when none remain
 11401  	jne	.LBB1_533
 11402  	jmp	.LBB1_830                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11403  .LBB1_534:                              # Vector path: [r8 + 2*i] = [rdx + 2*i] + ax on 16-bit lanes, 32 elements per loop pass
 11404  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11405  	and	esi, -16                        # rounded down to a whole number of 16-element groups
 11406  	movd	xmm0, eax                       # low dword = eax (ax presumably holds the scalar operand; confirm upstream)
 11407  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 11408  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 11409  	lea	rcx, [rsi - 16]                 # zero when exactly one group exists
 11410  	mov	r9, rcx
 11411  	shr	r9, 4
 11412  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1, the number of 16-element groups
 11413  	test	rcx, rcx
 11414  	je	.LBB1_837                       # one group only: bypass the loop (target outside this excerpt)
 11415  # %bb.535:
 11416  	mov	rcx, r9
 11417  	and	rcx, -2                         # even part of the group count
 11418  	neg	rcx                             # negative counter, stepped up to zero below
 11419  	xor	edi, edi                        # rdi = element index
 11420  .LBB1_536:                              # =>This Inner Loop Header: Depth=1
 11421  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 11422  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 11423  	paddw	xmm1, xmm0                      # per-lane 16-bit add of the broadcast value
 11424  	paddw	xmm2, xmm0
 11425  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 11426  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 11427  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 32]
 11428  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 48]
 11429  	paddw	xmm1, xmm0
 11430  	paddw	xmm2, xmm0
 11431  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 11432  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 11433  	add	rdi, 32                         # 32 elements handled per pass
 11434  	add	rcx, 2                          # two groups done; ZF set when none remain
 11435  	jne	.LBB1_536
 11436  	jmp	.LBB1_838                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11437  .LBB1_537:                              # Vector path: [r8 + 2*i] = [rdx + 2*i] - ax on 16-bit lanes, 32 elements per loop pass
 11438  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11439  	and	esi, -16                        # rounded down to a whole number of 16-element groups
 11440  	movd	xmm0, eax                       # low dword = eax (ax presumably holds the scalar operand; confirm upstream)
 11441  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 11442  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 11443  	lea	rcx, [rsi - 16]                 # zero when exactly one group exists
 11444  	mov	r9, rcx
 11445  	shr	r9, 4
 11446  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1, the number of 16-element groups
 11447  	test	rcx, rcx
 11448  	je	.LBB1_845                       # one group only: bypass the loop (target outside this excerpt)
 11449  # %bb.538:
 11450  	mov	rcx, r9
 11451  	and	rcx, -2                         # even part of the group count
 11452  	neg	rcx                             # negative counter, stepped up to zero below
 11453  	xor	edi, edi                        # rdi = element index
 11454  .LBB1_539:                              # =>This Inner Loop Header: Depth=1
 11455  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 11456  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 11457  	psubw	xmm1, xmm0                      # per-lane 16-bit subtract of the broadcast value
 11458  	psubw	xmm2, xmm0
 11459  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 11460  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 11461  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 32]
 11462  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 48]
 11463  	psubw	xmm1, xmm0
 11464  	psubw	xmm2, xmm0
 11465  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 11466  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 11467  	add	rdi, 32                         # 32 elements handled per pass
 11468  	add	rcx, 2                          # two groups done; ZF set when none remain
 11469  	jne	.LBB1_539
 11470  	jmp	.LBB1_846                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11471  .LBB1_540:                              # Vector path: [r8 + 2*i] = [rdx + 2*i] - ax on 16-bit lanes, 32 elements per loop pass
 11472  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11473  	and	esi, -16                        # rounded down to a whole number of 16-element groups
 11474  	movd	xmm0, eax                       # low dword = eax (ax presumably holds the scalar operand; confirm upstream)
 11475  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 11476  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 11477  	lea	rcx, [rsi - 16]                 # zero when exactly one group exists
 11478  	mov	r9, rcx
 11479  	shr	r9, 4
 11480  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1, the number of 16-element groups
 11481  	test	rcx, rcx
 11482  	je	.LBB1_853                       # one group only: bypass the loop (target outside this excerpt)
 11483  # %bb.541:
 11484  	mov	rcx, r9
 11485  	and	rcx, -2                         # even part of the group count
 11486  	neg	rcx                             # negative counter, stepped up to zero below
 11487  	xor	edi, edi                        # rdi = element index
 11488  .LBB1_542:                              # =>This Inner Loop Header: Depth=1
 11489  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 11490  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 11491  	psubw	xmm1, xmm0                      # per-lane 16-bit subtract of the broadcast value
 11492  	psubw	xmm2, xmm0
 11493  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 11494  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 11495  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 32]
 11496  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 48]
 11497  	psubw	xmm1, xmm0
 11498  	psubw	xmm2, xmm0
 11499  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 11500  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 11501  	add	rdi, 32                         # 32 elements handled per pass
 11502  	add	rcx, 2                          # two groups done; ZF set when none remain
 11503  	jne	.LBB1_542
 11504  	jmp	.LBB1_854                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11505  .LBB1_543:                              # Vector path: [r8 + 2*i] = [rdx + 2*i] + ax on 16-bit lanes, 32 elements per loop pass
 11506  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11507  	and	esi, -16                        # rounded down to a whole number of 16-element groups
 11508  	movd	xmm0, eax                       # low dword = eax (ax presumably holds the scalar operand; confirm upstream)
 11509  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 11510  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 11511  	lea	rcx, [rsi - 16]                 # zero when exactly one group exists
 11512  	mov	r9, rcx
 11513  	shr	r9, 4
 11514  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1, the number of 16-element groups
 11515  	test	rcx, rcx
 11516  	je	.LBB1_861                       # one group only: bypass the loop (target outside this excerpt)
 11517  # %bb.544:
 11518  	mov	rcx, r9
 11519  	and	rcx, -2                         # even part of the group count
 11520  	neg	rcx                             # negative counter, stepped up to zero below
 11521  	xor	edi, edi                        # rdi = element index
 11522  .LBB1_545:                              # =>This Inner Loop Header: Depth=1
 11523  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 11524  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 11525  	paddw	xmm1, xmm0                      # per-lane 16-bit add of the broadcast value
 11526  	paddw	xmm2, xmm0
 11527  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 11528  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 11529  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 32]
 11530  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 48]
 11531  	paddw	xmm1, xmm0
 11532  	paddw	xmm2, xmm0
 11533  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 11534  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 11535  	add	rdi, 32                         # 32 elements handled per pass
 11536  	add	rcx, 2                          # two groups done; ZF set when none remain
 11537  	jne	.LBB1_545
 11538  	jmp	.LBB1_862                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11539  .LBB1_546:                              # Vector path: [r8 + 2*i] = [rdx + 2*i] + ax on 16-bit lanes, 32 elements per loop pass
 11540  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11541  	and	esi, -16                        # rounded down to a whole number of 16-element groups
 11542  	movd	xmm0, eax                       # low dword = eax (ax presumably holds the scalar operand; confirm upstream)
 11543  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 11544  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 11545  	lea	rcx, [rsi - 16]                 # zero when exactly one group exists
 11546  	mov	r9, rcx
 11547  	shr	r9, 4
 11548  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1, the number of 16-element groups
 11549  	test	rcx, rcx
 11550  	je	.LBB1_869                       # one group only: bypass the loop (target outside this excerpt)
 11551  # %bb.547:
 11552  	mov	rcx, r9
 11553  	and	rcx, -2                         # even part of the group count
 11554  	neg	rcx                             # negative counter, stepped up to zero below
 11555  	xor	edi, edi                        # rdi = element index
 11556  .LBB1_548:                              # =>This Inner Loop Header: Depth=1
 11557  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 11558  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 11559  	paddw	xmm1, xmm0                      # per-lane 16-bit add of the broadcast value
 11560  	paddw	xmm2, xmm0
 11561  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 11562  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 11563  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 32]
 11564  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 48]
 11565  	paddw	xmm1, xmm0
 11566  	paddw	xmm2, xmm0
 11567  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 11568  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 11569  	add	rdi, 32                         # 32 elements handled per pass
 11570  	add	rcx, 2                          # two groups done; ZF set when none remain
 11571  	jne	.LBB1_548
 11572  	jmp	.LBB1_870                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11573  .LBB1_549:                              # Vector path: [r8 + 2*i] = [rdx + 2*i] - ax on 16-bit lanes, 32 elements per loop pass
 11574  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11575  	and	esi, -16                        # rounded down to a whole number of 16-element groups
 11576  	movd	xmm0, eax                       # low dword = eax (ax presumably holds the scalar operand; confirm upstream)
 11577  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 11578  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 11579  	lea	rcx, [rsi - 16]                 # zero when exactly one group exists
 11580  	mov	r9, rcx
 11581  	shr	r9, 4
 11582  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1, the number of 16-element groups
 11583  	test	rcx, rcx
 11584  	je	.LBB1_877                       # one group only: bypass the loop (target outside this excerpt)
 11585  # %bb.550:
 11586  	mov	rcx, r9
 11587  	and	rcx, -2                         # even part of the group count
 11588  	neg	rcx                             # negative counter, stepped up to zero below
 11589  	xor	edi, edi                        # rdi = element index
 11590  .LBB1_551:                              # =>This Inner Loop Header: Depth=1
 11591  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 11592  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 11593  	psubw	xmm1, xmm0                      # per-lane 16-bit subtract of the broadcast value
 11594  	psubw	xmm2, xmm0
 11595  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 11596  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 11597  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 32]
 11598  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 48]
 11599  	psubw	xmm1, xmm0
 11600  	psubw	xmm2, xmm0
 11601  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 11602  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 11603  	add	rdi, 32                         # 32 elements handled per pass
 11604  	add	rcx, 2                          # two groups done; ZF set when none remain
 11605  	jne	.LBB1_551
 11606  	jmp	.LBB1_878                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11607  .LBB1_552:                              # Vector path: [r8 + 2*i] = [rdx + 2*i] - ax on 16-bit lanes, 32 elements per loop pass
 11608  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11609  	and	esi, -16                        # rounded down to a whole number of 16-element groups
 11610  	movd	xmm0, eax                       # low dword = eax (ax presumably holds the scalar operand; confirm upstream)
 11611  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 11612  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 11613  	lea	rcx, [rsi - 16]                 # zero when exactly one group exists
 11614  	mov	r9, rcx
 11615  	shr	r9, 4
 11616  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1, the number of 16-element groups
 11617  	test	rcx, rcx
 11618  	je	.LBB1_885                       # one group only: bypass the loop (target outside this excerpt)
 11619  # %bb.553:
 11620  	mov	rcx, r9
 11621  	and	rcx, -2                         # even part of the group count
 11622  	neg	rcx                             # negative counter, stepped up to zero below
 11623  	xor	edi, edi                        # rdi = element index
 11624  .LBB1_554:                              # =>This Inner Loop Header: Depth=1
 11625  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 11626  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 11627  	psubw	xmm1, xmm0                      # per-lane 16-bit subtract of the broadcast value
 11628  	psubw	xmm2, xmm0
 11629  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 11630  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 11631  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 32]
 11632  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 48]
 11633  	psubw	xmm1, xmm0
 11634  	psubw	xmm2, xmm0
 11635  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 11636  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 11637  	add	rdi, 32                         # 32 elements handled per pass
 11638  	add	rcx, 2                          # two groups done; ZF set when none remain
 11639  	jne	.LBB1_554
 11640  	jmp	.LBB1_886                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11641  .LBB1_555:                              # Vector path: [r8 + 4*i] = [rdx + 4*i] * xmm0[0] on float32 lanes, 16 elements per loop pass
 11642  	mov	ecx, eax                        # rcx = eax zero-extended (presumably the element count; confirm upstream)
 11643  	and	ecx, -8                         # rounded down to a whole number of 8-element groups
 11644  	movaps	xmm1, xmm0                      # lane 0 of xmm0 presumably holds the scalar operand; confirm upstream
 11645  	shufps	xmm1, xmm0, 0                   # xmm1 = xmm1[0,0],xmm0[0,0]
 11646  	lea	rsi, [rcx - 8]                  # zero when exactly one group exists
 11647  	mov	r9, rsi
 11648  	shr	r9, 3
 11649  	add	r9, 1                           # r9 = (rcx - 8) / 8 + 1, the number of 8-element groups
 11650  	test	rsi, rsi
 11651  	je	.LBB1_893                       # one group only: bypass the loop (target outside this excerpt)
 11652  # %bb.556:
 11653  	mov	rsi, r9
 11654  	and	rsi, -2                         # even part of the group count
 11655  	neg	rsi                             # negative counter, stepped up to zero below
 11656  	xor	edi, edi                        # rdi = element index
 11657  .LBB1_557:                              # =>This Inner Loop Header: Depth=1
 11658  	movups	xmm2, xmmword ptr [rdx + 4*rdi]
 11659  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 11660  	mulps	xmm2, xmm1                      # per-lane single-precision multiply by the broadcast value
 11661  	mulps	xmm3, xmm1
 11662  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 11663  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 11664  	movups	xmm2, xmmword ptr [rdx + 4*rdi + 32]
 11665  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 48]
 11666  	mulps	xmm2, xmm1
 11667  	mulps	xmm3, xmm1
 11668  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
 11669  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm3
 11670  	add	rdi, 16                         # 16 elements handled per pass
 11671  	add	rsi, 2                          # two groups done; ZF set when none remain
 11672  	jne	.LBB1_557
 11673  	jmp	.LBB1_894                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11674  .LBB1_558:                              # Vector path: [r8 + 4*i] = [rdx + 4*i] * xmm0[0] on float32 lanes, 16 elements per loop pass
 11675  	mov	ecx, eax                        # rcx = eax zero-extended (presumably the element count; confirm upstream)
 11676  	and	ecx, -8                         # rounded down to a whole number of 8-element groups
 11677  	movaps	xmm1, xmm0                      # lane 0 of xmm0 presumably holds the scalar operand; confirm upstream
 11678  	shufps	xmm1, xmm0, 0                   # xmm1 = xmm1[0,0],xmm0[0,0]
 11679  	lea	rsi, [rcx - 8]                  # zero when exactly one group exists
 11680  	mov	r9, rsi
 11681  	shr	r9, 3
 11682  	add	r9, 1                           # r9 = (rcx - 8) / 8 + 1, the number of 8-element groups
 11683  	test	rsi, rsi
 11684  	je	.LBB1_901                       # one group only: bypass the loop (target outside this excerpt)
 11685  # %bb.559:
 11686  	mov	rsi, r9
 11687  	and	rsi, -2                         # even part of the group count
 11688  	neg	rsi                             # negative counter, stepped up to zero below
 11689  	xor	edi, edi                        # rdi = element index
 11690  .LBB1_560:                              # =>This Inner Loop Header: Depth=1
 11691  	movups	xmm2, xmmword ptr [rdx + 4*rdi]
 11692  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 11693  	mulps	xmm2, xmm1                      # per-lane single-precision multiply by the broadcast value
 11694  	mulps	xmm3, xmm1
 11695  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 11696  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 11697  	movups	xmm2, xmmword ptr [rdx + 4*rdi + 32]
 11698  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 48]
 11699  	mulps	xmm2, xmm1
 11700  	mulps	xmm3, xmm1
 11701  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
 11702  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm3
 11703  	add	rdi, 16                         # 16 elements handled per pass
 11704  	add	rsi, 2                          # two groups done; ZF set when none remain
 11705  	jne	.LBB1_560
 11706  	jmp	.LBB1_902                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11707  .LBB1_561:                              # Vector path: [r8 + 8*i] = [rdx + 8*i] + rax on 64-bit lanes, 8 elements per loop pass
 11708  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11709  	and	esi, -4                         # rounded down to a whole number of 4-element groups
 11710  	movq	xmm0, rax                       # low qword = rax (presumably the scalar operand; confirm upstream)
 11711  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 11712  	lea	rcx, [rsi - 4]                  # zero when exactly one group exists
 11713  	mov	r9, rcx
 11714  	shr	r9, 2
 11715  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1, the number of 4-element groups
 11716  	test	rcx, rcx
 11717  	je	.LBB1_909                       # one group only: bypass the loop (target outside this excerpt)
 11718  # %bb.562:
 11719  	mov	rcx, r9
 11720  	and	rcx, -2                         # even part of the group count
 11721  	neg	rcx                             # negative counter, stepped up to zero below
 11722  	xor	edi, edi                        # rdi = element index
 11723  .LBB1_563:                              # =>This Inner Loop Header: Depth=1
 11724  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 11725  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 11726  	paddq	xmm1, xmm0                      # per-lane 64-bit add of the broadcast value
 11727  	paddq	xmm2, xmm0
 11728  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 11729  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 11730  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 32]
 11731  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 11732  	paddq	xmm1, xmm0
 11733  	paddq	xmm2, xmm0
 11734  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm1
 11735  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm2
 11736  	add	rdi, 8                          # 8 elements handled per pass
 11737  	add	rcx, 2                          # two groups done; ZF set when none remain
 11738  	jne	.LBB1_563
 11739  	jmp	.LBB1_910                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11740  .LBB1_564:                              # Vector path: [r8 + 4*i] = [rdx + 4*i] + xmm0[0] on float32 lanes, 16 elements per loop pass
 11741  	mov	ecx, eax                        # rcx = eax zero-extended (presumably the element count; confirm upstream)
 11742  	and	ecx, -8                         # rounded down to a whole number of 8-element groups
 11743  	movaps	xmm1, xmm0                      # lane 0 of xmm0 presumably holds the scalar operand; confirm upstream
 11744  	shufps	xmm1, xmm0, 0                   # xmm1 = xmm1[0,0],xmm0[0,0]
 11745  	lea	rsi, [rcx - 8]                  # zero when exactly one group exists
 11746  	mov	r9, rsi
 11747  	shr	r9, 3
 11748  	add	r9, 1                           # r9 = (rcx - 8) / 8 + 1, the number of 8-element groups
 11749  	test	rsi, rsi
 11750  	je	.LBB1_917                       # one group only: bypass the loop (target outside this excerpt)
 11751  # %bb.565:
 11752  	mov	rsi, r9
 11753  	and	rsi, -2                         # even part of the group count
 11754  	neg	rsi                             # negative counter, stepped up to zero below
 11755  	xor	edi, edi                        # rdi = element index
 11756  .LBB1_566:                              # =>This Inner Loop Header: Depth=1
 11757  	movups	xmm2, xmmword ptr [rdx + 4*rdi]
 11758  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 11759  	addps	xmm2, xmm1                      # per-lane single-precision add of the broadcast value
 11760  	addps	xmm3, xmm1
 11761  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 11762  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 11763  	movups	xmm2, xmmword ptr [rdx + 4*rdi + 32]
 11764  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 48]
 11765  	addps	xmm2, xmm1
 11766  	addps	xmm3, xmm1
 11767  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
 11768  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm3
 11769  	add	rdi, 16                         # 16 elements handled per pass
 11770  	add	rsi, 2                          # two groups done; ZF set when none remain
 11771  	jne	.LBB1_566
 11772  	jmp	.LBB1_918                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11773  .LBB1_567:                              # Vector path: [r8 + 8*i] = [rdx + 8*i] - rax on 64-bit lanes, 8 elements per loop pass
 11774  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11775  	and	esi, -4                         # rounded down to a whole number of 4-element groups
 11776  	movq	xmm0, rax                       # low qword = rax (presumably the scalar operand; confirm upstream)
 11777  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 11778  	lea	rcx, [rsi - 4]                  # zero when exactly one group exists
 11779  	mov	r9, rcx
 11780  	shr	r9, 2
 11781  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1, the number of 4-element groups
 11782  	test	rcx, rcx
 11783  	je	.LBB1_925                       # one group only: bypass the loop (target outside this excerpt)
 11784  # %bb.568:
 11785  	mov	rcx, r9
 11786  	and	rcx, -2                         # even part of the group count
 11787  	neg	rcx                             # negative counter, stepped up to zero below
 11788  	xor	edi, edi                        # rdi = element index
 11789  .LBB1_569:                              # =>This Inner Loop Header: Depth=1
 11790  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 11791  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 11792  	psubq	xmm1, xmm0                      # per-lane 64-bit subtract of the broadcast value
 11793  	psubq	xmm2, xmm0
 11794  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 11795  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 11796  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 32]
 11797  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 11798  	psubq	xmm1, xmm0
 11799  	psubq	xmm2, xmm0
 11800  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm1
 11801  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm2
 11802  	add	rdi, 8                          # 8 elements handled per pass
 11803  	add	rcx, 2                          # two groups done; ZF set when none remain
 11804  	jne	.LBB1_569
 11805  	jmp	.LBB1_926                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11806  .LBB1_570:                              # Vector path: [r8 + 4*i] = [rdx + 4*i] - xmm0[0] on float32 lanes, 16 elements per loop pass
 11807  	mov	ecx, eax                        # rcx = eax zero-extended (presumably the element count; confirm upstream)
 11808  	and	ecx, -8                         # rounded down to a whole number of 8-element groups
 11809  	movaps	xmm1, xmm0                      # lane 0 of xmm0 presumably holds the scalar operand; confirm upstream
 11810  	shufps	xmm1, xmm0, 0                   # xmm1 = xmm1[0,0],xmm0[0,0]
 11811  	lea	rsi, [rcx - 8]                  # zero when exactly one group exists
 11812  	mov	r9, rsi
 11813  	shr	r9, 3
 11814  	add	r9, 1                           # r9 = (rcx - 8) / 8 + 1, the number of 8-element groups
 11815  	test	rsi, rsi
 11816  	je	.LBB1_933                       # one group only: bypass the loop (target outside this excerpt)
 11817  # %bb.571:
 11818  	mov	rsi, r9
 11819  	and	rsi, -2                         # even part of the group count
 11820  	neg	rsi                             # negative counter, stepped up to zero below
 11821  	xor	edi, edi                        # rdi = element index
 11822  .LBB1_572:                              # =>This Inner Loop Header: Depth=1
 11823  	movups	xmm2, xmmword ptr [rdx + 4*rdi]
 11824  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 11825  	subps	xmm2, xmm1                      # per-lane single-precision subtract of the broadcast value
 11826  	subps	xmm3, xmm1
 11827  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 11828  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 11829  	movups	xmm2, xmmword ptr [rdx + 4*rdi + 32]
 11830  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 48]
 11831  	subps	xmm2, xmm1
 11832  	subps	xmm3, xmm1
 11833  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
 11834  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm3
 11835  	add	rdi, 16                         # 16 elements handled per pass
 11836  	add	rsi, 2                          # two groups done; ZF set when none remain
 11837  	jne	.LBB1_572
 11838  	jmp	.LBB1_934                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11839  .LBB1_573:                              # Vector path: [r8 + 8*i] = [rdx + 8*i] + rax on 64-bit lanes, 8 elements per loop pass
 11840  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11841  	and	esi, -4                         # rounded down to a whole number of 4-element groups
 11842  	movq	xmm0, rax                       # low qword = rax (presumably the scalar operand; confirm upstream)
 11843  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 11844  	lea	rcx, [rsi - 4]                  # zero when exactly one group exists
 11845  	mov	r9, rcx
 11846  	shr	r9, 2
 11847  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1, the number of 4-element groups
 11848  	test	rcx, rcx
 11849  	je	.LBB1_941                       # one group only: bypass the loop (target outside this excerpt)
 11850  # %bb.574:
 11851  	mov	rcx, r9
 11852  	and	rcx, -2                         # even part of the group count
 11853  	neg	rcx                             # negative counter, stepped up to zero below
 11854  	xor	edi, edi                        # rdi = element index
 11855  .LBB1_575:                              # =>This Inner Loop Header: Depth=1
 11856  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 11857  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 11858  	paddq	xmm1, xmm0                      # per-lane 64-bit add of the broadcast value
 11859  	paddq	xmm2, xmm0
 11860  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 11861  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 11862  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 32]
 11863  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 11864  	paddq	xmm1, xmm0
 11865  	paddq	xmm2, xmm0
 11866  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm1
 11867  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm2
 11868  	add	rdi, 8                          # 8 elements handled per pass
 11869  	add	rcx, 2                          # two groups done; ZF set when none remain
 11870  	jne	.LBB1_575
 11871  	jmp	.LBB1_942                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11872  .LBB1_576:                              # Vector path: [r8 + 4*i] = [rdx + 4*i] + xmm0[0] on float32 lanes, 16 elements per loop pass
 11873  	mov	ecx, eax                        # rcx = eax zero-extended (presumably the element count; confirm upstream)
 11874  	and	ecx, -8                         # rounded down to a whole number of 8-element groups
 11875  	movaps	xmm1, xmm0                      # lane 0 of xmm0 presumably holds the scalar operand; confirm upstream
 11876  	shufps	xmm1, xmm0, 0                   # xmm1 = xmm1[0,0],xmm0[0,0]
 11877  	lea	rsi, [rcx - 8]                  # zero when exactly one group exists
 11878  	mov	r9, rsi
 11879  	shr	r9, 3
 11880  	add	r9, 1                           # r9 = (rcx - 8) / 8 + 1, the number of 8-element groups
 11881  	test	rsi, rsi
 11882  	je	.LBB1_949                       # one group only: bypass the loop (target outside this excerpt)
 11883  # %bb.577:
 11884  	mov	rsi, r9
 11885  	and	rsi, -2                         # even part of the group count
 11886  	neg	rsi                             # negative counter, stepped up to zero below
 11887  	xor	edi, edi                        # rdi = element index
 11888  .LBB1_578:                              # =>This Inner Loop Header: Depth=1
 11889  	movups	xmm2, xmmword ptr [rdx + 4*rdi]
 11890  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 11891  	addps	xmm2, xmm1                      # per-lane single-precision add of the broadcast value
 11892  	addps	xmm3, xmm1
 11893  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 11894  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 11895  	movups	xmm2, xmmword ptr [rdx + 4*rdi + 32]
 11896  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 48]
 11897  	addps	xmm2, xmm1
 11898  	addps	xmm3, xmm1
 11899  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
 11900  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm3
 11901  	add	rdi, 16                         # 16 elements handled per pass
 11902  	add	rsi, 2                          # two groups done; ZF set when none remain
 11903  	jne	.LBB1_578
 11904  	jmp	.LBB1_950                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11905  .LBB1_579:                              # Vector path: [r8 + 8*i] = [rdx + 8*i] - rax on 64-bit lanes, 8 elements per loop pass
 11906  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 11907  	and	esi, -4                         # rounded down to a whole number of 4-element groups
 11908  	movq	xmm0, rax                       # low qword = rax (presumably the scalar operand; confirm upstream)
 11909  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 11910  	lea	rcx, [rsi - 4]                  # zero when exactly one group exists
 11911  	mov	r9, rcx
 11912  	shr	r9, 2
 11913  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1, the number of 4-element groups
 11914  	test	rcx, rcx
 11915  	je	.LBB1_957                       # one group only: bypass the loop (target outside this excerpt)
 11916  # %bb.580:
 11917  	mov	rcx, r9
 11918  	and	rcx, -2                         # even part of the group count
 11919  	neg	rcx                             # negative counter, stepped up to zero below
 11920  	xor	edi, edi                        # rdi = element index
 11921  .LBB1_581:                              # =>This Inner Loop Header: Depth=1
 11922  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 11923  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 11924  	psubq	xmm1, xmm0                      # per-lane 64-bit subtract of the broadcast value
 11925  	psubq	xmm2, xmm0
 11926  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 11927  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 11928  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 32]
 11929  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 11930  	psubq	xmm1, xmm0
 11931  	psubq	xmm2, xmm0
 11932  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm1
 11933  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm2
 11934  	add	rdi, 8                          # 8 elements handled per pass
 11935  	add	rcx, 2                          # two groups done; ZF set when none remain
 11936  	jne	.LBB1_581
 11937  	jmp	.LBB1_958                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11938  .LBB1_582:                              # Vector path: [r8 + 4*i] = [rdx + 4*i] - xmm0[0] on float32 lanes, 16 elements per loop pass
 11939  	mov	ecx, eax                        # rcx = eax zero-extended (presumably the element count; confirm upstream)
 11940  	and	ecx, -8                         # rounded down to a whole number of 8-element groups
 11941  	movaps	xmm1, xmm0                      # lane 0 of xmm0 presumably holds the scalar operand; confirm upstream
 11942  	shufps	xmm1, xmm0, 0                   # xmm1 = xmm1[0,0],xmm0[0,0]
 11943  	lea	rsi, [rcx - 8]                  # zero when exactly one group exists
 11944  	mov	r9, rsi
 11945  	shr	r9, 3
 11946  	add	r9, 1                           # r9 = (rcx - 8) / 8 + 1, the number of 8-element groups
 11947  	test	rsi, rsi
 11948  	je	.LBB1_965                       # one group only: bypass the loop (target outside this excerpt)
 11949  # %bb.583:
 11950  	mov	rsi, r9
 11951  	and	rsi, -2                         # even part of the group count
 11952  	neg	rsi                             # negative counter, stepped up to zero below
 11953  	xor	edi, edi                        # rdi = element index
 11954  .LBB1_584:                              # =>This Inner Loop Header: Depth=1
 11955  	movups	xmm2, xmmword ptr [rdx + 4*rdi]
 11956  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 11957  	subps	xmm2, xmm1                      # per-lane single-precision subtract of the broadcast value
 11958  	subps	xmm3, xmm1
 11959  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 11960  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 11961  	movups	xmm2, xmmword ptr [rdx + 4*rdi + 32]
 11962  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 48]
 11963  	subps	xmm2, xmm1
 11964  	subps	xmm3, xmm1
 11965  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
 11966  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm3
 11967  	add	rdi, 16                         # 16 elements handled per pass
 11968  	add	rsi, 2                          # two groups done; ZF set when none remain
 11969  	jne	.LBB1_584
 11970  	jmp	.LBB1_966                       # continues outside this excerpt (presumably leftover group and scalar tail)
 11971  .LBB1_585:                              # Vector path: [r8 + i] = low byte of ([rdx + i] * cl), done by widening bytes to words; 64 elements per loop pass
 11972  	mov	edi, r10d                       # rdi = r10d zero-extended (presumably the element count; confirm upstream)
 11973  	and	edi, -32                        # rounded down to a whole number of 32-element groups
 11974  	movzx	eax, cl                         # eax = cl (presumably the scalar operand; confirm upstream)
 11975  	movd	xmm0, eax
 11976  	pxor	xmm1, xmm1                      # all-zero shuffle control
 11977  	pshufb	xmm0, xmm1                      # copy byte 0 into all 16 bytes of xmm0
 11978  	lea	rax, [rdi - 32]                 # zero when exactly one group exists
 11979  	mov	r9, rax
 11980  	shr	r9, 5
 11981  	add	r9, 1                           # r9 = (rdi - 32) / 32 + 1, the number of 32-element groups
 11982  	pmovzxbw	xmm1, xmm0                      # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 11983  	test	rax, rax
 11984  	je	.LBB1_973                       # one group only: bypass the loop (target outside this excerpt)
 11985  # %bb.586:
 11986  	mov	rsi, r9
 11987  	and	rsi, -2                         # even part of the group count
 11988  	neg	rsi                             # negative counter, stepped up to zero below
 11989  	xor	eax, eax                        # rax = byte offset (elements are one byte wide)
 11990  	movdqa	xmm2, xmm0
 11991  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 11992  	movdqa	xmm3, xmmword ptr [rip + .LCPI1_0] # xmm3 = [255,255,255,255,255,255,255,255]
 11993  	movdqa	xmm4, xmm0                      # xmm4 is built the same way as xmm2 and ends up identical to it
 11994  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 11995  .LBB1_587:                              # =>This Inner Loop Header: Depth=1
 11996  	movdqu	xmm5, xmmword ptr [rdx + rax]
 11997  	movdqu	xmm6, xmmword ptr [rdx + rax + 16]
 11998  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 11999  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12000  	pmullw	xmm5, xmm2                      # upper 8 bytes: low byte of each word product = (data byte * scalar byte) mod 256
 12001  	pand	xmm5, xmm3                      # keep only that low byte in every word
 12002  	pmullw	xmm7, xmm1                      # lower 8 bytes: zero-extended data times zero-extended scalar
 12003  	pand	xmm7, xmm3
 12004  	packuswb	xmm7, xmm5                      # words back to bytes; inputs were masked to <= 255 so nothing saturates
 12005  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 12006  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12007  	pmullw	xmm6, xmm4
 12008  	pand	xmm6, xmm3
 12009  	pmullw	xmm5, xmm1
 12010  	pand	xmm5, xmm3
 12011  	packuswb	xmm5, xmm6
 12012  	movdqu	xmmword ptr [r8 + rax], xmm7
 12013  	movdqu	xmmword ptr [r8 + rax + 16], xmm5
 12014  	movdqu	xmm5, xmmword ptr [rdx + rax + 32]
 12015  	movdqu	xmm6, xmmword ptr [rdx + rax + 48]
 12016  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 12017  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12018  	pmullw	xmm5, xmm2
 12019  	pand	xmm5, xmm3
 12020  	pmullw	xmm7, xmm1
 12021  	pand	xmm7, xmm3
 12022  	packuswb	xmm7, xmm5
 12023  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 12024  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12025  	pmullw	xmm6, xmm4
 12026  	pand	xmm6, xmm3
 12027  	pmullw	xmm5, xmm1
 12028  	pand	xmm5, xmm3
 12029  	packuswb	xmm5, xmm6
 12030  	movdqu	xmmword ptr [r8 + rax + 32], xmm7
 12031  	movdqu	xmmword ptr [r8 + rax + 48], xmm5
 12032  	add	rax, 64                         # 64 elements handled per pass
 12033  	add	rsi, 2                          # two groups done; ZF set when none remain
 12034  	jne	.LBB1_587
 12035  	jmp	.LBB1_974                       # continues outside this excerpt (presumably leftover group and scalar tail)
 12036  .LBB1_588:                              # Vector path: [r8 + i] = low byte of ([rdx + i] * cl), done by widening bytes to words; 64 elements per loop pass
 12037  	mov	edi, r10d                       # rdi = r10d zero-extended (presumably the element count; confirm upstream)
 12038  	and	edi, -32                        # rounded down to a whole number of 32-element groups
 12039  	movzx	eax, cl                         # eax = cl (presumably the scalar operand; confirm upstream)
 12040  	movd	xmm0, eax
 12041  	pxor	xmm1, xmm1                      # all-zero shuffle control
 12042  	pshufb	xmm0, xmm1                      # copy byte 0 into all 16 bytes of xmm0
 12043  	lea	rax, [rdi - 32]                 # zero when exactly one group exists
 12044  	mov	r9, rax
 12045  	shr	r9, 5
 12046  	add	r9, 1                           # r9 = (rdi - 32) / 32 + 1, the number of 32-element groups
 12047  	pmovzxbw	xmm1, xmm0                      # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 12048  	test	rax, rax
 12049  	je	.LBB1_981                       # one group only: bypass the loop (target outside this excerpt)
 12050  # %bb.589:
 12051  	mov	rsi, r9
 12052  	and	rsi, -2                         # even part of the group count
 12053  	neg	rsi                             # negative counter, stepped up to zero below
 12054  	xor	eax, eax                        # rax = byte offset (elements are one byte wide)
 12055  	movdqa	xmm2, xmm0
 12056  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12057  	movdqa	xmm3, xmmword ptr [rip + .LCPI1_0] # xmm3 = [255,255,255,255,255,255,255,255]
 12058  	movdqa	xmm4, xmm0                      # xmm4 is built the same way as xmm2 and ends up identical to it
 12059  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12060  .LBB1_590:                              # =>This Inner Loop Header: Depth=1
 12061  	movdqu	xmm5, xmmword ptr [rdx + rax]
 12062  	movdqu	xmm6, xmmword ptr [rdx + rax + 16]
 12063  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 12064  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12065  	pmullw	xmm5, xmm2                      # upper 8 bytes: low byte of each word product = (data byte * scalar byte) mod 256
 12066  	pand	xmm5, xmm3                      # keep only that low byte in every word
 12067  	pmullw	xmm7, xmm1                      # lower 8 bytes: zero-extended data times zero-extended scalar
 12068  	pand	xmm7, xmm3
 12069  	packuswb	xmm7, xmm5                      # words back to bytes; inputs were masked to <= 255 so nothing saturates
 12070  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 12071  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12072  	pmullw	xmm6, xmm4
 12073  	pand	xmm6, xmm3
 12074  	pmullw	xmm5, xmm1
 12075  	pand	xmm5, xmm3
 12076  	packuswb	xmm5, xmm6
 12077  	movdqu	xmmword ptr [r8 + rax], xmm7
 12078  	movdqu	xmmword ptr [r8 + rax + 16], xmm5
 12079  	movdqu	xmm5, xmmword ptr [rdx + rax + 32]
 12080  	movdqu	xmm6, xmmword ptr [rdx + rax + 48]
 12081  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 12082  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12083  	pmullw	xmm5, xmm2
 12084  	pand	xmm5, xmm3
 12085  	pmullw	xmm7, xmm1
 12086  	pand	xmm7, xmm3
 12087  	packuswb	xmm7, xmm5
 12088  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 12089  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12090  	pmullw	xmm6, xmm4
 12091  	pand	xmm6, xmm3
 12092  	pmullw	xmm5, xmm1
 12093  	pand	xmm5, xmm3
 12094  	packuswb	xmm5, xmm6
 12095  	movdqu	xmmword ptr [r8 + rax + 32], xmm7
 12096  	movdqu	xmmword ptr [r8 + rax + 48], xmm5
 12097  	add	rax, 64                         # 64 elements handled per pass
 12098  	add	rsi, 2                          # two groups done; ZF set when none remain
 12099  	jne	.LBB1_590
 12100  	jmp	.LBB1_982                       # continues outside this excerpt (presumably leftover group and scalar tail)
 12101  .LBB1_591:                              # Vector path: [r8 + i] = [rdx + i] + al on 8-bit lanes, 64 elements per loop pass
 12102  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 12103  	and	esi, -32                        # rounded down to a whole number of 32-element groups
 12104  	movzx	ecx, al                         # ecx = al (presumably the scalar operand; confirm upstream)
 12105  	movd	xmm0, ecx
 12106  	pxor	xmm1, xmm1                      # all-zero shuffle control
 12107  	pshufb	xmm0, xmm1                      # copy byte 0 into all 16 bytes of xmm0
 12108  	lea	rcx, [rsi - 32]                 # zero when exactly one group exists
 12109  	mov	r9, rcx
 12110  	shr	r9, 5
 12111  	add	r9, 1                           # r9 = (rsi - 32) / 32 + 1, the number of 32-element groups
 12112  	test	rcx, rcx
 12113  	je	.LBB1_989                       # one group only: bypass the loop (target outside this excerpt)
 12114  # %bb.592:
 12115  	mov	rcx, r9
 12116  	and	rcx, -2                         # even part of the group count
 12117  	neg	rcx                             # negative counter, stepped up to zero below
 12118  	xor	edi, edi                        # rdi = element index (also the byte offset)
 12119  .LBB1_593:                              # =>This Inner Loop Header: Depth=1
 12120  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 12121  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 12122  	paddb	xmm1, xmm0                      # per-lane 8-bit add of the broadcast value
 12123  	paddb	xmm2, xmm0
 12124  	movdqu	xmmword ptr [r8 + rdi], xmm1
 12125  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 12126  	movdqu	xmm1, xmmword ptr [rdx + rdi + 32]
 12127  	movdqu	xmm2, xmmword ptr [rdx + rdi + 48]
 12128  	paddb	xmm1, xmm0
 12129  	paddb	xmm2, xmm0
 12130  	movdqu	xmmword ptr [r8 + rdi + 32], xmm1
 12131  	movdqu	xmmword ptr [r8 + rdi + 48], xmm2
 12132  	add	rdi, 64                         # 64 elements handled per pass
 12133  	add	rcx, 2                          # two groups done; ZF set when none remain
 12134  	jne	.LBB1_593
 12135  	jmp	.LBB1_990                       # continues outside this excerpt (presumably leftover group and scalar tail)
 12136  .LBB1_594:                              # Vector path: [r8 + i] = [rdx + i] - al on 8-bit lanes, 64 elements per loop pass
 12137  	mov	esi, r10d                       # rsi = r10d zero-extended (presumably the element count; confirm upstream)
 12138  	and	esi, -32                        # rounded down to a whole number of 32-element groups
 12139  	movzx	ecx, al                         # ecx = al (presumably the scalar operand; confirm upstream)
 12140  	movd	xmm0, ecx
 12141  	pxor	xmm1, xmm1                      # all-zero shuffle control
 12142  	pshufb	xmm0, xmm1                      # copy byte 0 into all 16 bytes of xmm0
 12143  	lea	rcx, [rsi - 32]                 # zero when exactly one group exists
 12144  	mov	r9, rcx
 12145  	shr	r9, 5
 12146  	add	r9, 1                           # r9 = (rsi - 32) / 32 + 1, the number of 32-element groups
 12147  	test	rcx, rcx
 12148  	je	.LBB1_997                       # one group only: bypass the loop (target outside this excerpt)
 12149  # %bb.595:
 12150  	mov	rcx, r9
 12151  	and	rcx, -2                         # even part of the group count
 12152  	neg	rcx                             # negative counter, stepped up to zero below
 12153  	xor	edi, edi                        # rdi = element index (also the byte offset)
 12154  .LBB1_596:                              # =>This Inner Loop Header: Depth=1
 12155  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 12156  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 12157  	psubb	xmm1, xmm0                      # per-lane 8-bit subtract of the broadcast value
 12158  	psubb	xmm2, xmm0
 12159  	movdqu	xmmword ptr [r8 + rdi], xmm1
 12160  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 12161  	movdqu	xmm1, xmmword ptr [rdx + rdi + 32]
 12162  	movdqu	xmm2, xmmword ptr [rdx + rdi + 48]
 12163  	psubb	xmm1, xmm0
 12164  	psubb	xmm2, xmm0
 12165  	movdqu	xmmword ptr [r8 + rdi + 32], xmm1
 12166  	movdqu	xmmword ptr [r8 + rdi + 48], xmm2
 12167  	add	rdi, 64                         # 64 elements handled per pass
 12168  	add	rcx, 2                          # two groups done; ZF set when none remain
 12169  	jne	.LBB1_596
 12170  	jmp	.LBB1_998                       # continues outside this excerpt (presumably leftover group and scalar tail)
 12171  .LBB1_597:                              # vector setup + 2x-unrolled loop: bytes [r8] = bytes [rdx] + broadcast al
 12172  	mov	esi, r10d
 12173  	and	esi, -32                        # rsi = r10d rounded down to a multiple of 32 bytes
 12174  	movzx	ecx, al
 12175  	movd	xmm0, ecx
 12176  	pxor	xmm1, xmm1
 12177  	pshufb	xmm0, xmm1                      # all-zero shuffle mask: copy byte 0 (al) into all 16 lanes
 12178  	lea	rcx, [rsi - 32]
 12179  	mov	r9, rcx
 12180  	shr	r9, 5
 12181  	add	r9, 1                           # r9 = ((rsi - 32) >> 5) + 1 (= rsi/32 for rsi >= 32)
 12182  	test	rcx, rcx
 12183  	je	.LBB1_1005                      # rsi == 32: bypass the 2x-unrolled loop
 12184  # %bb.598:
 12185  	mov	rcx, r9
 12186  	and	rcx, -2
 12187  	neg	rcx                             # rcx = -(r9 & ~1); stepped by +2 until it reaches 0
 12188  	xor	edi, edi                        # rdi = byte offset used for both [rdx] and [r8]
 12189  .LBB1_599:                              # =>This Inner Loop Header: Depth=1
 12190  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 12191  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 12192  	paddb	xmm1, xmm0                      # wrapping byte-wise add of the broadcast value
 12193  	paddb	xmm2, xmm0
 12194  	movdqu	xmmword ptr [r8 + rdi], xmm1
 12195  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 12196  	movdqu	xmm1, xmmword ptr [rdx + rdi + 32]
 12197  	movdqu	xmm2, xmmword ptr [rdx + rdi + 48]
 12198  	paddb	xmm1, xmm0
 12199  	paddb	xmm2, xmm0
 12200  	movdqu	xmmword ptr [r8 + rdi + 32], xmm1
 12201  	movdqu	xmmword ptr [r8 + rdi + 48], xmm2
 12202  	add	rdi, 64                         # four 16-byte vectors handled per iteration
 12203  	add	rcx, 2
 12204  	jne	.LBB1_599
 12205  	jmp	.LBB1_1006
 12206  .LBB1_600:                              # vector setup + 2x-unrolled loop: bytes [r8] = bytes [rdx] - broadcast al
 12207  	mov	esi, r10d
 12208  	and	esi, -32                        # rsi = r10d rounded down to a multiple of 32 bytes
 12209  	movzx	ecx, al
 12210  	movd	xmm0, ecx
 12211  	pxor	xmm1, xmm1
 12212  	pshufb	xmm0, xmm1                      # all-zero shuffle mask: copy byte 0 (al) into all 16 lanes
 12213  	lea	rcx, [rsi - 32]
 12214  	mov	r9, rcx
 12215  	shr	r9, 5
 12216  	add	r9, 1                           # r9 = ((rsi - 32) >> 5) + 1 (= rsi/32 for rsi >= 32)
 12217  	test	rcx, rcx
 12218  	je	.LBB1_1013                      # rsi == 32: bypass the 2x-unrolled loop
 12219  # %bb.601:
 12220  	mov	rcx, r9
 12221  	and	rcx, -2
 12222  	neg	rcx                             # rcx = -(r9 & ~1); stepped by +2 until it reaches 0
 12223  	xor	edi, edi                        # rdi = byte offset used for both [rdx] and [r8]
 12224  .LBB1_602:                              # =>This Inner Loop Header: Depth=1
 12225  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 12226  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 12227  	psubb	xmm1, xmm0                      # wrapping byte-wise subtract of the broadcast value
 12228  	psubb	xmm2, xmm0
 12229  	movdqu	xmmword ptr [r8 + rdi], xmm1
 12230  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 12231  	movdqu	xmm1, xmmword ptr [rdx + rdi + 32]
 12232  	movdqu	xmm2, xmmword ptr [rdx + rdi + 48]
 12233  	psubb	xmm1, xmm0
 12234  	psubb	xmm2, xmm0
 12235  	movdqu	xmmword ptr [r8 + rdi + 32], xmm1
 12236  	movdqu	xmmword ptr [r8 + rdi + 48], xmm2
 12237  	add	rdi, 64                         # four 16-byte vectors handled per iteration
 12238  	add	rcx, 2
 12239  	jne	.LBB1_602
 12240  	jmp	.LBB1_1014
 12241  .LBB1_603:                              # vector setup + 2x-unrolled loop: 32-bit lanes, [r8] = [rdx] * broadcast eax
 12242  	mov	esi, r10d
 12243  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 elements
 12244  	movd	xmm0, eax
 12245  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 12246  	lea	rcx, [rsi - 8]
 12247  	mov	r9, rcx
 12248  	shr	r9, 3
 12249  	add	r9, 1                           # r9 = ((rsi - 8) >> 3) + 1 (= rsi/8 for rsi >= 8)
 12250  	test	rcx, rcx
 12251  	je	.LBB1_1021                      # rsi == 8: bypass the 2x-unrolled loop
 12252  # %bb.604:
 12253  	mov	rcx, r9
 12254  	and	rcx, -2
 12255  	neg	rcx                             # rcx = -(r9 & ~1); stepped by +2 until it reaches 0
 12256  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 12257  .LBB1_605:                              # =>This Inner Loop Header: Depth=1
 12258  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 12259  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 12260  	pmulld	xmm1, xmm0                      # low 32 bits of each 32-bit product
 12261  	pmulld	xmm2, xmm0
 12262  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 12263  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 12264  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 32]
 12265  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 12266  	pmulld	xmm1, xmm0
 12267  	pmulld	xmm2, xmm0
 12268  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 12269  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 12270  	add	rdi, 16                         # 16 dwords (four vectors) handled per iteration
 12271  	add	rcx, 2
 12272  	jne	.LBB1_605
 12273  	jmp	.LBB1_1022
 12274  .LBB1_606:                              # vector setup + 2x-unrolled loop: 32-bit lanes, [r8] = [rdx] * broadcast eax
 12275  	mov	esi, r10d
 12276  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 elements
 12277  	movd	xmm0, eax
 12278  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 12279  	lea	rcx, [rsi - 8]
 12280  	mov	r9, rcx
 12281  	shr	r9, 3
 12282  	add	r9, 1                           # r9 = ((rsi - 8) >> 3) + 1 (= rsi/8 for rsi >= 8)
 12283  	test	rcx, rcx
 12284  	je	.LBB1_1029                      # rsi == 8: bypass the 2x-unrolled loop
 12285  # %bb.607:
 12286  	mov	rcx, r9
 12287  	and	rcx, -2
 12288  	neg	rcx                             # rcx = -(r9 & ~1); stepped by +2 until it reaches 0
 12289  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 12290  .LBB1_608:                              # =>This Inner Loop Header: Depth=1
 12291  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 12292  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 12293  	pmulld	xmm1, xmm0                      # low 32 bits of each 32-bit product
 12294  	pmulld	xmm2, xmm0
 12295  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 12296  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 12297  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 32]
 12298  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 12299  	pmulld	xmm1, xmm0
 12300  	pmulld	xmm2, xmm0
 12301  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 12302  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 12303  	add	rdi, 16                         # 16 dwords (four vectors) handled per iteration
 12304  	add	rcx, 2
 12305  	jne	.LBB1_608
 12306  	jmp	.LBB1_1030
 12307  .LBB1_609:                              # vector setup + 2x-unrolled loop: 32-bit lanes, [r8] = [rdx] + broadcast eax
 12308  	mov	esi, r10d
 12309  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 elements
 12310  	movd	xmm0, eax
 12311  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 12312  	lea	rcx, [rsi - 8]
 12313  	mov	r9, rcx
 12314  	shr	r9, 3
 12315  	add	r9, 1                           # r9 = ((rsi - 8) >> 3) + 1 (= rsi/8 for rsi >= 8)
 12316  	test	rcx, rcx
 12317  	je	.LBB1_1037                      # rsi == 8: bypass the 2x-unrolled loop
 12318  # %bb.610:
 12319  	mov	rcx, r9
 12320  	and	rcx, -2
 12321  	neg	rcx                             # rcx = -(r9 & ~1); stepped by +2 until it reaches 0
 12322  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 12323  .LBB1_611:                              # =>This Inner Loop Header: Depth=1
 12324  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 12325  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 12326  	paddd	xmm1, xmm0                      # wrapping 32-bit add of the broadcast value
 12327  	paddd	xmm2, xmm0
 12328  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 12329  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 12330  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 32]
 12331  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 12332  	paddd	xmm1, xmm0
 12333  	paddd	xmm2, xmm0
 12334  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 12335  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 12336  	add	rdi, 16                         # 16 dwords (four vectors) handled per iteration
 12337  	add	rcx, 2
 12338  	jne	.LBB1_611
 12339  	jmp	.LBB1_1038
 12340  .LBB1_612:                              # vector setup + 2x-unrolled loop: 32-bit lanes, [r8] = [rdx] - broadcast eax
 12341  	mov	esi, r10d
 12342  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 elements
 12343  	movd	xmm0, eax
 12344  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 12345  	lea	rcx, [rsi - 8]
 12346  	mov	r9, rcx
 12347  	shr	r9, 3
 12348  	add	r9, 1                           # r9 = ((rsi - 8) >> 3) + 1 (= rsi/8 for rsi >= 8)
 12349  	test	rcx, rcx
 12350  	je	.LBB1_1045                      # rsi == 8: bypass the 2x-unrolled loop
 12351  # %bb.613:
 12352  	mov	rcx, r9
 12353  	and	rcx, -2
 12354  	neg	rcx                             # rcx = -(r9 & ~1); stepped by +2 until it reaches 0
 12355  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 12356  .LBB1_614:                              # =>This Inner Loop Header: Depth=1
 12357  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 12358  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 12359  	psubd	xmm1, xmm0                      # wrapping 32-bit subtract of the broadcast value
 12360  	psubd	xmm2, xmm0
 12361  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 12362  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 12363  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 32]
 12364  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 12365  	psubd	xmm1, xmm0
 12366  	psubd	xmm2, xmm0
 12367  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 12368  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 12369  	add	rdi, 16                         # 16 dwords (four vectors) handled per iteration
 12370  	add	rcx, 2
 12371  	jne	.LBB1_614
 12372  	jmp	.LBB1_1046
 12373  .LBB1_615:                              # vector setup + 2x-unrolled loop: 32-bit lanes, [r8] = [rdx] + broadcast eax
 12374  	mov	esi, r10d
 12375  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 elements
 12376  	movd	xmm0, eax
 12377  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 12378  	lea	rcx, [rsi - 8]
 12379  	mov	r9, rcx
 12380  	shr	r9, 3
 12381  	add	r9, 1                           # r9 = ((rsi - 8) >> 3) + 1 (= rsi/8 for rsi >= 8)
 12382  	test	rcx, rcx
 12383  	je	.LBB1_1053                      # rsi == 8: bypass the 2x-unrolled loop
 12384  # %bb.616:
 12385  	mov	rcx, r9
 12386  	and	rcx, -2
 12387  	neg	rcx                             # rcx = -(r9 & ~1); stepped by +2 until it reaches 0
 12388  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 12389  .LBB1_617:                              # =>This Inner Loop Header: Depth=1
 12390  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 12391  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 12392  	paddd	xmm1, xmm0                      # wrapping 32-bit add of the broadcast value
 12393  	paddd	xmm2, xmm0
 12394  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 12395  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 12396  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 32]
 12397  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 12398  	paddd	xmm1, xmm0
 12399  	paddd	xmm2, xmm0
 12400  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 12401  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 12402  	add	rdi, 16                         # 16 dwords (four vectors) handled per iteration
 12403  	add	rcx, 2
 12404  	jne	.LBB1_617
 12405  	jmp	.LBB1_1054
 12406  .LBB1_618:                              # vector setup + 2x-unrolled loop: 32-bit lanes, [r8] = [rdx] - broadcast eax
 12407  	mov	esi, r10d
 12408  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 elements
 12409  	movd	xmm0, eax
 12410  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 12411  	lea	rcx, [rsi - 8]
 12412  	mov	r9, rcx
 12413  	shr	r9, 3
 12414  	add	r9, 1                           # r9 = ((rsi - 8) >> 3) + 1 (= rsi/8 for rsi >= 8)
 12415  	test	rcx, rcx
 12416  	je	.LBB1_1061                      # rsi == 8: bypass the 2x-unrolled loop
 12417  # %bb.619:
 12418  	mov	rcx, r9
 12419  	and	rcx, -2
 12420  	neg	rcx                             # rcx = -(r9 & ~1); stepped by +2 until it reaches 0
 12421  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 12422  .LBB1_620:                              # =>This Inner Loop Header: Depth=1
 12423  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 12424  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 12425  	psubd	xmm1, xmm0                      # wrapping 32-bit subtract of the broadcast value
 12426  	psubd	xmm2, xmm0
 12427  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 12428  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 12429  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 32]
 12430  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 12431  	psubd	xmm1, xmm0
 12432  	psubd	xmm2, xmm0
 12433  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 12434  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 12435  	add	rdi, 16                         # 16 dwords (four vectors) handled per iteration
 12436  	add	rcx, 2
 12437  	jne	.LBB1_620
 12438  	jmp	.LBB1_1062
 12439  .LBB1_621:                              # entry: start at element index rdi = 0
 12440  	xor	edi, edi
 12441  .LBB1_622:                              # optional extra pass over two 16-byte vectors (32-bit multiply by xmm0)
 12442  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12443  	je	.LBB1_624
 12444  # %bb.623:
 12445  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 12446  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 12447  	pmulld	xmm1, xmm0
 12448  	pmulld	xmm2, xmm0
 12449  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 12450  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 12451  .LBB1_624:
 12452  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12453  	je	.LBB1_1069
 12454  	jmp	.LBB1_625
 12455  .LBB1_629:                              # entry: start at element index rdi = 0
 12456  	xor	edi, edi
 12457  .LBB1_630:                              # optional extra pass over two 16-byte vectors (32-bit multiply by xmm0)
 12458  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12459  	je	.LBB1_632
 12460  # %bb.631:
 12461  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 12462  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 12463  	pmulld	xmm1, xmm0
 12464  	pmulld	xmm2, xmm0
 12465  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 12466  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 12467  .LBB1_632:
 12468  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12469  	je	.LBB1_1069
 12470  	jmp	.LBB1_633
 12471  .LBB1_637:                              # entry: start at element index rdi = 0
 12472  	xor	edi, edi
 12473  .LBB1_638:                              # optional extra pass over two 16-byte vectors (32-bit add of xmm0)
 12474  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12475  	je	.LBB1_640
 12476  # %bb.639:
 12477  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 12478  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 12479  	paddd	xmm1, xmm0
 12480  	paddd	xmm2, xmm0
 12481  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 12482  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 12483  .LBB1_640:
 12484  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12485  	je	.LBB1_1069
 12486  	jmp	.LBB1_641
 12487  .LBB1_645:                              # entry: start at element index rdi = 0
 12488  	xor	edi, edi
 12489  .LBB1_646:                              # optional extra pass over two 16-byte vectors (32-bit subtract of xmm0)
 12490  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12491  	je	.LBB1_648
 12492  # %bb.647:
 12493  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 12494  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 12495  	psubd	xmm1, xmm0
 12496  	psubd	xmm2, xmm0
 12497  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 12498  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 12499  .LBB1_648:
 12500  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12501  	je	.LBB1_1069
 12502  	jmp	.LBB1_649
 12503  .LBB1_653:                              # entry: start at element index rdi = 0
 12504  	xor	edi, edi
 12505  .LBB1_654:                              # optional extra pass over two 16-byte vectors (32-bit add of xmm0)
 12506  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12507  	je	.LBB1_656
 12508  # %bb.655:
 12509  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 12510  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 12511  	paddd	xmm1, xmm0
 12512  	paddd	xmm2, xmm0
 12513  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 12514  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 12515  .LBB1_656:
 12516  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12517  	je	.LBB1_1069
 12518  	jmp	.LBB1_657
 12519  .LBB1_661:                              # entry: start at element index rdi = 0
 12520  	xor	edi, edi
 12521  .LBB1_662:                              # optional extra pass over two 16-byte vectors (32-bit subtract of xmm0)
 12522  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12523  	je	.LBB1_664
 12524  # %bb.663:
 12525  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 12526  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 12527  	psubd	xmm1, xmm0
 12528  	psubd	xmm2, xmm0
 12529  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 12530  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 12531  .LBB1_664:
 12532  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12533  	je	.LBB1_1069
 12534  	jmp	.LBB1_665
 12535  .LBB1_669:                              # entry: start at element index rdi = 0
 12536  	xor	edi, edi
 12537  .LBB1_670:                              # optional extra pass over two 16-byte vectors (packed-double multiply by xmm1)
 12538  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12539  	je	.LBB1_672
 12540  # %bb.671:
 12541  	movupd	xmm2, xmmword ptr [rdx + 8*rdi]
 12542  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 12543  	mulpd	xmm2, xmm1
 12544  	mulpd	xmm3, xmm1
 12545  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 12546  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 12547  .LBB1_672:
 12548  	cmp	rcx, rax                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12549  	je	.LBB1_1069
 12550  	jmp	.LBB1_673
 12551  .LBB1_677:                              # entry: start at element index rdi = 0
 12552  	xor	edi, edi
 12553  .LBB1_678:                              # optional extra pass over two 16-byte vectors (packed-double multiply by xmm1)
 12554  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12555  	je	.LBB1_680
 12556  # %bb.679:
 12557  	movupd	xmm2, xmmword ptr [rdx + 8*rdi]
 12558  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 12559  	mulpd	xmm2, xmm1
 12560  	mulpd	xmm3, xmm1
 12561  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 12562  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 12563  .LBB1_680:
 12564  	cmp	rcx, rax                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12565  	je	.LBB1_1069
 12566  	jmp	.LBB1_681
 12567  .LBB1_685:                              # entry: start at element index rdi = 0
 12568  	xor	edi, edi
 12569  .LBB1_686:                              # optional extra pass over two 16-byte vectors (packed-double add of xmm1)
 12570  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12571  	je	.LBB1_688
 12572  # %bb.687:
 12573  	movupd	xmm2, xmmword ptr [rdx + 8*rdi]
 12574  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 12575  	addpd	xmm2, xmm1
 12576  	addpd	xmm3, xmm1
 12577  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 12578  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 12579  .LBB1_688:
 12580  	cmp	rcx, rax                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12581  	je	.LBB1_1069
 12582  	jmp	.LBB1_689
 12583  .LBB1_693:                              # entry: start at element index rdi = 0
 12584  	xor	edi, edi
 12585  .LBB1_694:                              # optional extra pass over two 16-byte vectors (packed-double subtract of xmm1)
 12586  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12587  	je	.LBB1_696
 12588  # %bb.695:
 12589  	movupd	xmm2, xmmword ptr [rdx + 8*rdi]
 12590  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 12591  	subpd	xmm2, xmm1
 12592  	subpd	xmm3, xmm1
 12593  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 12594  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 12595  .LBB1_696:
 12596  	cmp	rcx, rax                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12597  	je	.LBB1_1069
 12598  	jmp	.LBB1_697
 12599  .LBB1_701:                              # entry: start at element index rdi = 0
 12600  	xor	edi, edi
 12601  .LBB1_702:                              # optional extra pass over two 16-byte vectors (packed-double add of xmm1)
 12602  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12603  	je	.LBB1_704
 12604  # %bb.703:
 12605  	movupd	xmm2, xmmword ptr [rdx + 8*rdi]
 12606  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 12607  	addpd	xmm2, xmm1
 12608  	addpd	xmm3, xmm1
 12609  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 12610  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 12611  .LBB1_704:
 12612  	cmp	rcx, rax                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12613  	je	.LBB1_1069
 12614  	jmp	.LBB1_705
 12615  .LBB1_709:                              # entry: start at element index rdi = 0
 12616  	xor	edi, edi
 12617  .LBB1_710:                              # optional extra pass over two 16-byte vectors (packed-double subtract of xmm1)
 12618  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12619  	je	.LBB1_712
 12620  # %bb.711:
 12621  	movupd	xmm2, xmmword ptr [rdx + 8*rdi]
 12622  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 12623  	subpd	xmm2, xmm1
 12624  	subpd	xmm3, xmm1
 12625  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 12626  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 12627  .LBB1_712:
 12628  	cmp	rcx, rax                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12629  	je	.LBB1_1069
 12630  	jmp	.LBB1_713
 12631  .LBB1_717:                              # entry: start at byte offset rax = 0
 12632  	xor	eax, eax
 12633  .LBB1_718:                              # optional extra pass over two 16-byte vectors (byte-wise multiply via 16-bit lanes)
 12634  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12635  	je	.LBB1_720
 12636  # %bb.719:
 12637  	movdqu	xmm2, xmmword ptr [rdx + rax]
 12638  	movdqu	xmm3, xmmword ptr [rdx + rax + 16]
 12639  	movdqa	xmm4, xmm0
 12640  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12641  	pmovzxbw	xmm5, xmm2                      # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 12642  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12643  	pmullw	xmm2, xmm4                      # 16-bit products for the high 8 source bytes
 12644  	movdqa	xmm4, xmmword ptr [rip + .LCPI1_0] # xmm4 = [255,255,255,255,255,255,255,255]
 12645  	pand	xmm2, xmm4                      # keep only the low byte of each product
 12646  	pmullw	xmm5, xmm1                      # NOTE(review): xmm1 is set outside this view, presumably the widened low bytes of xmm0 - confirm
 12647  	pand	xmm5, xmm4
 12648  	packuswb	xmm5, xmm2                      # words are <= 255 after the mask, so the pack does not saturate
 12649  	punpckhbw	xmm0, xmm0              # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12650  	pmovzxbw	xmm2, xmm3                      # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 12651  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12652  	pmullw	xmm3, xmm0
 12653  	pand	xmm3, xmm4
 12654  	pmullw	xmm2, xmm1
 12655  	pand	xmm2, xmm4
 12656  	packuswb	xmm2, xmm3
 12657  	movdqu	xmmword ptr [r8 + rax], xmm5
 12658  	movdqu	xmmword ptr [r8 + rax + 16], xmm2
 12659  .LBB1_720:
 12660  	cmp	rdi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12661  	je	.LBB1_1069
 12662  	jmp	.LBB1_721
 12663  .LBB1_725:                              # entry: start at byte offset rax = 0
 12664  	xor	eax, eax
 12665  .LBB1_726:                              # optional extra pass over two 16-byte vectors (byte-wise multiply via 16-bit lanes)
 12666  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12667  	je	.LBB1_728
 12668  # %bb.727:
 12669  	movdqu	xmm2, xmmword ptr [rdx + rax]
 12670  	movdqu	xmm3, xmmword ptr [rdx + rax + 16]
 12671  	movdqa	xmm4, xmm0
 12672  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12673  	pmovzxbw	xmm5, xmm2                      # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 12674  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12675  	pmullw	xmm2, xmm4                      # 16-bit products for the high 8 source bytes
 12676  	movdqa	xmm4, xmmword ptr [rip + .LCPI1_0] # xmm4 = [255,255,255,255,255,255,255,255]
 12677  	pand	xmm2, xmm4                      # keep only the low byte of each product
 12678  	pmullw	xmm5, xmm1                      # NOTE(review): xmm1 is set outside this view, presumably the widened low bytes of xmm0 - confirm
 12679  	pand	xmm5, xmm4
 12680  	packuswb	xmm5, xmm2                      # words are <= 255 after the mask, so the pack does not saturate
 12681  	punpckhbw	xmm0, xmm0              # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12682  	pmovzxbw	xmm2, xmm3                      # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 12683  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 12684  	pmullw	xmm3, xmm0
 12685  	pand	xmm3, xmm4
 12686  	pmullw	xmm2, xmm1
 12687  	pand	xmm2, xmm4
 12688  	packuswb	xmm2, xmm3
 12689  	movdqu	xmmword ptr [r8 + rax], xmm5
 12690  	movdqu	xmmword ptr [r8 + rax + 16], xmm2
 12691  .LBB1_728:
 12692  	cmp	rdi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12693  	je	.LBB1_1069
 12694  	jmp	.LBB1_729
 12695  .LBB1_733:                              # entry: start at byte offset rdi = 0
 12696  	xor	edi, edi
 12697  .LBB1_734:                              # optional extra pass over two 16-byte vectors (byte-wise add of xmm0)
 12698  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12699  	je	.LBB1_736
 12700  # %bb.735:
 12701  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 12702  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 12703  	paddb	xmm1, xmm0
 12704  	paddb	xmm2, xmm0
 12705  	movdqu	xmmword ptr [r8 + rdi], xmm1
 12706  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 12707  .LBB1_736:
 12708  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12709  	je	.LBB1_1069
 12710  	jmp	.LBB1_737
 12711  .LBB1_741:                              # entry: start at byte offset rdi = 0
 12712  	xor	edi, edi
 12713  .LBB1_742:                              # optional extra pass over two 16-byte vectors (byte-wise subtract of xmm0)
 12714  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12715  	je	.LBB1_744
 12716  # %bb.743:
 12717  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 12718  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 12719  	psubb	xmm1, xmm0
 12720  	psubb	xmm2, xmm0
 12721  	movdqu	xmmword ptr [r8 + rdi], xmm1
 12722  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 12723  .LBB1_744:
 12724  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12725  	je	.LBB1_1069
 12726  	jmp	.LBB1_745
 12727  .LBB1_749:                              # entry: start at byte offset rdi = 0
 12728  	xor	edi, edi
 12729  .LBB1_750:                              # optional extra pass over two 16-byte vectors (byte-wise add of xmm0)
 12730  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12731  	je	.LBB1_752
 12732  # %bb.751:
 12733  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 12734  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 12735  	paddb	xmm1, xmm0
 12736  	paddb	xmm2, xmm0
 12737  	movdqu	xmmword ptr [r8 + rdi], xmm1
 12738  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 12739  .LBB1_752:
 12740  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12741  	je	.LBB1_1069
 12742  	jmp	.LBB1_753
 12743  .LBB1_757:                              # entry: start at byte offset rdi = 0
 12744  	xor	edi, edi
 12745  .LBB1_758:                              # optional extra pass over two 16-byte vectors (byte-wise subtract of xmm0)
 12746  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12747  	je	.LBB1_760
 12748  # %bb.759:
 12749  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 12750  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 12751  	psubb	xmm1, xmm0
 12752  	psubb	xmm2, xmm0
 12753  	movdqu	xmmword ptr [r8 + rdi], xmm1
 12754  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 12755  .LBB1_760:
 12756  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12757  	je	.LBB1_1069
 12758  	jmp	.LBB1_761
 12759  .LBB1_765:                              # entry: start at element index rdi = 0
 12760  	xor	edi, edi
 12761  .LBB1_766:                              # optional extra pass over two 16-byte vectors (64-bit add of xmm0)
 12762  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12763  	je	.LBB1_768
 12764  # %bb.767:
 12765  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 12766  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 12767  	paddq	xmm1, xmm0
 12768  	paddq	xmm2, xmm0
 12769  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 12770  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 12771  .LBB1_768:
 12772  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12773  	je	.LBB1_1069
 12774  	jmp	.LBB1_769
 12775  .LBB1_773:                              # entry: start at element index rdi = 0
 12776  	xor	edi, edi
 12777  .LBB1_774:                              # optional extra pass over two 16-byte vectors (64-bit subtract of xmm0)
 12778  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12779  	je	.LBB1_776
 12780  # %bb.775:
 12781  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 12782  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 12783  	psubq	xmm1, xmm0
 12784  	psubq	xmm2, xmm0
 12785  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 12786  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 12787  .LBB1_776:
 12788  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12789  	je	.LBB1_1069
 12790  	jmp	.LBB1_777
 12791  .LBB1_781:                              # entry: start at element index rdi = 0
 12792  	xor	edi, edi
 12793  .LBB1_782:                              # optional extra pass over two 16-byte vectors (64-bit add of xmm0)
 12794  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12795  	je	.LBB1_784
 12796  # %bb.783:
 12797  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 12798  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 12799  	paddq	xmm1, xmm0
 12800  	paddq	xmm2, xmm0
 12801  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 12802  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 12803  .LBB1_784:
 12804  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12805  	je	.LBB1_1069
 12806  	jmp	.LBB1_785
 12807  .LBB1_789:                              # entry: start at element index rdi = 0
 12808  	xor	edi, edi
 12809  .LBB1_790:                              # optional extra pass over two 16-byte vectors (64-bit subtract of xmm0)
 12810  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12811  	je	.LBB1_792
 12812  # %bb.791:
 12813  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 12814  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 12815  	psubq	xmm1, xmm0
 12816  	psubq	xmm2, xmm0
 12817  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 12818  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 12819  .LBB1_792:
 12820  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12821  	je	.LBB1_1069
 12822  	jmp	.LBB1_793
 12823  .LBB1_797:                              # entry: start at element index rdi = 0
 12824  	xor	edi, edi
 12825  .LBB1_798:                              # optional extra pass over two 16-byte vectors (16-bit multiply by xmm0, low halves kept)
 12826  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12827  	je	.LBB1_800
 12828  # %bb.799:
 12829  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 12830  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 12831  	pmullw	xmm1, xmm0
 12832  	pmullw	xmm2, xmm0
 12833  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 12834  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 12835  .LBB1_800:
 12836  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12837  	je	.LBB1_1069
 12838  	jmp	.LBB1_801
 12839  .LBB1_805:                              # entry: start at element index rdi = 0
 12840  	xor	edi, edi
 12841  .LBB1_806:                              # optional extra pass over two 16-byte vectors (16-bit multiply by xmm0, low halves kept)
 12842  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12843  	je	.LBB1_808
 12844  # %bb.807:
 12845  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 12846  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 12847  	pmullw	xmm1, xmm0
 12848  	pmullw	xmm2, xmm0
 12849  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 12850  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 12851  .LBB1_808:
 12852  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12853  	je	.LBB1_1069
 12854  	jmp	.LBB1_809
 12855  .LBB1_813:                              # entry: start at element index rdi = 0
 12856  	xor	edi, edi
 12857  .LBB1_814:                              # optional extra pass over two 16-byte vectors (16-bit multiply by xmm0, low halves kept)
 12858  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12859  	je	.LBB1_816
 12860  # %bb.815:
 12861  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 12862  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 12863  	pmullw	xmm1, xmm0
 12864  	pmullw	xmm2, xmm0
 12865  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 12866  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 12867  .LBB1_816:
 12868  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12869  	je	.LBB1_1069
 12870  	jmp	.LBB1_817
 12871  .LBB1_821:                              # entry: start at element index rdi = 0
 12872  	xor	edi, edi
 12873  .LBB1_822:                              # optional extra pass over two 16-byte vectors (16-bit multiply by xmm0, low halves kept)
 12874  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12875  	je	.LBB1_824
 12876  # %bb.823:
 12877  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 12878  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 12879  	pmullw	xmm1, xmm0
 12880  	pmullw	xmm2, xmm0
 12881  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 12882  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 12883  .LBB1_824:
 12884  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12885  	je	.LBB1_1069
 12886  	jmp	.LBB1_825
 12887  .LBB1_829:                              # entry: start at element index rdi = 0
 12888  	xor	edi, edi
 12889  .LBB1_830:                              # optional extra pass over two 16-byte vectors (16-bit add of xmm0)
 12890  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12891  	je	.LBB1_832
 12892  # %bb.831:
 12893  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 12894  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 12895  	paddw	xmm1, xmm0
 12896  	paddw	xmm2, xmm0
 12897  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 12898  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 12899  .LBB1_832:
 12900  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12901  	je	.LBB1_1069
 12902  	jmp	.LBB1_833
 12903  .LBB1_837:                              # entry: start at element index rdi = 0
 12904  	xor	edi, edi
 12905  .LBB1_838:                              # optional extra pass over two 16-byte vectors (16-bit add of xmm0)
 12906  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12907  	je	.LBB1_840
 12908  # %bb.839:
 12909  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 12910  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 12911  	paddw	xmm1, xmm0
 12912  	paddw	xmm2, xmm0
 12913  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 12914  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 12915  .LBB1_840:
 12916  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12917  	je	.LBB1_1069
 12918  	jmp	.LBB1_841
 12919  .LBB1_845:                              # entry: start at element index rdi = 0
 12920  	xor	edi, edi
 12921  .LBB1_846:                              # optional extra pass over two 16-byte vectors (16-bit subtract of xmm0)
 12922  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12923  	je	.LBB1_848
 12924  # %bb.847:
 12925  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 12926  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 12927  	psubw	xmm1, xmm0
 12928  	psubw	xmm2, xmm0
 12929  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 12930  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 12931  .LBB1_848:
 12932  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12933  	je	.LBB1_1069
 12934  	jmp	.LBB1_849
 12935  .LBB1_853:                              # entry: start at element index rdi = 0
 12936  	xor	edi, edi
 12937  .LBB1_854:                              # optional extra pass over two 16-byte vectors (16-bit subtract of xmm0)
 12938  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12939  	je	.LBB1_856
 12940  # %bb.855:
 12941  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 12942  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 12943  	psubw	xmm1, xmm0
 12944  	psubw	xmm2, xmm0
 12945  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 12946  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 12947  .LBB1_856:
 12948  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12949  	je	.LBB1_1069
 12950  	jmp	.LBB1_857
 12951  .LBB1_861:                              # entry: start at element index rdi = 0
 12952  	xor	edi, edi
 12953  .LBB1_862:                              # optional extra pass over two 16-byte vectors (16-bit add of xmm0)
 12954  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12955  	je	.LBB1_864
 12956  # %bb.863:
 12957  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 12958  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 12959  	paddw	xmm1, xmm0
 12960  	paddw	xmm2, xmm0
 12961  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 12962  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 12963  .LBB1_864:
 12964  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12965  	je	.LBB1_1069
 12966  	jmp	.LBB1_865
 12967  .LBB1_869:                              # entry: start at element index rdi = 0
 12968  	xor	edi, edi
 12969  .LBB1_870:                              # optional extra pass over two 16-byte vectors (16-bit add of xmm0)
 12970  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12971  	je	.LBB1_872
 12972  # %bb.871:
 12973  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 12974  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 12975  	paddw	xmm1, xmm0
 12976  	paddw	xmm2, xmm0
 12977  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 12978  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 12979  .LBB1_872:
 12980  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12981  	je	.LBB1_1069
 12982  	jmp	.LBB1_873
 12983  .LBB1_877:                              # entry: start at element index rdi = 0
 12984  	xor	edi, edi
 12985  .LBB1_878:                              # optional extra pass over two 16-byte vectors (16-bit subtract of xmm0)
 12986  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 12987  	je	.LBB1_880
 12988  # %bb.879:
 12989  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 12990  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 12991  	psubw	xmm1, xmm0
 12992  	psubw	xmm2, xmm0
 12993  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 12994  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 12995  .LBB1_880:
 12996  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 12997  	je	.LBB1_1069
 12998  	jmp	.LBB1_881
 12999  .LBB1_885:                              # entry: start at element index rdi = 0
 13000  	xor	edi, edi
 13001  .LBB1_886:                              # optional extra pass over two 16-byte vectors (16-bit subtract of xmm0)
 13002  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 13003  	je	.LBB1_888
 13004  # %bb.887:
 13005  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 13006  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 13007  	psubw	xmm1, xmm0
 13008  	psubw	xmm2, xmm0
 13009  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 13010  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 13011  .LBB1_888:
 13012  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 13013  	je	.LBB1_1069
 13014  	jmp	.LBB1_889
 13015  .LBB1_893:                              # entry: start at element index rdi = 0
 13016  	xor	edi, edi
 13017  .LBB1_894:                              # optional extra pass over two 16-byte vectors (packed-float multiply by xmm1)
 13018  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 13019  	je	.LBB1_896
 13020  # %bb.895:
 13021  	movups	xmm2, xmmword ptr [rdx + 4*rdi]
 13022  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 13023  	mulps	xmm2, xmm1
 13024  	mulps	xmm3, xmm1
 13025  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 13026  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 13027  .LBB1_896:
 13028  	cmp	rcx, rax                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 13029  	je	.LBB1_1069
 13030  	jmp	.LBB1_897
 13031  .LBB1_901:                              # entry: start at element index rdi = 0
 13032  	xor	edi, edi
 13033  .LBB1_902:                              # optional extra pass over two 16-byte vectors (packed-float multiply by xmm1)
 13034  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 13035  	je	.LBB1_904
 13036  # %bb.903:
 13037  	movups	xmm2, xmmword ptr [rdx + 4*rdi]
 13038  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 13039  	mulps	xmm2, xmm1
 13040  	mulps	xmm3, xmm1
 13041  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 13042  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 13043  .LBB1_904:
 13044  	cmp	rcx, rax                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 13045  	je	.LBB1_1069
 13046  	jmp	.LBB1_905
 13047  .LBB1_909:                              # entry: start at element index rdi = 0
 13048  	xor	edi, edi
 13049  .LBB1_910:                              # optional extra pass over two 16-byte vectors (64-bit add of xmm0)
 13050  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 13051  	je	.LBB1_912
 13052  # %bb.911:
 13053  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 13054  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 13055  	paddq	xmm1, xmm0
 13056  	paddq	xmm2, xmm0
 13057  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 13058  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 13059  .LBB1_912:
 13060  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 13061  	je	.LBB1_1069
 13062  	jmp	.LBB1_913
 13063  .LBB1_917:                              # entry: start at element index rdi = 0
 13064  	xor	edi, edi
 13065  .LBB1_918:                              # optional extra pass over two 16-byte vectors (packed-float add of xmm1)
 13066  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 13067  	je	.LBB1_920
 13068  # %bb.919:
 13069  	movups	xmm2, xmmword ptr [rdx + 4*rdi]
 13070  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 13071  	addps	xmm2, xmm1
 13072  	addps	xmm3, xmm1
 13073  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 13074  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 13075  .LBB1_920:
 13076  	cmp	rcx, rax                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 13077  	je	.LBB1_1069
 13078  	jmp	.LBB1_921
 13079  .LBB1_925:                              # entry: start at element index rdi = 0
 13080  	xor	edi, edi
 13081  .LBB1_926:                              # optional extra pass over two 16-byte vectors (64-bit subtract of xmm0)
 13082  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 13083  	je	.LBB1_928
 13084  # %bb.927:
 13085  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 13086  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 13087  	psubq	xmm1, xmm0
 13088  	psubq	xmm2, xmm0
 13089  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 13090  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 13091  .LBB1_928:
 13092  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 13093  	je	.LBB1_1069
 13094  	jmp	.LBB1_929
 13095  .LBB1_933:                              # entry: start at element index rdi = 0
 13096  	xor	edi, edi
 13097  .LBB1_934:                              # optional extra pass over two 16-byte vectors (packed-float subtract of xmm1)
 13098  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 13099  	je	.LBB1_936
 13100  # %bb.935:
 13101  	movups	xmm2, xmmword ptr [rdx + 4*rdi]
 13102  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 13103  	subps	xmm2, xmm1
 13104  	subps	xmm3, xmm1
 13105  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 13106  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 13107  .LBB1_936:
 13108  	cmp	rcx, rax                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 13109  	je	.LBB1_1069
 13110  	jmp	.LBB1_937
 13111  .LBB1_941:                              # entry: start at element index rdi = 0
 13112  	xor	edi, edi
 13113  .LBB1_942:                              # optional extra pass over two 16-byte vectors (64-bit add of xmm0)
 13114  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 13115  	je	.LBB1_944
 13116  # %bb.943:
 13117  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 13118  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 13119  	paddq	xmm1, xmm0
 13120  	paddq	xmm2, xmm0
 13121  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 13122  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 13123  .LBB1_944:
 13124  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 13125  	je	.LBB1_1069
 13126  	jmp	.LBB1_945
 13127  .LBB1_949:                              # entry: start at element index rdi = 0
 13128  	xor	edi, edi
 13129  .LBB1_950:                              # optional extra pass over two 16-byte vectors (packed-float add of xmm1)
 13130  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 13131  	je	.LBB1_952
 13132  # %bb.951:
 13133  	movups	xmm2, xmmword ptr [rdx + 4*rdi]
 13134  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 13135  	addps	xmm2, xmm1
 13136  	addps	xmm3, xmm1
 13137  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 13138  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 13139  .LBB1_952:
 13140  	cmp	rcx, rax                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 13141  	je	.LBB1_1069
 13142  	jmp	.LBB1_953
 13143  .LBB1_957:                              # entry: start at element index rdi = 0
 13144  	xor	edi, edi
 13145  .LBB1_958:                              # optional extra pass over two 16-byte vectors (64-bit subtract of xmm0)
 13146  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 13147  	je	.LBB1_960
 13148  # %bb.959:
 13149  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 13150  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 13151  	psubq	xmm1, xmm0
 13152  	psubq	xmm2, xmm0
 13153  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 13154  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 13155  .LBB1_960:
 13156  	cmp	rsi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 13157  	je	.LBB1_1069
 13158  	jmp	.LBB1_961
 13159  .LBB1_965:                              # entry: start at element index rdi = 0
 13160  	xor	edi, edi
 13161  .LBB1_966:                              # optional extra pass over two 16-byte vectors (packed-float subtract of xmm1)
 13162  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 13163  	je	.LBB1_968
 13164  # %bb.967:
 13165  	movups	xmm2, xmmword ptr [rdx + 4*rdi]
 13166  	movups	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 13167  	subps	xmm2, xmm1
 13168  	subps	xmm3, xmm1
 13169  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 13170  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 13171  .LBB1_968:
 13172  	cmp	rcx, rax                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 13173  	je	.LBB1_1069
 13174  	jmp	.LBB1_969
 13175  .LBB1_973:                              # entry: start at byte offset rax = 0
 13176  	xor	eax, eax
 13177  .LBB1_974:                              # optional extra pass over two 16-byte vectors (byte-wise multiply via 16-bit lanes)
 13178  	test	r9b, 1                          # low bit of r9 clear -> skip the extra pass
 13179  	je	.LBB1_976
 13180  # %bb.975:
 13181  	movdqu	xmm2, xmmword ptr [rdx + rax]
 13182  	movdqu	xmm3, xmmword ptr [rdx + rax + 16]
 13183  	movdqa	xmm4, xmm0
 13184  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 13185  	pmovzxbw	xmm5, xmm2                      # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 13186  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 13187  	pmullw	xmm2, xmm4                      # 16-bit products for the high 8 source bytes
 13188  	movdqa	xmm4, xmmword ptr [rip + .LCPI1_0] # xmm4 = [255,255,255,255,255,255,255,255]
 13189  	pand	xmm2, xmm4                      # keep only the low byte of each product
 13190  	pmullw	xmm5, xmm1                      # NOTE(review): xmm1 is set outside this view, presumably the widened low bytes of xmm0 - confirm
 13191  	pand	xmm5, xmm4
 13192  	packuswb	xmm5, xmm2                      # words are <= 255 after the mask, so the pack does not saturate
 13193  	punpckhbw	xmm0, xmm0              # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 13194  	pmovzxbw	xmm2, xmm3                      # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 13195  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 13196  	pmullw	xmm3, xmm0
 13197  	pand	xmm3, xmm4
 13198  	pmullw	xmm2, xmm1
 13199  	pand	xmm2, xmm4
 13200  	packuswb	xmm2, xmm3
 13201  	movdqu	xmmword ptr [r8 + rax], xmm5
 13202  	movdqu	xmmword ptr [r8 + rax + 16], xmm2
 13203  .LBB1_976:
 13204  	cmp	rdi, r10                        # NOTE(review): presumably vectorized count vs. total count, both set outside this view - confirm
 13205  	je	.LBB1_1069
 13206  	jmp	.LBB1_977
 13207  .LBB1_981:
 13208  	xor	eax, eax
 13209  .LBB1_982:
 13210  	test	r9b, 1
 13211  	je	.LBB1_984
 13212  # %bb.983:
 13213  	movdqu	xmm2, xmmword ptr [rdx + rax]
 13214  	movdqu	xmm3, xmmword ptr [rdx + rax + 16]
 13215  	movdqa	xmm4, xmm0
 13216  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 13217  	pmovzxbw	xmm5, xmm2                      # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 13218  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 13219  	pmullw	xmm2, xmm4
 13220  	movdqa	xmm4, xmmword ptr [rip + .LCPI1_0] # xmm4 = [255,255,255,255,255,255,255,255]
 13221  	pand	xmm2, xmm4
 13222  	pmullw	xmm5, xmm1
 13223  	pand	xmm5, xmm4
 13224  	packuswb	xmm5, xmm2
 13225  	punpckhbw	xmm0, xmm0              # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 13226  	pmovzxbw	xmm2, xmm3                      # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 13227  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 13228  	pmullw	xmm3, xmm0
 13229  	pand	xmm3, xmm4
 13230  	pmullw	xmm2, xmm1
 13231  	pand	xmm2, xmm4
 13232  	packuswb	xmm2, xmm3
 13233  	movdqu	xmmword ptr [r8 + rax], xmm5
 13234  	movdqu	xmmword ptr [r8 + rax + 16], xmm2
 13235  .LBB1_984:
 13236  	cmp	rdi, r10
 13237  	je	.LBB1_1069
 13238  	jmp	.LBB1_985
 13239  .LBB1_989:
 13240  	xor	edi, edi
 13241  .LBB1_990:
 13242  	test	r9b, 1
 13243  	je	.LBB1_992
 13244  # %bb.991:
 13245  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 13246  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 13247  	paddb	xmm1, xmm0
 13248  	paddb	xmm2, xmm0
 13249  	movdqu	xmmword ptr [r8 + rdi], xmm1
 13250  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 13251  .LBB1_992:
 13252  	cmp	rsi, r10
 13253  	je	.LBB1_1069
 13254  	jmp	.LBB1_993
 13255  .LBB1_997:
 13256  	xor	edi, edi
 13257  .LBB1_998:
 13258  	test	r9b, 1
 13259  	je	.LBB1_1000
 13260  # %bb.999:
 13261  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 13262  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 13263  	psubb	xmm1, xmm0
 13264  	psubb	xmm2, xmm0
 13265  	movdqu	xmmword ptr [r8 + rdi], xmm1
 13266  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 13267  .LBB1_1000:
 13268  	cmp	rsi, r10
 13269  	je	.LBB1_1069
 13270  	jmp	.LBB1_1001
 13271  .LBB1_1005:
 13272  	xor	edi, edi
 13273  .LBB1_1006:
 13274  	test	r9b, 1
 13275  	je	.LBB1_1008
 13276  # %bb.1007:
 13277  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 13278  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 13279  	paddb	xmm1, xmm0
 13280  	paddb	xmm2, xmm0
 13281  	movdqu	xmmword ptr [r8 + rdi], xmm1
 13282  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 13283  .LBB1_1008:
 13284  	cmp	rsi, r10
 13285  	je	.LBB1_1069
 13286  	jmp	.LBB1_1009
 13287  .LBB1_1013:
 13288  	xor	edi, edi
 13289  .LBB1_1014:
 13290  	test	r9b, 1
 13291  	je	.LBB1_1016
 13292  # %bb.1015:
 13293  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 13294  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 13295  	psubb	xmm1, xmm0
 13296  	psubb	xmm2, xmm0
 13297  	movdqu	xmmword ptr [r8 + rdi], xmm1
 13298  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 13299  .LBB1_1016:
 13300  	cmp	rsi, r10
 13301  	je	.LBB1_1069
 13302  	jmp	.LBB1_1017
 13303  .LBB1_1021:
 13304  	xor	edi, edi
 13305  .LBB1_1022:
 13306  	test	r9b, 1
 13307  	je	.LBB1_1024
 13308  # %bb.1023:
 13309  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 13310  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 13311  	pmulld	xmm1, xmm0
 13312  	pmulld	xmm2, xmm0
 13313  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 13314  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 13315  .LBB1_1024:
 13316  	cmp	rsi, r10
 13317  	je	.LBB1_1069
 13318  	jmp	.LBB1_1025
 13319  .LBB1_1029:
 13320  	xor	edi, edi
 13321  .LBB1_1030:
 13322  	test	r9b, 1
 13323  	je	.LBB1_1032
 13324  # %bb.1031:
 13325  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 13326  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 13327  	pmulld	xmm1, xmm0
 13328  	pmulld	xmm2, xmm0
 13329  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 13330  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 13331  .LBB1_1032:
 13332  	cmp	rsi, r10
 13333  	je	.LBB1_1069
 13334  	jmp	.LBB1_1033
 13335  .LBB1_1037:
 13336  	xor	edi, edi
 13337  .LBB1_1038:
 13338  	test	r9b, 1
 13339  	je	.LBB1_1040
 13340  # %bb.1039:
 13341  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 13342  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 13343  	paddd	xmm1, xmm0
 13344  	paddd	xmm2, xmm0
 13345  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 13346  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 13347  .LBB1_1040:
 13348  	cmp	rsi, r10
 13349  	je	.LBB1_1069
 13350  	jmp	.LBB1_1041
 13351  .LBB1_1045:
 13352  	xor	edi, edi
 13353  .LBB1_1046:
 13354  	test	r9b, 1
 13355  	je	.LBB1_1048
 13356  # %bb.1047:
 13357  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 13358  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 13359  	psubd	xmm1, xmm0
 13360  	psubd	xmm2, xmm0
 13361  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 13362  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 13363  .LBB1_1048:
 13364  	cmp	rsi, r10
 13365  	je	.LBB1_1069
 13366  	jmp	.LBB1_1049
 13367  .LBB1_1053:
 13368  	xor	edi, edi
 13369  .LBB1_1054:
 13370  	test	r9b, 1
 13371  	je	.LBB1_1056
 13372  # %bb.1055:
 13373  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 13374  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 13375  	paddd	xmm1, xmm0
 13376  	paddd	xmm2, xmm0
 13377  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 13378  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 13379  .LBB1_1056:
 13380  	cmp	rsi, r10
 13381  	je	.LBB1_1069
 13382  	jmp	.LBB1_1057
 13383  .LBB1_1061:
 13384  	xor	edi, edi
 13385  .LBB1_1062:
 13386  	test	r9b, 1
 13387  	je	.LBB1_1064
 13388  # %bb.1063:
 13389  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 13390  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 13391  	psubd	xmm1, xmm0
 13392  	psubd	xmm2, xmm0
 13393  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 13394  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 13395  .LBB1_1064:
 13396  	cmp	rsi, r10
 13397  	je	.LBB1_1069
 13398  	jmp	.LBB1_1065
 13399  .Lfunc_end1:
 13400  	.size	arithmetic_arr_scalar_sse4, .Lfunc_end1-arithmetic_arr_scalar_sse4
 13401                                          # -- End function
 13402  	.section	.rodata.cst16,"aM",@progbits,16 # read-only, mergeable section of 16-byte constants
 13403  	.p2align	4                               # -- Begin function arithmetic_scalar_arr_sse4 (constant is 16-byte aligned)
 13404  .LCPI2_0:                               # one 128-bit constant: eight 16-bit lanes, each 0x00ff. NOTE(review): presumably the low-byte mask for the 8-bit multiply path, as .LCPI1_0 is used (pand after pmullw) in arithmetic_arr_scalar_sse4; its use is not visible in this chunk
 13405  	.short	255                             # 0xff
 13406  	.short	255                             # 0xff
 13407  	.short	255                             # 0xff
 13408  	.short	255                             # 0xff
 13409  	.short	255                             # 0xff
 13410  	.short	255                             # 0xff
 13411  	.short	255                             # 0xff
 13412  	.short	255                             # 0xff
 13413  	.text
 13414  	.globl	arithmetic_scalar_arr_sse4
 13415  	.p2align	4, 0x90                         # align the entry point to 16 bytes, padding with 0x90
 13416  	.type	arithmetic_scalar_arr_sse4,@function
 13417  arithmetic_scalar_arr_sse4:             # @arithmetic_scalar_arr_sse4
 13418  # %bb.0: entry. As used below: sil and edi select the code path, rdx points at one scalar value, rcx is the element source, r8 the element destination, r9d the element count. NOTE(review): this matches System V argument order; confirm names and types against the C prototype.
 13419  	push	rbp
 13420  	mov	rbp, rsp
 13421  	and	rsp, -8                         # round rsp down to a multiple of 8
 13422  	cmp	sil, 20
 13423  	jg	.LBB2_12                        # signed compare: sil above 20 is dispatched at .LBB2_12
 13424  # %bb.1:
 13425  	test	sil, sil
 13426  	je	.LBB2_23                        # sil == 0
 13427  # %bb.2:
 13428  	cmp	sil, 1
 13429  	je	.LBB2_31                        # sil == 1
 13430  # %bb.3:
 13431  	cmp	sil, 2
 13432  	jne	.LBB2_1069                      # any other value: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 13433  # %bb.4: sil == 2: dispatch on edi
 13434  	cmp	edi, 6
 13435  	jg	.LBB2_55                        # edi above 6
 13436  # %bb.5:
 13437  	cmp	edi, 3
 13438  	jle	.LBB2_97                        # edi at most 3
 13439  # %bb.6:
 13440  	cmp	edi, 4
 13441  	je	.LBB2_157                       # target not in this chunk
 13442  # %bb.7:
 13443  	cmp	edi, 5
 13444  	je	.LBB2_160                       # target not in this chunk
 13445  # %bb.8:
 13446  	cmp	edi, 6
 13447  	jne	.LBB2_1069
 13448  # %bb.9: sil == 2, edi == 6: 32-bit elements, dst[i] = src[i] * scalar
 13449  	test	r9d, r9d
 13450  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 13451  # %bb.10:
 13452  	mov	eax, dword ptr [rdx]            # eax = 32-bit scalar
 13453  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
 13454  	cmp	r9d, 8
 13455  	jb	.LBB2_11                        # fewer than 8 elements: use the scalar loop directly
 13456  # %bb.265: source/destination overlap test
 13457  	lea	rdx, [rcx + 4*r10]              # rdx = one past the last source element
 13458  	cmp	rdx, r8
 13459  	jbe	.LBB2_453                       # source ends at or before destination start: no overlap; .LBB2_453 is outside this chunk (presumably the SIMD path)
 13460  # %bb.266:
 13461  	lea	rdx, [r8 + 4*r10]               # rdx = one past the last destination element
 13462  	cmp	rdx, rcx
 13463  	jbe	.LBB2_453                       # destination ends at or before source start: no overlap
 13464  .LBB2_11:                               # short or overlapping input: scalar loop from index 0
 13465  	xor	esi, esi                        # rsi = element index
 13466  .LBB2_625:                              # scalar loop setup, starting at index rsi
 13467  	mov	r9, rsi
 13468  	not	r9
 13469  	add	r9, r10                         # r9 = r10 - rsi - 1 (pending elements minus one)
 13470  	mov	rdi, r10
 13471  	and	rdi, 3                          # rdi = count mod 4 = iterations of the one-element loop
 13472  	je	.LBB2_627
 13473  .LBB2_626:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 13474  	mov	edx, dword ptr [rcx + 4*rsi]
 13475  	imul	edx, eax                        # low 32 bits of src[i] * scalar
 13476  	mov	dword ptr [r8 + 4*rsi], edx
 13477  	add	rsi, 1
 13478  	add	rdi, -1
 13479  	jne	.LBB2_626
 13480  .LBB2_627:
 13481  	cmp	r9, 3
 13482  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_625: done
 13483  .LBB2_628:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 13484  	mov	edx, dword ptr [rcx + 4*rsi]
 13485  	imul	edx, eax
 13486  	mov	dword ptr [r8 + 4*rsi], edx
 13487  	mov	edx, dword ptr [rcx + 4*rsi + 4]
 13488  	imul	edx, eax
 13489  	mov	dword ptr [r8 + 4*rsi + 4], edx
 13490  	mov	edx, dword ptr [rcx + 4*rsi + 8]
 13491  	imul	edx, eax
 13492  	mov	dword ptr [r8 + 4*rsi + 8], edx
 13493  	mov	edx, dword ptr [rcx + 4*rsi + 12]
 13494  	imul	edx, eax
 13495  	mov	dword ptr [r8 + 4*rsi + 12], edx
 13496  	add	rsi, 4
 13497  	cmp	r10, rsi
 13498  	jne	.LBB2_628                       # loop until the index reaches the count
 13499  	jmp	.LBB2_1069
 13500  .LBB2_12:                               # sil above 20: dispatch on sil values 21, 22, 23
 13501  	cmp	sil, 21
 13502  	je	.LBB2_39                        # sil == 21
 13503  # %bb.13:
 13504  	cmp	sil, 22
 13505  	je	.LBB2_47                        # sil == 22
 13506  # %bb.14:
 13507  	cmp	sil, 23
 13508  	jne	.LBB2_1069                      # any other value: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 13509  # %bb.15: sil == 23: dispatch on edi
 13510  	cmp	edi, 6
 13511  	jg	.LBB2_62                        # edi above 6
 13512  # %bb.16:
 13513  	cmp	edi, 3
 13514  	jle	.LBB2_102                       # edi at most 3
 13515  # %bb.17:
 13516  	cmp	edi, 4
 13517  	je	.LBB2_163                       # target not in this chunk
 13518  # %bb.18:
 13519  	cmp	edi, 5
 13520  	je	.LBB2_166                       # target not in this chunk
 13521  # %bb.19:
 13522  	cmp	edi, 6
 13523  	jne	.LBB2_1069
 13524  # %bb.20: sil == 23, edi == 6: 32-bit elements, dst[i] = src[i] * scalar (same instruction sequence as the sil == 2 case)
 13525  	test	r9d, r9d
 13526  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 13527  # %bb.21:
 13528  	mov	eax, dword ptr [rdx]            # eax = 32-bit scalar
 13529  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
 13530  	cmp	r9d, 8
 13531  	jb	.LBB2_22                        # fewer than 8 elements: use the scalar loop directly
 13532  # %bb.268: source/destination overlap test
 13533  	lea	rdx, [rcx + 4*r10]              # rdx = one past the last source element
 13534  	cmp	rdx, r8
 13535  	jbe	.LBB2_456                       # no overlap; .LBB2_456 is outside this chunk (presumably the SIMD path)
 13536  # %bb.269:
 13537  	lea	rdx, [r8 + 4*r10]               # rdx = one past the last destination element
 13538  	cmp	rdx, rcx
 13539  	jbe	.LBB2_456                       # no overlap
 13540  .LBB2_22:                               # short or overlapping input: scalar loop from index 0
 13541  	xor	esi, esi                        # rsi = element index
 13542  .LBB2_633:                              # scalar loop setup, starting at index rsi
 13543  	mov	r9, rsi
 13544  	not	r9
 13545  	add	r9, r10                         # r9 = r10 - rsi - 1 (pending elements minus one)
 13546  	mov	rdi, r10
 13547  	and	rdi, 3                          # rdi = count mod 4 = iterations of the one-element loop
 13548  	je	.LBB2_635
 13549  .LBB2_634:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 13550  	mov	edx, dword ptr [rcx + 4*rsi]
 13551  	imul	edx, eax                        # low 32 bits of src[i] * scalar
 13552  	mov	dword ptr [r8 + 4*rsi], edx
 13553  	add	rsi, 1
 13554  	add	rdi, -1
 13555  	jne	.LBB2_634
 13556  .LBB2_635:
 13557  	cmp	r9, 3
 13558  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_633: done
 13559  .LBB2_636:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 13560  	mov	edx, dword ptr [rcx + 4*rsi]
 13561  	imul	edx, eax
 13562  	mov	dword ptr [r8 + 4*rsi], edx
 13563  	mov	edx, dword ptr [rcx + 4*rsi + 4]
 13564  	imul	edx, eax
 13565  	mov	dword ptr [r8 + 4*rsi + 4], edx
 13566  	mov	edx, dword ptr [rcx + 4*rsi + 8]
 13567  	imul	edx, eax
 13568  	mov	dword ptr [r8 + 4*rsi + 8], edx
 13569  	mov	edx, dword ptr [rcx + 4*rsi + 12]
 13570  	imul	edx, eax
 13571  	mov	dword ptr [r8 + 4*rsi + 12], edx
 13572  	add	rsi, 4
 13573  	cmp	r10, rsi
 13574  	jne	.LBB2_636                       # loop until the index reaches the count
 13575  	jmp	.LBB2_1069
 13576  .LBB2_23:                               # sil == 0: dispatch on edi
 13577  	cmp	edi, 6
 13578  	jg	.LBB2_69                        # edi above 6
 13579  # %bb.24:
 13580  	cmp	edi, 3
 13581  	jle	.LBB2_107                       # edi at most 3; target not in this chunk
 13582  # %bb.25:
 13583  	cmp	edi, 4
 13584  	je	.LBB2_169                       # target not in this chunk
 13585  # %bb.26:
 13586  	cmp	edi, 5
 13587  	je	.LBB2_172                       # target not in this chunk
 13588  # %bb.27:
 13589  	cmp	edi, 6
 13590  	jne	.LBB2_1069                      # unhandled edi: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 13591  # %bb.28: sil == 0, edi == 6: 32-bit elements, dst[i] = src[i] + scalar
 13592  	test	r9d, r9d
 13593  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 13594  # %bb.29:
 13595  	mov	eax, dword ptr [rdx]            # eax = 32-bit scalar
 13596  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
 13597  	cmp	r9d, 8
 13598  	jb	.LBB2_30                        # fewer than 8 elements: use the scalar loop directly
 13599  # %bb.271: source/destination overlap test
 13600  	lea	rdx, [rcx + 4*r10]              # rdx = one past the last source element
 13601  	cmp	rdx, r8
 13602  	jbe	.LBB2_459                       # no overlap; .LBB2_459 is outside this chunk (presumably the SIMD path)
 13603  # %bb.272:
 13604  	lea	rdx, [r8 + 4*r10]               # rdx = one past the last destination element
 13605  	cmp	rdx, rcx
 13606  	jbe	.LBB2_459                       # no overlap
 13607  .LBB2_30:                               # short or overlapping input: scalar loop from index 0
 13608  	xor	esi, esi                        # rsi = element index
 13609  .LBB2_641:                              # scalar loop setup, starting at index rsi
 13610  	mov	r9, rsi
 13611  	not	r9
 13612  	add	r9, r10                         # r9 = r10 - rsi - 1 (pending elements minus one)
 13613  	mov	rdi, r10
 13614  	and	rdi, 3                          # rdi = count mod 4 = iterations of the one-element loop
 13615  	je	.LBB2_643
 13616  .LBB2_642:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 13617  	mov	edx, dword ptr [rcx + 4*rsi]
 13618  	add	edx, eax                        # src[i] + scalar, wrapping at 32 bits
 13619  	mov	dword ptr [r8 + 4*rsi], edx
 13620  	add	rsi, 1
 13621  	add	rdi, -1
 13622  	jne	.LBB2_642
 13623  .LBB2_643:
 13624  	cmp	r9, 3
 13625  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_641: done
 13626  .LBB2_644:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 13627  	mov	edx, dword ptr [rcx + 4*rsi]
 13628  	add	edx, eax
 13629  	mov	dword ptr [r8 + 4*rsi], edx
 13630  	mov	edx, dword ptr [rcx + 4*rsi + 4]
 13631  	add	edx, eax
 13632  	mov	dword ptr [r8 + 4*rsi + 4], edx
 13633  	mov	edx, dword ptr [rcx + 4*rsi + 8]
 13634  	add	edx, eax
 13635  	mov	dword ptr [r8 + 4*rsi + 8], edx
 13636  	mov	edx, dword ptr [rcx + 4*rsi + 12]
 13637  	add	edx, eax
 13638  	mov	dword ptr [r8 + 4*rsi + 12], edx
 13639  	add	rsi, 4
 13640  	cmp	r10, rsi
 13641  	jne	.LBB2_644                       # loop until the index reaches the count
 13642  	jmp	.LBB2_1069
 13643  .LBB2_31:                               # sil == 1: dispatch on edi
 13644  	cmp	edi, 6
 13645  	jg	.LBB2_76                        # edi above 6
 13646  # %bb.32:
 13647  	cmp	edi, 3
 13648  	jle	.LBB2_112                       # edi at most 3; target not in this chunk
 13649  # %bb.33:
 13650  	cmp	edi, 4
 13651  	je	.LBB2_175                       # target not in this chunk
 13652  # %bb.34:
 13653  	cmp	edi, 5
 13654  	je	.LBB2_178                       # target not in this chunk
 13655  # %bb.35:
 13656  	cmp	edi, 6
 13657  	jne	.LBB2_1069                      # unhandled edi: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 13658  # %bb.36: sil == 1, edi == 6: 32-bit elements, dst[i] = scalar - src[i]
 13659  	test	r9d, r9d
 13660  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 13661  # %bb.37:
 13662  	mov	r11d, dword ptr [rdx]           # r11d = 32-bit scalar (the minuend)
 13663  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
 13664  	cmp	r9d, 8
 13665  	jb	.LBB2_38                        # fewer than 8 elements: use the scalar loop directly
 13666  # %bb.274: source/destination overlap test
 13667  	lea	rdx, [rcx + 4*r10]              # rdx = one past the last source element
 13668  	cmp	rdx, r8
 13669  	jbe	.LBB2_462                       # no overlap; .LBB2_462 is outside this chunk (presumably the SIMD path)
 13670  # %bb.275:
 13671  	lea	rdx, [r8 + 4*r10]               # rdx = one past the last destination element
 13672  	cmp	rdx, rcx
 13673  	jbe	.LBB2_462                       # no overlap
 13674  .LBB2_38:                               # short or overlapping input: scalar loop from index 0
 13675  	xor	esi, esi                        # rsi = element index
 13676  .LBB2_649:                              # scalar loop setup, starting at index rsi
 13677  	mov	rdx, rsi
 13678  	not	rdx
 13679  	add	rdx, r10                        # rdx = r10 - rsi - 1 (pending elements minus one)
 13680  	mov	rdi, r10
 13681  	and	rdi, 3                          # rdi = count mod 4 = iterations of the one-element loop
 13682  	je	.LBB2_651
 13683  .LBB2_650:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 13684  	mov	eax, r11d                       # fresh copy of the scalar, since sub overwrites its destination
 13685  	sub	eax, dword ptr [rcx + 4*rsi]    # scalar - src[i], wrapping at 32 bits
 13686  	mov	dword ptr [r8 + 4*rsi], eax
 13687  	add	rsi, 1
 13688  	add	rdi, -1
 13689  	jne	.LBB2_650
 13690  .LBB2_651:
 13691  	cmp	rdx, 3
 13692  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_649: done
 13693  .LBB2_652:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 13694  	mov	eax, r11d
 13695  	sub	eax, dword ptr [rcx + 4*rsi]
 13696  	mov	dword ptr [r8 + 4*rsi], eax
 13697  	mov	eax, r11d
 13698  	sub	eax, dword ptr [rcx + 4*rsi + 4]
 13699  	mov	dword ptr [r8 + 4*rsi + 4], eax
 13700  	mov	eax, r11d
 13701  	sub	eax, dword ptr [rcx + 4*rsi + 8]
 13702  	mov	dword ptr [r8 + 4*rsi + 8], eax
 13703  	mov	eax, r11d
 13704  	sub	eax, dword ptr [rcx + 4*rsi + 12]
 13705  	mov	dword ptr [r8 + 4*rsi + 12], eax
 13706  	add	rsi, 4
 13707  	cmp	r10, rsi
 13708  	jne	.LBB2_652                       # loop until the index reaches the count
 13709  	jmp	.LBB2_1069
 13710  .LBB2_39:                               # sil == 21: dispatch on edi
 13711  	cmp	edi, 6
 13712  	jg	.LBB2_83                        # edi above 6
 13713  # %bb.40:
 13714  	cmp	edi, 3
 13715  	jle	.LBB2_117                       # edi at most 3; target not in this chunk
 13716  # %bb.41:
 13717  	cmp	edi, 4
 13718  	je	.LBB2_181                       # target not in this chunk
 13719  # %bb.42:
 13720  	cmp	edi, 5
 13721  	je	.LBB2_184                       # target not in this chunk
 13722  # %bb.43:
 13723  	cmp	edi, 6
 13724  	jne	.LBB2_1069                      # unhandled edi: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 13725  # %bb.44: sil == 21, edi == 6: 32-bit elements, dst[i] = src[i] + scalar (same instruction sequence as the sil == 0 case)
 13726  	test	r9d, r9d
 13727  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 13728  # %bb.45:
 13729  	mov	eax, dword ptr [rdx]            # eax = 32-bit scalar
 13730  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
 13731  	cmp	r9d, 8
 13732  	jb	.LBB2_46                        # fewer than 8 elements: use the scalar loop directly
 13733  # %bb.277: source/destination overlap test
 13734  	lea	rdx, [rcx + 4*r10]              # rdx = one past the last source element
 13735  	cmp	rdx, r8
 13736  	jbe	.LBB2_465                       # no overlap; .LBB2_465 is outside this chunk (presumably the SIMD path)
 13737  # %bb.278:
 13738  	lea	rdx, [r8 + 4*r10]               # rdx = one past the last destination element
 13739  	cmp	rdx, rcx
 13740  	jbe	.LBB2_465                       # no overlap
 13741  .LBB2_46:                               # short or overlapping input: scalar loop from index 0
 13742  	xor	esi, esi                        # rsi = element index
 13743  .LBB2_657:                              # scalar loop setup, starting at index rsi
 13744  	mov	r9, rsi
 13745  	not	r9
 13746  	add	r9, r10                         # r9 = r10 - rsi - 1 (pending elements minus one)
 13747  	mov	rdi, r10
 13748  	and	rdi, 3                          # rdi = count mod 4 = iterations of the one-element loop
 13749  	je	.LBB2_659
 13750  .LBB2_658:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 13751  	mov	edx, dword ptr [rcx + 4*rsi]
 13752  	add	edx, eax                        # src[i] + scalar, wrapping at 32 bits
 13753  	mov	dword ptr [r8 + 4*rsi], edx
 13754  	add	rsi, 1
 13755  	add	rdi, -1
 13756  	jne	.LBB2_658
 13757  .LBB2_659:
 13758  	cmp	r9, 3
 13759  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_657: done
 13760  .LBB2_660:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 13761  	mov	edx, dword ptr [rcx + 4*rsi]
 13762  	add	edx, eax
 13763  	mov	dword ptr [r8 + 4*rsi], edx
 13764  	mov	edx, dword ptr [rcx + 4*rsi + 4]
 13765  	add	edx, eax
 13766  	mov	dword ptr [r8 + 4*rsi + 4], edx
 13767  	mov	edx, dword ptr [rcx + 4*rsi + 8]
 13768  	add	edx, eax
 13769  	mov	dword ptr [r8 + 4*rsi + 8], edx
 13770  	mov	edx, dword ptr [rcx + 4*rsi + 12]
 13771  	add	edx, eax
 13772  	mov	dword ptr [r8 + 4*rsi + 12], edx
 13773  	add	rsi, 4
 13774  	cmp	r10, rsi
 13775  	jne	.LBB2_660                       # loop until the index reaches the count
 13776  	jmp	.LBB2_1069
 13777  .LBB2_47:                               # sil == 22: dispatch on edi
 13778  	cmp	edi, 6
 13779  	jg	.LBB2_90                        # edi above 6
 13780  # %bb.48:
 13781  	cmp	edi, 3
 13782  	jle	.LBB2_122                       # edi at most 3; target not in this chunk
 13783  # %bb.49:
 13784  	cmp	edi, 4
 13785  	je	.LBB2_187                       # target not in this chunk
 13786  # %bb.50:
 13787  	cmp	edi, 5
 13788  	je	.LBB2_190                       # target not in this chunk
 13789  # %bb.51:
 13790  	cmp	edi, 6
 13791  	jne	.LBB2_1069                      # unhandled edi: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 13792  # %bb.52: sil == 22, edi == 6: 32-bit elements, dst[i] = scalar - src[i] (same instruction sequence as the sil == 1 case)
 13793  	test	r9d, r9d
 13794  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 13795  # %bb.53:
 13796  	mov	r11d, dword ptr [rdx]           # r11d = 32-bit scalar (the minuend)
 13797  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
 13798  	cmp	r9d, 8
 13799  	jb	.LBB2_54                        # fewer than 8 elements: use the scalar loop directly
 13800  # %bb.280: source/destination overlap test
 13801  	lea	rdx, [rcx + 4*r10]              # rdx = one past the last source element
 13802  	cmp	rdx, r8
 13803  	jbe	.LBB2_468                       # no overlap; .LBB2_468 is outside this chunk (presumably the SIMD path)
 13804  # %bb.281:
 13805  	lea	rdx, [r8 + 4*r10]               # rdx = one past the last destination element
 13806  	cmp	rdx, rcx
 13807  	jbe	.LBB2_468                       # no overlap
 13808  .LBB2_54:                               # short or overlapping input: scalar loop from index 0
 13809  	xor	esi, esi                        # rsi = element index
 13810  .LBB2_665:                              # scalar loop setup, starting at index rsi
 13811  	mov	rdx, rsi
 13812  	not	rdx
 13813  	add	rdx, r10                        # rdx = r10 - rsi - 1 (pending elements minus one)
 13814  	mov	rdi, r10
 13815  	and	rdi, 3                          # rdi = count mod 4 = iterations of the one-element loop
 13816  	je	.LBB2_667
 13817  .LBB2_666:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 13818  	mov	eax, r11d                       # fresh copy of the scalar, since sub overwrites its destination
 13819  	sub	eax, dword ptr [rcx + 4*rsi]    # scalar - src[i], wrapping at 32 bits
 13820  	mov	dword ptr [r8 + 4*rsi], eax
 13821  	add	rsi, 1
 13822  	add	rdi, -1
 13823  	jne	.LBB2_666
 13824  .LBB2_667:
 13825  	cmp	rdx, 3
 13826  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_665: done
 13827  .LBB2_668:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 13828  	mov	eax, r11d
 13829  	sub	eax, dword ptr [rcx + 4*rsi]
 13830  	mov	dword ptr [r8 + 4*rsi], eax
 13831  	mov	eax, r11d
 13832  	sub	eax, dword ptr [rcx + 4*rsi + 4]
 13833  	mov	dword ptr [r8 + 4*rsi + 4], eax
 13834  	mov	eax, r11d
 13835  	sub	eax, dword ptr [rcx + 4*rsi + 8]
 13836  	mov	dword ptr [r8 + 4*rsi + 8], eax
 13837  	mov	eax, r11d
 13838  	sub	eax, dword ptr [rcx + 4*rsi + 12]
 13839  	mov	dword ptr [r8 + 4*rsi + 12], eax
 13840  	add	rsi, 4
 13841  	cmp	r10, rsi
 13842  	jne	.LBB2_668                       # loop until the index reaches the count
 13843  	jmp	.LBB2_1069
 13844  .LBB2_55:                               # sil == 2, edi above 6: dispatch on edi
 13845  	cmp	edi, 8
 13846  	jle	.LBB2_127                       # edi is 7 or 8; target not in this chunk
 13847  # %bb.56:
 13848  	cmp	edi, 9
 13849  	je	.LBB2_193                       # target not in this chunk
 13850  # %bb.57:
 13851  	cmp	edi, 11
 13852  	je	.LBB2_196                       # target not in this chunk
 13853  # %bb.58:
 13854  	cmp	edi, 12
 13855  	jne	.LBB2_1069                      # unhandled edi: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 13856  # %bb.59: sil == 2, edi == 12: double-precision elements, dst[i] = src[i] * scalar
 13857  	test	r9d, r9d
 13858  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 13859  # %bb.60:
 13860  	movsd	xmm0, qword ptr [rdx]           # xmm0 = mem[0],zero (the double scalar)
 13861  	mov	eax, r9d                        # rax = count, zero-extended to 64 bits
 13862  	cmp	r9d, 4
 13863  	jb	.LBB2_61                        # fewer than 4 elements: use the scalar loop directly
 13864  # %bb.283: source/destination overlap test
 13865  	lea	rdx, [rcx + 8*rax]              # rdx = one past the last source element
 13866  	cmp	rdx, r8
 13867  	jbe	.LBB2_471                       # no overlap; .LBB2_471 is outside this chunk (presumably the SIMD path)
 13868  # %bb.284:
 13869  	lea	rdx, [r8 + 8*rax]               # rdx = one past the last destination element
 13870  	cmp	rdx, rcx
 13871  	jbe	.LBB2_471                       # no overlap
 13872  .LBB2_61:                               # short or overlapping input: scalar loop from index 0
 13873  	xor	edx, edx                        # rdx = element index
 13874  .LBB2_673:                              # scalar loop setup, starting at index rdx
 13875  	mov	rsi, rdx
 13876  	not	rsi
 13877  	add	rsi, rax                        # rsi = rax - rdx - 1 (pending elements minus one)
 13878  	mov	rdi, rax
 13879  	and	rdi, 3                          # rdi = count mod 4 = iterations of the one-element loop
 13880  	je	.LBB2_675
 13881  .LBB2_674:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 13882  	movsd	xmm1, qword ptr [rcx + 8*rdx]   # xmm1 = mem[0],zero
 13883  	mulsd	xmm1, xmm0                      # src[i] * scalar
 13884  	movsd	qword ptr [r8 + 8*rdx], xmm1
 13885  	add	rdx, 1
 13886  	add	rdi, -1
 13887  	jne	.LBB2_674
 13888  .LBB2_675:
 13889  	cmp	rsi, 3
 13890  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_673: done
 13891  .LBB2_676:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 13892  	movsd	xmm1, qword ptr [rcx + 8*rdx]   # xmm1 = mem[0],zero
 13893  	mulsd	xmm1, xmm0
 13894  	movsd	qword ptr [r8 + 8*rdx], xmm1
 13895  	movsd	xmm1, qword ptr [rcx + 8*rdx + 8] # xmm1 = mem[0],zero
 13896  	mulsd	xmm1, xmm0
 13897  	movsd	qword ptr [r8 + 8*rdx + 8], xmm1
 13898  	movsd	xmm1, qword ptr [rcx + 8*rdx + 16] # xmm1 = mem[0],zero
 13899  	mulsd	xmm1, xmm0
 13900  	movsd	qword ptr [r8 + 8*rdx + 16], xmm1
 13901  	movsd	xmm1, qword ptr [rcx + 8*rdx + 24] # xmm1 = mem[0],zero
 13902  	mulsd	xmm1, xmm0
 13903  	movsd	qword ptr [r8 + 8*rdx + 24], xmm1
 13904  	add	rdx, 4
 13905  	cmp	rax, rdx
 13906  	jne	.LBB2_676                       # loop until the index reaches the count
 13907  	jmp	.LBB2_1069
 13908  .LBB2_62:                               # sil == 23, edi above 6: dispatch on edi
 13909  	cmp	edi, 8
 13910  	jle	.LBB2_132                       # edi is 7 or 8; target not in this chunk
 13911  # %bb.63:
 13912  	cmp	edi, 9
 13913  	je	.LBB2_199                       # target not in this chunk
 13914  # %bb.64:
 13915  	cmp	edi, 11
 13916  	je	.LBB2_202                       # target not in this chunk
 13917  # %bb.65:
 13918  	cmp	edi, 12
 13919  	jne	.LBB2_1069                      # unhandled edi: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 13920  # %bb.66: sil == 23, edi == 12: double-precision elements, dst[i] = src[i] * scalar (same instruction sequence as the sil == 2 case)
 13921  	test	r9d, r9d
 13922  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 13923  # %bb.67:
 13924  	movsd	xmm0, qword ptr [rdx]           # xmm0 = mem[0],zero (the double scalar)
 13925  	mov	eax, r9d                        # rax = count, zero-extended to 64 bits
 13926  	cmp	r9d, 4
 13927  	jb	.LBB2_68                        # fewer than 4 elements: use the scalar loop directly
 13928  # %bb.286: source/destination overlap test
 13929  	lea	rdx, [rcx + 8*rax]              # rdx = one past the last source element
 13930  	cmp	rdx, r8
 13931  	jbe	.LBB2_474                       # no overlap; .LBB2_474 is outside this chunk (presumably the SIMD path)
 13932  # %bb.287:
 13933  	lea	rdx, [r8 + 8*rax]               # rdx = one past the last destination element
 13934  	cmp	rdx, rcx
 13935  	jbe	.LBB2_474                       # no overlap
 13936  .LBB2_68:                               # short or overlapping input: scalar loop from index 0
 13937  	xor	edx, edx                        # rdx = element index
 13938  .LBB2_681:                              # scalar loop setup, starting at index rdx
 13939  	mov	rsi, rdx
 13940  	not	rsi
 13941  	add	rsi, rax                        # rsi = rax - rdx - 1 (pending elements minus one)
 13942  	mov	rdi, rax
 13943  	and	rdi, 3                          # rdi = count mod 4 = iterations of the one-element loop
 13944  	je	.LBB2_683
 13945  .LBB2_682:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 13946  	movsd	xmm1, qword ptr [rcx + 8*rdx]   # xmm1 = mem[0],zero
 13947  	mulsd	xmm1, xmm0                      # src[i] * scalar
 13948  	movsd	qword ptr [r8 + 8*rdx], xmm1
 13949  	add	rdx, 1
 13950  	add	rdi, -1
 13951  	jne	.LBB2_682
 13952  .LBB2_683:
 13953  	cmp	rsi, 3
 13954  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_681: done
 13955  .LBB2_684:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 13956  	movsd	xmm1, qword ptr [rcx + 8*rdx]   # xmm1 = mem[0],zero
 13957  	mulsd	xmm1, xmm0
 13958  	movsd	qword ptr [r8 + 8*rdx], xmm1
 13959  	movsd	xmm1, qword ptr [rcx + 8*rdx + 8] # xmm1 = mem[0],zero
 13960  	mulsd	xmm1, xmm0
 13961  	movsd	qword ptr [r8 + 8*rdx + 8], xmm1
 13962  	movsd	xmm1, qword ptr [rcx + 8*rdx + 16] # xmm1 = mem[0],zero
 13963  	mulsd	xmm1, xmm0
 13964  	movsd	qword ptr [r8 + 8*rdx + 16], xmm1
 13965  	movsd	xmm1, qword ptr [rcx + 8*rdx + 24] # xmm1 = mem[0],zero
 13966  	mulsd	xmm1, xmm0
 13967  	movsd	qword ptr [r8 + 8*rdx + 24], xmm1
 13968  	add	rdx, 4
 13969  	cmp	rax, rdx
 13970  	jne	.LBB2_684                       # loop until the index reaches the count
 13971  	jmp	.LBB2_1069
 13972  .LBB2_69:                               # sil == 0, edi above 6: dispatch on edi
 13973  	cmp	edi, 8
 13974  	jle	.LBB2_137                       # edi is 7 or 8; target not in this chunk
 13975  # %bb.70:
 13976  	cmp	edi, 9
 13977  	je	.LBB2_205                       # target not in this chunk
 13978  # %bb.71:
 13979  	cmp	edi, 11
 13980  	je	.LBB2_208                       # target not in this chunk
 13981  # %bb.72:
 13982  	cmp	edi, 12
 13983  	jne	.LBB2_1069                      # unhandled edi: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 13984  # %bb.73: sil == 0, edi == 12: double-precision elements, dst[i] = src[i] + scalar
 13985  	test	r9d, r9d
 13986  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 13987  # %bb.74:
 13988  	movsd	xmm0, qword ptr [rdx]           # xmm0 = mem[0],zero (the double scalar)
 13989  	mov	eax, r9d                        # rax = count, zero-extended to 64 bits
 13990  	cmp	r9d, 4
 13991  	jb	.LBB2_75                        # fewer than 4 elements: use the scalar loop directly
 13992  # %bb.289: source/destination overlap test
 13993  	lea	rdx, [rcx + 8*rax]              # rdx = one past the last source element
 13994  	cmp	rdx, r8
 13995  	jbe	.LBB2_477                       # no overlap; .LBB2_477 is outside this chunk (presumably the SIMD path)
 13996  # %bb.290:
 13997  	lea	rdx, [r8 + 8*rax]               # rdx = one past the last destination element
 13998  	cmp	rdx, rcx
 13999  	jbe	.LBB2_477                       # no overlap
 14000  .LBB2_75:                               # short or overlapping input: scalar loop from index 0
 14001  	xor	edx, edx                        # rdx = element index
 14002  .LBB2_689:                              # scalar loop setup, starting at index rdx
 14003  	mov	rsi, rdx
 14004  	not	rsi
 14005  	add	rsi, rax                        # rsi = rax - rdx - 1 (pending elements minus one)
 14006  	mov	rdi, rax
 14007  	and	rdi, 3                          # rdi = count mod 4 = iterations of the one-element loop
 14008  	je	.LBB2_691
 14009  .LBB2_690:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 14010  	movsd	xmm1, qword ptr [rcx + 8*rdx]   # xmm1 = mem[0],zero
 14011  	addsd	xmm1, xmm0                      # src[i] + scalar
 14012  	movsd	qword ptr [r8 + 8*rdx], xmm1
 14013  	add	rdx, 1
 14014  	add	rdi, -1
 14015  	jne	.LBB2_690
 14016  .LBB2_691:
 14017  	cmp	rsi, 3
 14018  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_689: done
 14019  .LBB2_692:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 14020  	movsd	xmm1, qword ptr [rcx + 8*rdx]   # xmm1 = mem[0],zero
 14021  	addsd	xmm1, xmm0
 14022  	movsd	qword ptr [r8 + 8*rdx], xmm1
 14023  	movsd	xmm1, qword ptr [rcx + 8*rdx + 8] # xmm1 = mem[0],zero
 14024  	addsd	xmm1, xmm0
 14025  	movsd	qword ptr [r8 + 8*rdx + 8], xmm1
 14026  	movsd	xmm1, qword ptr [rcx + 8*rdx + 16] # xmm1 = mem[0],zero
 14027  	addsd	xmm1, xmm0
 14028  	movsd	qword ptr [r8 + 8*rdx + 16], xmm1
 14029  	movsd	xmm1, qword ptr [rcx + 8*rdx + 24] # xmm1 = mem[0],zero
 14030  	addsd	xmm1, xmm0
 14031  	movsd	qword ptr [r8 + 8*rdx + 24], xmm1
 14032  	add	rdx, 4
 14033  	cmp	rax, rdx
 14034  	jne	.LBB2_692                       # loop until the index reaches the count
 14035  	jmp	.LBB2_1069
 14036  .LBB2_76:                               # sil == 1, edi above 6: dispatch on edi
 14037  	cmp	edi, 8
 14038  	jle	.LBB2_142                       # edi is 7 or 8; target not in this chunk
 14039  # %bb.77:
 14040  	cmp	edi, 9
 14041  	je	.LBB2_211                       # target not in this chunk
 14042  # %bb.78:
 14043  	cmp	edi, 11
 14044  	je	.LBB2_214                       # target not in this chunk
 14045  # %bb.79:
 14046  	cmp	edi, 12
 14047  	jne	.LBB2_1069                      # unhandled edi: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 14048  # %bb.80: sil == 1, edi == 12: double-precision elements, dst[i] = scalar - src[i]
 14049  	test	r9d, r9d
 14050  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 14051  # %bb.81:
 14052  	movsd	xmm0, qword ptr [rdx]           # xmm0 = mem[0],zero (the double scalar, used as minuend)
 14053  	mov	eax, r9d                        # rax = count, zero-extended to 64 bits
 14054  	cmp	r9d, 4
 14055  	jb	.LBB2_82                        # fewer than 4 elements: use the scalar loop directly
 14056  # %bb.292: source/destination overlap test
 14057  	lea	rdx, [rcx + 8*rax]              # rdx = one past the last source element
 14058  	cmp	rdx, r8
 14059  	jbe	.LBB2_480                       # no overlap; .LBB2_480 is outside this chunk (presumably the SIMD path)
 14060  # %bb.293:
 14061  	lea	rdx, [r8 + 8*rax]               # rdx = one past the last destination element
 14062  	cmp	rdx, rcx
 14063  	jbe	.LBB2_480                       # no overlap
 14064  .LBB2_82:                               # short or overlapping input: scalar loop from index 0
 14065  	xor	edx, edx                        # rdx = element index
 14066  .LBB2_697:                              # scalar loop setup, starting at index rdx
 14067  	mov	rsi, rdx
 14068  	not	rsi
 14069  	add	rsi, rax                        # rsi = rax - rdx - 1 (pending elements minus one)
 14070  	mov	rdi, rax
 14071  	and	rdi, 3                          # rdi = count mod 4 = iterations of the one-element loop
 14072  	je	.LBB2_699
 14073  .LBB2_698:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 14074  	movapd	xmm1, xmm0                      # fresh copy of the scalar, since subsd overwrites its destination
 14075  	subsd	xmm1, qword ptr [rcx + 8*rdx]   # scalar - src[i]
 14076  	movsd	qword ptr [r8 + 8*rdx], xmm1
 14077  	add	rdx, 1
 14078  	add	rdi, -1
 14079  	jne	.LBB2_698
 14080  .LBB2_699:
 14081  	cmp	rsi, 3
 14082  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_697: done
 14083  .LBB2_700:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 14084  	movapd	xmm1, xmm0
 14085  	subsd	xmm1, qword ptr [rcx + 8*rdx]
 14086  	movsd	qword ptr [r8 + 8*rdx], xmm1
 14087  	movapd	xmm1, xmm0
 14088  	subsd	xmm1, qword ptr [rcx + 8*rdx + 8]
 14089  	movsd	qword ptr [r8 + 8*rdx + 8], xmm1
 14090  	movapd	xmm1, xmm0
 14091  	subsd	xmm1, qword ptr [rcx + 8*rdx + 16]
 14092  	movsd	qword ptr [r8 + 8*rdx + 16], xmm1
 14093  	movapd	xmm1, xmm0
 14094  	subsd	xmm1, qword ptr [rcx + 8*rdx + 24]
 14095  	movsd	qword ptr [r8 + 8*rdx + 24], xmm1
 14096  	add	rdx, 4
 14097  	cmp	rax, rdx
 14098  	jne	.LBB2_700                       # loop until the index reaches the count
 14099  	jmp	.LBB2_1069
 14100  .LBB2_83:                               # sil == 21, edi above 6: dispatch on edi
 14101  	cmp	edi, 8
 14102  	jle	.LBB2_147                       # edi is 7 or 8; target not in this chunk
 14103  # %bb.84:
 14104  	cmp	edi, 9
 14105  	je	.LBB2_217                       # target not in this chunk
 14106  # %bb.85:
 14107  	cmp	edi, 11
 14108  	je	.LBB2_220                       # target not in this chunk
 14109  # %bb.86:
 14110  	cmp	edi, 12
 14111  	jne	.LBB2_1069                      # unhandled edi: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 14112  # %bb.87: sil == 21, edi == 12: double-precision elements, dst[i] = src[i] + scalar (same instruction sequence as the sil == 0 case)
 14113  	test	r9d, r9d
 14114  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 14115  # %bb.88:
 14116  	movsd	xmm0, qword ptr [rdx]           # xmm0 = mem[0],zero (the double scalar)
 14117  	mov	eax, r9d                        # rax = count, zero-extended to 64 bits
 14118  	cmp	r9d, 4
 14119  	jb	.LBB2_89                        # fewer than 4 elements: use the scalar loop directly
 14120  # %bb.295: source/destination overlap test
 14121  	lea	rdx, [rcx + 8*rax]              # rdx = one past the last source element
 14122  	cmp	rdx, r8
 14123  	jbe	.LBB2_483                       # no overlap; .LBB2_483 is outside this chunk (presumably the SIMD path)
 14124  # %bb.296:
 14125  	lea	rdx, [r8 + 8*rax]               # rdx = one past the last destination element
 14126  	cmp	rdx, rcx
 14127  	jbe	.LBB2_483                       # no overlap
 14128  .LBB2_89:                               # short or overlapping input: scalar loop from index 0
 14129  	xor	edx, edx                        # rdx = element index
 14130  .LBB2_705:                              # scalar loop setup, starting at index rdx
 14131  	mov	rsi, rdx
 14132  	not	rsi
 14133  	add	rsi, rax                        # rsi = rax - rdx - 1 (pending elements minus one)
 14134  	mov	rdi, rax
 14135  	and	rdi, 3                          # rdi = count mod 4 = iterations of the one-element loop
 14136  	je	.LBB2_707
 14137  .LBB2_706:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 14138  	movsd	xmm1, qword ptr [rcx + 8*rdx]   # xmm1 = mem[0],zero
 14139  	addsd	xmm1, xmm0                      # src[i] + scalar
 14140  	movsd	qword ptr [r8 + 8*rdx], xmm1
 14141  	add	rdx, 1
 14142  	add	rdi, -1
 14143  	jne	.LBB2_706
 14144  .LBB2_707:
 14145  	cmp	rsi, 3
 14146  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_705: done
 14147  .LBB2_708:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 14148  	movsd	xmm1, qword ptr [rcx + 8*rdx]   # xmm1 = mem[0],zero
 14149  	addsd	xmm1, xmm0
 14150  	movsd	qword ptr [r8 + 8*rdx], xmm1
 14151  	movsd	xmm1, qword ptr [rcx + 8*rdx + 8] # xmm1 = mem[0],zero
 14152  	addsd	xmm1, xmm0
 14153  	movsd	qword ptr [r8 + 8*rdx + 8], xmm1
 14154  	movsd	xmm1, qword ptr [rcx + 8*rdx + 16] # xmm1 = mem[0],zero
 14155  	addsd	xmm1, xmm0
 14156  	movsd	qword ptr [r8 + 8*rdx + 16], xmm1
 14157  	movsd	xmm1, qword ptr [rcx + 8*rdx + 24] # xmm1 = mem[0],zero
 14158  	addsd	xmm1, xmm0
 14159  	movsd	qword ptr [r8 + 8*rdx + 24], xmm1
 14160  	add	rdx, 4
 14161  	cmp	rax, rdx
 14162  	jne	.LBB2_708                       # loop until the index reaches the count
 14163  	jmp	.LBB2_1069
 14164  .LBB2_90:                               # sil == 22, edi above 6: dispatch on edi
 14165  	cmp	edi, 8
 14166  	jle	.LBB2_152                       # edi is 7 or 8; target not in this chunk
 14167  # %bb.91:
 14168  	cmp	edi, 9
 14169  	je	.LBB2_223                       # target not in this chunk
 14170  # %bb.92:
 14171  	cmp	edi, 11
 14172  	je	.LBB2_226                       # target not in this chunk
 14173  # %bb.93:
 14174  	cmp	edi, 12
 14175  	jne	.LBB2_1069                      # unhandled edi: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 14176  # %bb.94: sil == 22, edi == 12: double-precision elements, dst[i] = scalar - src[i] (same instruction sequence as the sil == 1 case)
 14177  	test	r9d, r9d
 14178  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 14179  # %bb.95:
 14180  	movsd	xmm0, qword ptr [rdx]           # xmm0 = mem[0],zero (the double scalar, used as minuend)
 14181  	mov	eax, r9d                        # rax = count, zero-extended to 64 bits
 14182  	cmp	r9d, 4
 14183  	jb	.LBB2_96                        # fewer than 4 elements: use the scalar loop directly
 14184  # %bb.298: source/destination overlap test
 14185  	lea	rdx, [rcx + 8*rax]              # rdx = one past the last source element
 14186  	cmp	rdx, r8
 14187  	jbe	.LBB2_486                       # no overlap; .LBB2_486 is outside this chunk (presumably the SIMD path)
 14188  # %bb.299:
 14189  	lea	rdx, [r8 + 8*rax]               # rdx = one past the last destination element
 14190  	cmp	rdx, rcx
 14191  	jbe	.LBB2_486                       # no overlap
 14192  .LBB2_96:                               # short or overlapping input: scalar loop from index 0
 14193  	xor	edx, edx                        # rdx = element index
 14194  .LBB2_713:                              # scalar loop setup, starting at index rdx
 14195  	mov	rsi, rdx
 14196  	not	rsi
 14197  	add	rsi, rax                        # rsi = rax - rdx - 1 (pending elements minus one)
 14198  	mov	rdi, rax
 14199  	and	rdi, 3                          # rdi = count mod 4 = iterations of the one-element loop
 14200  	je	.LBB2_715
 14201  .LBB2_714:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 14202  	movapd	xmm1, xmm0                      # fresh copy of the scalar, since subsd overwrites its destination
 14203  	subsd	xmm1, qword ptr [rcx + 8*rdx]   # scalar - src[i]
 14204  	movsd	qword ptr [r8 + 8*rdx], xmm1
 14205  	add	rdx, 1
 14206  	add	rdi, -1
 14207  	jne	.LBB2_714
 14208  .LBB2_715:
 14209  	cmp	rsi, 3
 14210  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_713: done
 14211  .LBB2_716:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 14212  	movapd	xmm1, xmm0
 14213  	subsd	xmm1, qword ptr [rcx + 8*rdx]
 14214  	movsd	qword ptr [r8 + 8*rdx], xmm1
 14215  	movapd	xmm1, xmm0
 14216  	subsd	xmm1, qword ptr [rcx + 8*rdx + 8]
 14217  	movsd	qword ptr [r8 + 8*rdx + 8], xmm1
 14218  	movapd	xmm1, xmm0
 14219  	subsd	xmm1, qword ptr [rcx + 8*rdx + 16]
 14220  	movsd	qword ptr [r8 + 8*rdx + 16], xmm1
 14221  	movapd	xmm1, xmm0
 14222  	subsd	xmm1, qword ptr [rcx + 8*rdx + 24]
 14223  	movsd	qword ptr [r8 + 8*rdx + 24], xmm1
 14224  	add	rdx, 4
 14225  	cmp	rax, rdx
 14226  	jne	.LBB2_716                       # loop until the index reaches the count
 14227  	jmp	.LBB2_1069
 14228  .LBB2_97:                               # sil == 2, edi at most 3: dispatch on edi
 14229  	cmp	edi, 2
 14230  	je	.LBB2_229                       # target not in this chunk
 14231  # %bb.98:
 14232  	cmp	edi, 3
 14233  	jne	.LBB2_1069                      # unhandled edi: go to .LBB2_1069 (outside this chunk; presumably the common exit)
 14234  # %bb.99: sil == 2, edi == 3: 8-bit elements, dst[i] = low byte of src[i] * scalar
 14235  	test	r9d, r9d
 14236  	jle	.LBB2_1069                      # count is zero or negative: nothing to do
 14237  # %bb.100:
 14238  	mov	dl, byte ptr [rdx]              # dl = 8-bit scalar (rdx is no longer needed as a pointer)
 14239  	mov	r10d, r9d                       # r10 = count, zero-extended to 64 bits
 14240  	cmp	r9d, 32
 14241  	jb	.LBB2_101                       # fewer than 32 elements: use the scalar loop directly
 14242  # %bb.301: source/destination overlap test
 14243  	lea	rax, [rcx + r10]                # rax = one past the last source byte
 14244  	cmp	rax, r8
 14245  	jbe	.LBB2_489                       # no overlap; .LBB2_489 is outside this chunk (presumably the SIMD path)
 14246  # %bb.302:
 14247  	lea	rax, [r8 + r10]                 # rax = one past the last destination byte
 14248  	cmp	rax, rcx
 14249  	jbe	.LBB2_489                       # no overlap
 14250  .LBB2_101:                              # short or overlapping input: scalar loop from index 0
 14251  	xor	edi, edi                        # rdi = element index
 14252  .LBB2_721:                              # scalar loop setup, starting at index rdi
 14253  	mov	r9, rdi
 14254  	not	r9
 14255  	add	r9, r10                         # r9 = r10 - rdi - 1 (pending elements minus one)
 14256  	mov	rsi, r10
 14257  	and	rsi, 3                          # rsi = count mod 4 = iterations of the one-element loop
 14258  	je	.LBB2_723
 14259  .LBB2_722:                              # =>This Inner Loop Header: Depth=1 (one element per iteration)
 14260  	movzx	eax, byte ptr [rcx + rdi]
 14261  	mul	dl                              # ax = al * dl (unsigned 8-bit multiply)
 14262  	mov	byte ptr [r8 + rdi], al         # keep only the low 8 bits of the product
 14263  	add	rdi, 1
 14264  	add	rsi, -1
 14265  	jne	.LBB2_722
 14266  .LBB2_723:
 14267  	cmp	r9, 3
 14268  	jb	.LBB2_1069                      # at most 3 elements were pending at .LBB2_721: done
 14269  .LBB2_724:                              # =>This Inner Loop Header: Depth=1 (four elements per iteration)
 14270  	movzx	eax, byte ptr [rcx + rdi]
 14271  	mul	dl
 14272  	mov	byte ptr [r8 + rdi], al
 14273  	movzx	eax, byte ptr [rcx + rdi + 1]
 14274  	mul	dl
 14275  	mov	byte ptr [r8 + rdi + 1], al
 14276  	movzx	eax, byte ptr [rcx + rdi + 2]
 14277  	mul	dl
 14278  	mov	byte ptr [r8 + rdi + 2], al
 14279  	movzx	eax, byte ptr [rcx + rdi + 3]
 14280  	mul	dl
 14281  	mov	byte ptr [r8 + rdi + 3], al
 14282  	add	rdi, 4
 14283  	cmp	r10, rdi
 14284  	jne	.LBB2_724                       # loop until the index reaches the count
 14285  	jmp	.LBB2_1069
 14286  .LBB2_102:	# 1-byte element case: out[i] = low 8 bits of (in[i] * scalar); in = rcx, out = r8, scalar at [rdx]
 14287  	cmp	edi, 2	# edi == 2 is handled at .LBB2_232 (outside this block)
 14288  	je	.LBB2_232
 14289  # %bb.103:
 14290  	cmp	edi, 3	# any edi other than 3 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14291  	jne	.LBB2_1069
 14292  # %bb.104:
 14293  	test	r9d, r9d	# r9d = element count; signed <= 0 means nothing to do
 14294  	jle	.LBB2_1069
 14295  # %bb.105:
 14296  	mov	dl, byte ptr [rdx]	# dl = scalar operand, loaded once
 14297  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 14298  	cmp	r9d, 32	# fewer than 32 elements: use the scalar loops directly
 14299  	jb	.LBB2_106
 14300  # %bb.304:
 14301  	lea	rax, [rcx + r10]	# rax = one past the end of the input buffer
 14302  	cmp	rax, r8
 14303  	jbe	.LBB2_492	# input ends at or before output start: no overlap (.LBB2_492 is presumably the vectorized path -- confirm)
 14304  # %bb.305:
 14305  	lea	rax, [r8 + r10]	# rax = one past the end of the output buffer
 14306  	cmp	rax, rcx
 14307  	jbe	.LBB2_492	# output ends at or before input start: no overlap
 14308  .LBB2_106:	# count < 32 or the buffers overlap: process every element with the scalar loops
 14309  	xor	edi, edi	# rdi = element index, starting at 0
 14310  .LBB2_729:	# scalar loops, starting at index rdi (0 on the fall-through path)
 14311  	mov	r9, rdi
 14312  	not	r9
 14313  	add	r9, r10	# r9 = count - 1 - rdi
 14314  	mov	rsi, r10
 14315  	and	rsi, 3	# rsi = count & 3 = iterations of the one-element loop below
 14316  	je	.LBB2_731
 14317  .LBB2_730:                              # =>This Inner Loop Header: Depth=1
 14318  	movzx	eax, byte ptr [rcx + rdi]
 14319  	mul	dl	# ax = al * dl; only al is stored
 14320  	mov	byte ptr [r8 + rdi], al
 14321  	add	rdi, 1
 14322  	add	rsi, -1
 14323  	jne	.LBB2_730
 14324  .LBB2_731:
 14325  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 14326  	jb	.LBB2_1069
 14327  .LBB2_732:                              # =>This Inner Loop Header: Depth=1
 14328  	movzx	eax, byte ptr [rcx + rdi]
 14329  	mul	dl
 14330  	mov	byte ptr [r8 + rdi], al
 14331  	movzx	eax, byte ptr [rcx + rdi + 1]
 14332  	mul	dl
 14333  	mov	byte ptr [r8 + rdi + 1], al
 14334  	movzx	eax, byte ptr [rcx + rdi + 2]
 14335  	mul	dl
 14336  	mov	byte ptr [r8 + rdi + 2], al
 14337  	movzx	eax, byte ptr [rcx + rdi + 3]
 14338  	mul	dl
 14339  	mov	byte ptr [r8 + rdi + 3], al
 14340  	add	rdi, 4	# four elements per iteration
 14341  	cmp	r10, rdi	# loop until index == count
 14342  	jne	.LBB2_732
 14343  	jmp	.LBB2_1069
 14344  .LBB2_107:	# 1-byte element case: out[i] = in[i] + scalar (8-bit wrap); in = rcx, out = r8, scalar at [rdx]
 14345  	cmp	edi, 2	# edi == 2 is handled at .LBB2_235 (outside this block)
 14346  	je	.LBB2_235
 14347  # %bb.108:
 14348  	cmp	edi, 3	# any edi other than 3 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14349  	jne	.LBB2_1069
 14350  # %bb.109:
 14351  	test	r9d, r9d	# r9d = element count; signed <= 0 means nothing to do
 14352  	jle	.LBB2_1069
 14353  # %bb.110:
 14354  	mov	al, byte ptr [rdx]	# al = scalar operand, loaded once; rdx is reused as scratch below
 14355  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 14356  	cmp	r9d, 32	# fewer than 32 elements: use the scalar loops directly
 14357  	jb	.LBB2_111
 14358  # %bb.307:
 14359  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input buffer
 14360  	cmp	rdx, r8
 14361  	jbe	.LBB2_495	# input ends at or before output start: no overlap (.LBB2_495 is presumably the vectorized path -- confirm)
 14362  # %bb.308:
 14363  	lea	rdx, [r8 + r10]	# rdx = one past the end of the output buffer
 14364  	cmp	rdx, rcx
 14365  	jbe	.LBB2_495	# output ends at or before input start: no overlap
 14366  .LBB2_111:	# count < 32 or the buffers overlap: process every element with the scalar loops
 14367  	xor	esi, esi	# rsi = element index, starting at 0
 14368  .LBB2_737:	# scalar loops, starting at index rsi (0 on the fall-through path)
 14369  	mov	r9, rsi
 14370  	not	r9
 14371  	add	r9, r10	# r9 = count - 1 - rsi
 14372  	mov	rdi, r10
 14373  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 14374  	je	.LBB2_739
 14375  .LBB2_738:                              # =>This Inner Loop Header: Depth=1
 14376  	movzx	edx, byte ptr [rcx + rsi]
 14377  	add	dl, al	# dl = in[i] + scalar
 14378  	mov	byte ptr [r8 + rsi], dl
 14379  	add	rsi, 1
 14380  	add	rdi, -1
 14381  	jne	.LBB2_738
 14382  .LBB2_739:
 14383  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 14384  	jb	.LBB2_1069
 14385  .LBB2_740:                              # =>This Inner Loop Header: Depth=1
 14386  	movzx	edx, byte ptr [rcx + rsi]
 14387  	add	dl, al
 14388  	mov	byte ptr [r8 + rsi], dl
 14389  	movzx	edx, byte ptr [rcx + rsi + 1]
 14390  	add	dl, al
 14391  	mov	byte ptr [r8 + rsi + 1], dl
 14392  	movzx	edx, byte ptr [rcx + rsi + 2]
 14393  	add	dl, al
 14394  	mov	byte ptr [r8 + rsi + 2], dl
 14395  	movzx	edx, byte ptr [rcx + rsi + 3]
 14396  	add	dl, al
 14397  	mov	byte ptr [r8 + rsi + 3], dl
 14398  	add	rsi, 4	# four elements per iteration
 14399  	cmp	r10, rsi	# loop until index == count
 14400  	jne	.LBB2_740
 14401  	jmp	.LBB2_1069
 14402  .LBB2_112:	# 1-byte element case: out[i] = scalar - in[i] (8-bit wrap); in = rcx, out = r8, scalar at [rdx]
 14403  	cmp	edi, 2	# edi == 2 is handled at .LBB2_238 (outside this block)
 14404  	je	.LBB2_238
 14405  # %bb.113:
 14406  	cmp	edi, 3	# any edi other than 3 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14407  	jne	.LBB2_1069
 14408  # %bb.114:
 14409  	test	r9d, r9d	# r9d = element count; signed <= 0 means nothing to do
 14410  	jle	.LBB2_1069
 14411  # %bb.115:
 14412  	mov	r11b, byte ptr [rdx]	# r11b = scalar operand, loaded once; rdx is reused as scratch below
 14413  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 14414  	cmp	r9d, 32	# fewer than 32 elements: use the scalar loops directly
 14415  	jb	.LBB2_116
 14416  # %bb.310:
 14417  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input buffer
 14418  	cmp	rdx, r8
 14419  	jbe	.LBB2_498	# input ends at or before output start: no overlap (.LBB2_498 is presumably the vectorized path -- confirm)
 14420  # %bb.311:
 14421  	lea	rdx, [r8 + r10]	# rdx = one past the end of the output buffer
 14422  	cmp	rdx, rcx
 14423  	jbe	.LBB2_498	# output ends at or before input start: no overlap
 14424  .LBB2_116:	# count < 32 or the buffers overlap: process every element with the scalar loops
 14425  	xor	esi, esi	# rsi = element index, starting at 0
 14426  .LBB2_745:	# scalar loops, starting at index rsi (0 on the fall-through path)
 14427  	mov	rdx, rsi
 14428  	not	rdx
 14429  	add	rdx, r10	# rdx = count - 1 - rsi
 14430  	mov	rdi, r10
 14431  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 14432  	je	.LBB2_747
 14433  .LBB2_746:                              # =>This Inner Loop Header: Depth=1
 14434  	mov	eax, r11d	# start from the scalar each time: the scalar is the minuend
 14435  	sub	al, byte ptr [rcx + rsi]	# al = scalar - in[i]
 14436  	mov	byte ptr [r8 + rsi], al
 14437  	add	rsi, 1
 14438  	add	rdi, -1
 14439  	jne	.LBB2_746
 14440  .LBB2_747:
 14441  	cmp	rdx, 3	# rdx < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 14442  	jb	.LBB2_1069
 14443  .LBB2_748:                              # =>This Inner Loop Header: Depth=1
 14444  	mov	eax, r11d
 14445  	sub	al, byte ptr [rcx + rsi]
 14446  	mov	byte ptr [r8 + rsi], al
 14447  	mov	eax, r11d
 14448  	sub	al, byte ptr [rcx + rsi + 1]
 14449  	mov	byte ptr [r8 + rsi + 1], al
 14450  	mov	eax, r11d
 14451  	sub	al, byte ptr [rcx + rsi + 2]
 14452  	mov	byte ptr [r8 + rsi + 2], al
 14453  	mov	eax, r11d
 14454  	sub	al, byte ptr [rcx + rsi + 3]
 14455  	mov	byte ptr [r8 + rsi + 3], al
 14456  	add	rsi, 4	# four elements per iteration
 14457  	cmp	r10, rsi	# loop until index == count
 14458  	jne	.LBB2_748
 14459  	jmp	.LBB2_1069
 14460  .LBB2_117:	# 1-byte element case: out[i] = in[i] + scalar (8-bit wrap); in = rcx, out = r8, scalar at [rdx]
 14461  	cmp	edi, 2	# edi == 2 is handled at .LBB2_241 (outside this block)
 14462  	je	.LBB2_241
 14463  # %bb.118:
 14464  	cmp	edi, 3	# any edi other than 3 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14465  	jne	.LBB2_1069
 14466  # %bb.119:
 14467  	test	r9d, r9d	# r9d = element count; signed <= 0 means nothing to do
 14468  	jle	.LBB2_1069
 14469  # %bb.120:
 14470  	mov	al, byte ptr [rdx]	# al = scalar operand, loaded once; rdx is reused as scratch below
 14471  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 14472  	cmp	r9d, 32	# fewer than 32 elements: use the scalar loops directly
 14473  	jb	.LBB2_121
 14474  # %bb.313:
 14475  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input buffer
 14476  	cmp	rdx, r8
 14477  	jbe	.LBB2_501	# input ends at or before output start: no overlap (.LBB2_501 is presumably the vectorized path -- confirm)
 14478  # %bb.314:
 14479  	lea	rdx, [r8 + r10]	# rdx = one past the end of the output buffer
 14480  	cmp	rdx, rcx
 14481  	jbe	.LBB2_501	# output ends at or before input start: no overlap
 14482  .LBB2_121:	# count < 32 or the buffers overlap: process every element with the scalar loops
 14483  	xor	esi, esi	# rsi = element index, starting at 0
 14484  .LBB2_753:	# scalar loops, starting at index rsi (0 on the fall-through path)
 14485  	mov	r9, rsi
 14486  	not	r9
 14487  	add	r9, r10	# r9 = count - 1 - rsi
 14488  	mov	rdi, r10
 14489  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 14490  	je	.LBB2_755
 14491  .LBB2_754:                              # =>This Inner Loop Header: Depth=1
 14492  	movzx	edx, byte ptr [rcx + rsi]
 14493  	add	dl, al	# dl = in[i] + scalar
 14494  	mov	byte ptr [r8 + rsi], dl
 14495  	add	rsi, 1
 14496  	add	rdi, -1
 14497  	jne	.LBB2_754
 14498  .LBB2_755:
 14499  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 14500  	jb	.LBB2_1069
 14501  .LBB2_756:                              # =>This Inner Loop Header: Depth=1
 14502  	movzx	edx, byte ptr [rcx + rsi]
 14503  	add	dl, al
 14504  	mov	byte ptr [r8 + rsi], dl
 14505  	movzx	edx, byte ptr [rcx + rsi + 1]
 14506  	add	dl, al
 14507  	mov	byte ptr [r8 + rsi + 1], dl
 14508  	movzx	edx, byte ptr [rcx + rsi + 2]
 14509  	add	dl, al
 14510  	mov	byte ptr [r8 + rsi + 2], dl
 14511  	movzx	edx, byte ptr [rcx + rsi + 3]
 14512  	add	dl, al
 14513  	mov	byte ptr [r8 + rsi + 3], dl
 14514  	add	rsi, 4	# four elements per iteration
 14515  	cmp	r10, rsi	# loop until index == count
 14516  	jne	.LBB2_756
 14517  	jmp	.LBB2_1069
 14518  .LBB2_122:	# 1-byte element case: out[i] = scalar - in[i] (8-bit wrap); in = rcx, out = r8, scalar at [rdx]
 14519  	cmp	edi, 2	# edi == 2 is handled at .LBB2_244 (outside this block)
 14520  	je	.LBB2_244
 14521  # %bb.123:
 14522  	cmp	edi, 3	# any edi other than 3 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14523  	jne	.LBB2_1069
 14524  # %bb.124:
 14525  	test	r9d, r9d	# r9d = element count; signed <= 0 means nothing to do
 14526  	jle	.LBB2_1069
 14527  # %bb.125:
 14528  	mov	r11b, byte ptr [rdx]	# r11b = scalar operand, loaded once; rdx is reused as scratch below
 14529  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 14530  	cmp	r9d, 32	# fewer than 32 elements: use the scalar loops directly
 14531  	jb	.LBB2_126
 14532  # %bb.316:
 14533  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input buffer
 14534  	cmp	rdx, r8
 14535  	jbe	.LBB2_504	# input ends at or before output start: no overlap (.LBB2_504 is presumably the vectorized path -- confirm)
 14536  # %bb.317:
 14537  	lea	rdx, [r8 + r10]	# rdx = one past the end of the output buffer
 14538  	cmp	rdx, rcx
 14539  	jbe	.LBB2_504	# output ends at or before input start: no overlap
 14540  .LBB2_126:	# count < 32 or the buffers overlap: process every element with the scalar loops
 14541  	xor	esi, esi	# rsi = element index, starting at 0
 14542  .LBB2_761:	# scalar loops, starting at index rsi (0 on the fall-through path)
 14543  	mov	rdx, rsi
 14544  	not	rdx
 14545  	add	rdx, r10	# rdx = count - 1 - rsi
 14546  	mov	rdi, r10
 14547  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 14548  	je	.LBB2_763
 14549  .LBB2_762:                              # =>This Inner Loop Header: Depth=1
 14550  	mov	eax, r11d	# start from the scalar each time: the scalar is the minuend
 14551  	sub	al, byte ptr [rcx + rsi]	# al = scalar - in[i]
 14552  	mov	byte ptr [r8 + rsi], al
 14553  	add	rsi, 1
 14554  	add	rdi, -1
 14555  	jne	.LBB2_762
 14556  .LBB2_763:
 14557  	cmp	rdx, 3	# rdx < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 14558  	jb	.LBB2_1069
 14559  .LBB2_764:                              # =>This Inner Loop Header: Depth=1
 14560  	mov	eax, r11d
 14561  	sub	al, byte ptr [rcx + rsi]
 14562  	mov	byte ptr [r8 + rsi], al
 14563  	mov	eax, r11d
 14564  	sub	al, byte ptr [rcx + rsi + 1]
 14565  	mov	byte ptr [r8 + rsi + 1], al
 14566  	mov	eax, r11d
 14567  	sub	al, byte ptr [rcx + rsi + 2]
 14568  	mov	byte ptr [r8 + rsi + 2], al
 14569  	mov	eax, r11d
 14570  	sub	al, byte ptr [rcx + rsi + 3]
 14571  	mov	byte ptr [r8 + rsi + 3], al
 14572  	add	rsi, 4	# four elements per iteration
 14573  	cmp	r10, rsi	# loop until index == count
 14574  	jne	.LBB2_764
 14575  	jmp	.LBB2_1069
 14576  .LBB2_127:	# 8-byte scalar case prologue; the loops are at .LBB2_319/.LBB2_321, so the operation is not visible here
 14577  	cmp	edi, 7	# edi == 7 is handled at .LBB2_247 (outside this block)
 14578  	je	.LBB2_247
 14579  # %bb.128:
 14580  	cmp	edi, 8	# any edi other than 8 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14581  	jne	.LBB2_1069
 14582  # %bb.129:
 14583  	test	r9d, r9d	# r9d = element count; signed <= 0 means nothing to do
 14584  	jle	.LBB2_1069
 14585  # %bb.130:
 14586  	mov	rax, qword ptr [rdx]	# rax = 8-byte scalar operand, loaded once
 14587  	mov	esi, r9d	# rsi = count, zero-extended to 64 bits
 14588  	lea	rdi, [rsi - 1]	# rdi = count - 1
 14589  	mov	r9d, esi
 14590  	and	r9d, 3	# r9 = count & 3
 14591  	cmp	rdi, 3	# count >= 4: continue at .LBB2_319 (presumably a 4x-unrolled loop -- confirm)
 14592  	jae	.LBB2_319
 14593  # %bb.131:
 14594  	xor	edi, edi	# count < 4: rdi = 0, continue at .LBB2_321
 14595  	jmp	.LBB2_321
 14596  .LBB2_132:	# 8-byte scalar case prologue; the loops are at .LBB2_324/.LBB2_326, so the operation is not visible here
 14597  	cmp	edi, 7	# edi == 7 is handled at .LBB2_250 (outside this block)
 14598  	je	.LBB2_250
 14599  # %bb.133:
 14600  	cmp	edi, 8	# any edi other than 8 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14601  	jne	.LBB2_1069
 14602  # %bb.134:
 14603  	test	r9d, r9d	# r9d = element count; signed <= 0 means nothing to do
 14604  	jle	.LBB2_1069
 14605  # %bb.135:
 14606  	mov	rax, qword ptr [rdx]	# rax = 8-byte scalar operand, loaded once
 14607  	mov	esi, r9d	# rsi = count, zero-extended to 64 bits
 14608  	lea	rdi, [rsi - 1]	# rdi = count - 1
 14609  	mov	r9d, esi
 14610  	and	r9d, 3	# r9 = count & 3
 14611  	cmp	rdi, 3	# count >= 4: continue at .LBB2_324 (presumably a 4x-unrolled loop -- confirm)
 14612  	jae	.LBB2_324
 14613  # %bb.136:
 14614  	xor	edi, edi	# count < 4: rdi = 0, continue at .LBB2_326
 14615  	jmp	.LBB2_326
 14616  .LBB2_137:	# 8-byte element case: out[i] = in[i] + scalar (64-bit wrap); in = rcx, out = r8, scalar at [rdx]
 14617  	cmp	edi, 7	# edi == 7 is handled at .LBB2_253 (outside this block)
 14618  	je	.LBB2_253
 14619  # %bb.138:
 14620  	cmp	edi, 8	# any edi other than 8 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14621  	jne	.LBB2_1069
 14622  # %bb.139:
 14623  	test	r9d, r9d	# r9d = element count; signed <= 0 means nothing to do
 14624  	jle	.LBB2_1069
 14625  # %bb.140:
 14626  	mov	rax, qword ptr [rdx]	# rax = scalar operand, loaded once; rdx is reused as scratch below
 14627  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 14628  	cmp	r9d, 4	# fewer than 4 elements: use the scalar loops directly
 14629  	jb	.LBB2_141
 14630  # %bb.329:
 14631  	lea	rdx, [rcx + 8*r10]	# rdx = one past the end of the input buffer
 14632  	cmp	rdx, r8
 14633  	jbe	.LBB2_507	# input ends at or before output start: no overlap (.LBB2_507 is presumably the vectorized path -- confirm)
 14634  # %bb.330:
 14635  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output buffer
 14636  	cmp	rdx, rcx
 14637  	jbe	.LBB2_507	# output ends at or before input start: no overlap
 14638  .LBB2_141:	# count < 4 or the buffers overlap: process every element with the scalar loops
 14639  	xor	esi, esi	# rsi = element index, starting at 0
 14640  .LBB2_769:	# scalar loops, starting at index rsi (0 on the fall-through path)
 14641  	mov	r9, rsi
 14642  	not	r9
 14643  	add	r9, r10	# r9 = count - 1 - rsi
 14644  	mov	rdi, r10
 14645  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 14646  	je	.LBB2_771
 14647  .LBB2_770:                              # =>This Inner Loop Header: Depth=1
 14648  	mov	rdx, qword ptr [rcx + 8*rsi]
 14649  	add	rdx, rax	# rdx = in[i] + scalar
 14650  	mov	qword ptr [r8 + 8*rsi], rdx
 14651  	add	rsi, 1
 14652  	add	rdi, -1
 14653  	jne	.LBB2_770
 14654  .LBB2_771:
 14655  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 14656  	jb	.LBB2_1069
 14657  .LBB2_772:                              # =>This Inner Loop Header: Depth=1
 14658  	mov	rdx, qword ptr [rcx + 8*rsi]
 14659  	add	rdx, rax
 14660  	mov	qword ptr [r8 + 8*rsi], rdx
 14661  	mov	rdx, qword ptr [rcx + 8*rsi + 8]
 14662  	add	rdx, rax
 14663  	mov	qword ptr [r8 + 8*rsi + 8], rdx
 14664  	mov	rdx, qword ptr [rcx + 8*rsi + 16]
 14665  	add	rdx, rax
 14666  	mov	qword ptr [r8 + 8*rsi + 16], rdx
 14667  	mov	rdx, qword ptr [rcx + 8*rsi + 24]
 14668  	add	rdx, rax
 14669  	mov	qword ptr [r8 + 8*rsi + 24], rdx
 14670  	add	rsi, 4	# four elements per iteration
 14671  	cmp	r10, rsi	# loop until index == count
 14672  	jne	.LBB2_772
 14673  	jmp	.LBB2_1069
 14674  .LBB2_142:	# 8-byte element case: out[i] = scalar - in[i] (64-bit wrap); in = rcx, out = r8, scalar at [rdx]
 14675  	cmp	edi, 7	# edi == 7 is handled at .LBB2_256 (outside this block)
 14676  	je	.LBB2_256
 14677  # %bb.143:
 14678  	cmp	edi, 8	# any edi other than 8 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14679  	jne	.LBB2_1069
 14680  # %bb.144:
 14681  	test	r9d, r9d	# r9d = element count; signed <= 0 means nothing to do
 14682  	jle	.LBB2_1069
 14683  # %bb.145:
 14684  	mov	r11, qword ptr [rdx]	# r11 = scalar operand, loaded once; rdx is reused as scratch below
 14685  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 14686  	cmp	r9d, 4	# fewer than 4 elements: use the scalar loops directly
 14687  	jb	.LBB2_146
 14688  # %bb.332:
 14689  	lea	rdx, [rcx + 8*r10]	# rdx = one past the end of the input buffer
 14690  	cmp	rdx, r8
 14691  	jbe	.LBB2_510	# input ends at or before output start: no overlap (.LBB2_510 is presumably the vectorized path -- confirm)
 14692  # %bb.333:
 14693  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output buffer
 14694  	cmp	rdx, rcx
 14695  	jbe	.LBB2_510	# output ends at or before input start: no overlap
 14696  .LBB2_146:	# count < 4 or the buffers overlap: process every element with the scalar loops
 14697  	xor	esi, esi	# rsi = element index, starting at 0
 14698  .LBB2_777:	# scalar loops, starting at index rsi (0 on the fall-through path)
 14699  	mov	rdx, rsi
 14700  	not	rdx
 14701  	add	rdx, r10	# rdx = count - 1 - rsi
 14702  	mov	rdi, r10
 14703  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 14704  	je	.LBB2_779
 14705  .LBB2_778:                              # =>This Inner Loop Header: Depth=1
 14706  	mov	rax, r11	# start from the scalar each time: the scalar is the minuend
 14707  	sub	rax, qword ptr [rcx + 8*rsi]	# rax = scalar - in[i]
 14708  	mov	qword ptr [r8 + 8*rsi], rax
 14709  	add	rsi, 1
 14710  	add	rdi, -1
 14711  	jne	.LBB2_778
 14712  .LBB2_779:
 14713  	cmp	rdx, 3	# rdx < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 14714  	jb	.LBB2_1069
 14715  .LBB2_780:                              # =>This Inner Loop Header: Depth=1
 14716  	mov	rax, r11
 14717  	sub	rax, qword ptr [rcx + 8*rsi]
 14718  	mov	qword ptr [r8 + 8*rsi], rax
 14719  	mov	rax, r11
 14720  	sub	rax, qword ptr [rcx + 8*rsi + 8]
 14721  	mov	qword ptr [r8 + 8*rsi + 8], rax
 14722  	mov	rax, r11
 14723  	sub	rax, qword ptr [rcx + 8*rsi + 16]
 14724  	mov	qword ptr [r8 + 8*rsi + 16], rax
 14725  	mov	rax, r11
 14726  	sub	rax, qword ptr [rcx + 8*rsi + 24]
 14727  	mov	qword ptr [r8 + 8*rsi + 24], rax
 14728  	add	rsi, 4	# four elements per iteration
 14729  	cmp	r10, rsi	# loop until index == count
 14730  	jne	.LBB2_780
 14731  	jmp	.LBB2_1069
 14732  .LBB2_147:	# 8-byte element case: out[i] = in[i] + scalar (64-bit wrap); in = rcx, out = r8, scalar at [rdx]
 14733  	cmp	edi, 7	# edi == 7 is handled at .LBB2_259 (outside this block)
 14734  	je	.LBB2_259
 14735  # %bb.148:
 14736  	cmp	edi, 8	# any edi other than 8 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14737  	jne	.LBB2_1069
 14738  # %bb.149:
 14739  	test	r9d, r9d	# r9d = element count; signed <= 0 means nothing to do
 14740  	jle	.LBB2_1069
 14741  # %bb.150:
 14742  	mov	rax, qword ptr [rdx]	# rax = scalar operand, loaded once; rdx is reused as scratch below
 14743  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 14744  	cmp	r9d, 4	# fewer than 4 elements: use the scalar loops directly
 14745  	jb	.LBB2_151
 14746  # %bb.335:
 14747  	lea	rdx, [rcx + 8*r10]	# rdx = one past the end of the input buffer
 14748  	cmp	rdx, r8
 14749  	jbe	.LBB2_513	# input ends at or before output start: no overlap (.LBB2_513 is presumably the vectorized path -- confirm)
 14750  # %bb.336:
 14751  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output buffer
 14752  	cmp	rdx, rcx
 14753  	jbe	.LBB2_513	# output ends at or before input start: no overlap
 14754  .LBB2_151:	# count < 4 or the buffers overlap: process every element with the scalar loops
 14755  	xor	esi, esi	# rsi = element index, starting at 0
 14756  .LBB2_785:	# scalar loops, starting at index rsi (0 on the fall-through path)
 14757  	mov	r9, rsi
 14758  	not	r9
 14759  	add	r9, r10	# r9 = count - 1 - rsi
 14760  	mov	rdi, r10
 14761  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 14762  	je	.LBB2_787
 14763  .LBB2_786:                              # =>This Inner Loop Header: Depth=1
 14764  	mov	rdx, qword ptr [rcx + 8*rsi]
 14765  	add	rdx, rax	# rdx = in[i] + scalar
 14766  	mov	qword ptr [r8 + 8*rsi], rdx
 14767  	add	rsi, 1
 14768  	add	rdi, -1
 14769  	jne	.LBB2_786
 14770  .LBB2_787:
 14771  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 14772  	jb	.LBB2_1069
 14773  .LBB2_788:                              # =>This Inner Loop Header: Depth=1
 14774  	mov	rdx, qword ptr [rcx + 8*rsi]
 14775  	add	rdx, rax
 14776  	mov	qword ptr [r8 + 8*rsi], rdx
 14777  	mov	rdx, qword ptr [rcx + 8*rsi + 8]
 14778  	add	rdx, rax
 14779  	mov	qword ptr [r8 + 8*rsi + 8], rdx
 14780  	mov	rdx, qword ptr [rcx + 8*rsi + 16]
 14781  	add	rdx, rax
 14782  	mov	qword ptr [r8 + 8*rsi + 16], rdx
 14783  	mov	rdx, qword ptr [rcx + 8*rsi + 24]
 14784  	add	rdx, rax
 14785  	mov	qword ptr [r8 + 8*rsi + 24], rdx
 14786  	add	rsi, 4	# four elements per iteration
 14787  	cmp	r10, rsi	# loop until index == count
 14788  	jne	.LBB2_788
 14789  	jmp	.LBB2_1069
 14790  .LBB2_152:	# 8-byte element case: out[i] = scalar - in[i] (64-bit wrap); in = rcx, out = r8, scalar at [rdx]
 14791  	cmp	edi, 7	# edi == 7 is handled at .LBB2_262 (outside this block)
 14792  	je	.LBB2_262
 14793  # %bb.153:
 14794  	cmp	edi, 8	# any edi other than 8 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14795  	jne	.LBB2_1069
 14796  # %bb.154:
 14797  	test	r9d, r9d	# r9d = element count; signed <= 0 means nothing to do
 14798  	jle	.LBB2_1069
 14799  # %bb.155:
 14800  	mov	r11, qword ptr [rdx]	# r11 = scalar operand, loaded once; rdx is reused as scratch below
 14801  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 14802  	cmp	r9d, 4	# fewer than 4 elements: use the scalar loops directly
 14803  	jb	.LBB2_156
 14804  # %bb.338:
 14805  	lea	rdx, [rcx + 8*r10]	# rdx = one past the end of the input buffer
 14806  	cmp	rdx, r8
 14807  	jbe	.LBB2_516	# input ends at or before output start: no overlap (.LBB2_516 is presumably the vectorized path -- confirm)
 14808  # %bb.339:
 14809  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output buffer
 14810  	cmp	rdx, rcx
 14811  	jbe	.LBB2_516	# output ends at or before input start: no overlap
 14812  .LBB2_156:	# count < 4 or the buffers overlap: process every element with the scalar loops
 14813  	xor	esi, esi	# rsi = element index, starting at 0
 14814  .LBB2_793:	# scalar loops, starting at index rsi (0 on the fall-through path)
 14815  	mov	rdx, rsi
 14816  	not	rdx
 14817  	add	rdx, r10	# rdx = count - 1 - rsi
 14818  	mov	rdi, r10
 14819  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 14820  	je	.LBB2_795
 14821  .LBB2_794:                              # =>This Inner Loop Header: Depth=1
 14822  	mov	rax, r11	# start from the scalar each time: the scalar is the minuend
 14823  	sub	rax, qword ptr [rcx + 8*rsi]	# rax = scalar - in[i]
 14824  	mov	qword ptr [r8 + 8*rsi], rax
 14825  	add	rsi, 1
 14826  	add	rdi, -1
 14827  	jne	.LBB2_794
 14828  .LBB2_795:
 14829  	cmp	rdx, 3	# rdx < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 14830  	jb	.LBB2_1069
 14831  .LBB2_796:                              # =>This Inner Loop Header: Depth=1
 14832  	mov	rax, r11
 14833  	sub	rax, qword ptr [rcx + 8*rsi]
 14834  	mov	qword ptr [r8 + 8*rsi], rax
 14835  	mov	rax, r11
 14836  	sub	rax, qword ptr [rcx + 8*rsi + 8]
 14837  	mov	qword ptr [r8 + 8*rsi + 8], rax
 14838  	mov	rax, r11
 14839  	sub	rax, qword ptr [rcx + 8*rsi + 16]
 14840  	mov	qword ptr [r8 + 8*rsi + 16], rax
 14841  	mov	rax, r11
 14842  	sub	rax, qword ptr [rcx + 8*rsi + 24]
 14843  	mov	qword ptr [r8 + 8*rsi + 24], rax
 14844  	add	rsi, 4	# four elements per iteration
 14845  	cmp	r10, rsi	# loop until index == count
 14846  	jne	.LBB2_796
 14847  	jmp	.LBB2_1069
 14848  .LBB2_157:	# 2-byte element case: out[i] = low 16 bits of (in[i] * scalar); in = rcx, out = r8, scalar at [rdx]
 14849  	test	r9d, r9d	# r9d = element count; signed <= 0 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14850  	jle	.LBB2_1069
 14851  # %bb.158:
 14852  	movzx	eax, word ptr [rdx]	# eax = scalar operand, loaded once; rdx is reused as scratch below
 14853  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 14854  	cmp	r9d, 16	# fewer than 16 elements: use the scalar loops directly
 14855  	jb	.LBB2_159
 14856  # %bb.341:
 14857  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input buffer
 14858  	cmp	rdx, r8
 14859  	jbe	.LBB2_519	# input ends at or before output start: no overlap (.LBB2_519 is presumably the vectorized path -- confirm)
 14860  # %bb.342:
 14861  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output buffer
 14862  	cmp	rdx, rcx
 14863  	jbe	.LBB2_519	# output ends at or before input start: no overlap
 14864  .LBB2_159:	# count < 16 or the buffers overlap: process every element with the scalar loops
 14865  	xor	esi, esi	# rsi = element index, starting at 0
 14866  .LBB2_801:	# scalar loops, starting at index rsi (0 on the fall-through path)
 14867  	mov	r9, rsi
 14868  	not	r9
 14869  	add	r9, r10	# r9 = count - 1 - rsi
 14870  	mov	rdi, r10
 14871  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 14872  	je	.LBB2_803
 14873  .LBB2_802:                              # =>This Inner Loop Header: Depth=1
 14874  	movzx	edx, word ptr [rcx + 2*rsi]
 14875  	imul	dx, ax	# dx = low 16 bits of in[i] * scalar
 14876  	mov	word ptr [r8 + 2*rsi], dx
 14877  	add	rsi, 1
 14878  	add	rdi, -1
 14879  	jne	.LBB2_802
 14880  .LBB2_803:
 14881  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 14882  	jb	.LBB2_1069
 14883  .LBB2_804:                              # =>This Inner Loop Header: Depth=1
 14884  	movzx	edx, word ptr [rcx + 2*rsi]
 14885  	imul	dx, ax
 14886  	mov	word ptr [r8 + 2*rsi], dx
 14887  	movzx	edx, word ptr [rcx + 2*rsi + 2]
 14888  	imul	dx, ax
 14889  	mov	word ptr [r8 + 2*rsi + 2], dx
 14890  	movzx	edx, word ptr [rcx + 2*rsi + 4]
 14891  	imul	dx, ax
 14892  	mov	word ptr [r8 + 2*rsi + 4], dx
 14893  	movzx	edx, word ptr [rcx + 2*rsi + 6]
 14894  	imul	dx, ax
 14895  	mov	word ptr [r8 + 2*rsi + 6], dx
 14896  	add	rsi, 4	# four elements per iteration
 14897  	cmp	r10, rsi	# loop until index == count
 14898  	jne	.LBB2_804
 14899  	jmp	.LBB2_1069
 14900  .LBB2_160:	# 2-byte element case: out[i] = low 16 bits of (in[i] * scalar); in = rcx, out = r8, scalar at [rdx]
 14901  	test	r9d, r9d	# r9d = element count; signed <= 0 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14902  	jle	.LBB2_1069
 14903  # %bb.161:
 14904  	movzx	eax, word ptr [rdx]	# eax = scalar operand, loaded once; rdx is reused as scratch below
 14905  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 14906  	cmp	r9d, 16	# fewer than 16 elements: use the scalar loops directly
 14907  	jb	.LBB2_162
 14908  # %bb.344:
 14909  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input buffer
 14910  	cmp	rdx, r8
 14911  	jbe	.LBB2_522	# input ends at or before output start: no overlap (.LBB2_522 is presumably the vectorized path -- confirm)
 14912  # %bb.345:
 14913  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output buffer
 14914  	cmp	rdx, rcx
 14915  	jbe	.LBB2_522	# output ends at or before input start: no overlap
 14916  .LBB2_162:	# count < 16 or the buffers overlap: process every element with the scalar loops
 14917  	xor	esi, esi	# rsi = element index, starting at 0
 14918  .LBB2_809:	# scalar loops, starting at index rsi (0 on the fall-through path)
 14919  	mov	r9, rsi
 14920  	not	r9
 14921  	add	r9, r10	# r9 = count - 1 - rsi
 14922  	mov	rdi, r10
 14923  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 14924  	je	.LBB2_811
 14925  .LBB2_810:                              # =>This Inner Loop Header: Depth=1
 14926  	movzx	edx, word ptr [rcx + 2*rsi]
 14927  	imul	dx, ax	# dx = low 16 bits of in[i] * scalar
 14928  	mov	word ptr [r8 + 2*rsi], dx
 14929  	add	rsi, 1
 14930  	add	rdi, -1
 14931  	jne	.LBB2_810
 14932  .LBB2_811:
 14933  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 14934  	jb	.LBB2_1069
 14935  .LBB2_812:                              # =>This Inner Loop Header: Depth=1
 14936  	movzx	edx, word ptr [rcx + 2*rsi]
 14937  	imul	dx, ax
 14938  	mov	word ptr [r8 + 2*rsi], dx
 14939  	movzx	edx, word ptr [rcx + 2*rsi + 2]
 14940  	imul	dx, ax
 14941  	mov	word ptr [r8 + 2*rsi + 2], dx
 14942  	movzx	edx, word ptr [rcx + 2*rsi + 4]
 14943  	imul	dx, ax
 14944  	mov	word ptr [r8 + 2*rsi + 4], dx
 14945  	movzx	edx, word ptr [rcx + 2*rsi + 6]
 14946  	imul	dx, ax
 14947  	mov	word ptr [r8 + 2*rsi + 6], dx
 14948  	add	rsi, 4	# four elements per iteration
 14949  	cmp	r10, rsi	# loop until index == count
 14950  	jne	.LBB2_812
 14951  	jmp	.LBB2_1069
 14952  .LBB2_163:	# 2-byte element case: out[i] = low 16 bits of (in[i] * scalar); in = rcx, out = r8, scalar at [rdx]
 14953  	test	r9d, r9d	# r9d = element count; signed <= 0 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 14954  	jle	.LBB2_1069
 14955  # %bb.164:
 14956  	movzx	eax, word ptr [rdx]	# eax = scalar operand, loaded once; rdx is reused as scratch below
 14957  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 14958  	cmp	r9d, 16	# fewer than 16 elements: use the scalar loops directly
 14959  	jb	.LBB2_165
 14960  # %bb.347:
 14961  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input buffer
 14962  	cmp	rdx, r8
 14963  	jbe	.LBB2_525	# input ends at or before output start: no overlap (.LBB2_525 is presumably the vectorized path -- confirm)
 14964  # %bb.348:
 14965  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output buffer
 14966  	cmp	rdx, rcx
 14967  	jbe	.LBB2_525	# output ends at or before input start: no overlap
 14968  .LBB2_165:	# count < 16 or the buffers overlap: process every element with the scalar loops
 14969  	xor	esi, esi	# rsi = element index, starting at 0
 14970  .LBB2_817:	# scalar loops, starting at index rsi (0 on the fall-through path)
 14971  	mov	r9, rsi
 14972  	not	r9
 14973  	add	r9, r10	# r9 = count - 1 - rsi
 14974  	mov	rdi, r10
 14975  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 14976  	je	.LBB2_819
 14977  .LBB2_818:                              # =>This Inner Loop Header: Depth=1
 14978  	movzx	edx, word ptr [rcx + 2*rsi]
 14979  	imul	dx, ax	# dx = low 16 bits of in[i] * scalar
 14980  	mov	word ptr [r8 + 2*rsi], dx
 14981  	add	rsi, 1
 14982  	add	rdi, -1
 14983  	jne	.LBB2_818
 14984  .LBB2_819:
 14985  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 14986  	jb	.LBB2_1069
 14987  .LBB2_820:                              # =>This Inner Loop Header: Depth=1
 14988  	movzx	edx, word ptr [rcx + 2*rsi]
 14989  	imul	dx, ax
 14990  	mov	word ptr [r8 + 2*rsi], dx
 14991  	movzx	edx, word ptr [rcx + 2*rsi + 2]
 14992  	imul	dx, ax
 14993  	mov	word ptr [r8 + 2*rsi + 2], dx
 14994  	movzx	edx, word ptr [rcx + 2*rsi + 4]
 14995  	imul	dx, ax
 14996  	mov	word ptr [r8 + 2*rsi + 4], dx
 14997  	movzx	edx, word ptr [rcx + 2*rsi + 6]
 14998  	imul	dx, ax
 14999  	mov	word ptr [r8 + 2*rsi + 6], dx
 15000  	add	rsi, 4	# four elements per iteration
 15001  	cmp	r10, rsi	# loop until index == count
 15002  	jne	.LBB2_820
 15003  	jmp	.LBB2_1069
 15004  .LBB2_166:	# 2-byte element case: out[i] = low 16 bits of (in[i] * scalar); in = rcx, out = r8, scalar at [rdx]
 15005  	test	r9d, r9d	# r9d = element count; signed <= 0 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 15006  	jle	.LBB2_1069
 15007  # %bb.167:
 15008  	movzx	eax, word ptr [rdx]	# eax = scalar operand, loaded once; rdx is reused as scratch below
 15009  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 15010  	cmp	r9d, 16	# fewer than 16 elements: use the scalar loops directly
 15011  	jb	.LBB2_168
 15012  # %bb.350:
 15013  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input buffer
 15014  	cmp	rdx, r8
 15015  	jbe	.LBB2_528	# input ends at or before output start: no overlap (.LBB2_528 is presumably the vectorized path -- confirm)
 15016  # %bb.351:
 15017  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output buffer
 15018  	cmp	rdx, rcx
 15019  	jbe	.LBB2_528	# output ends at or before input start: no overlap
 15020  .LBB2_168:	# count < 16 or the buffers overlap: process every element with the scalar loops
 15021  	xor	esi, esi	# rsi = element index, starting at 0
 15022  .LBB2_825:	# scalar loops, starting at index rsi (0 on the fall-through path)
 15023  	mov	r9, rsi
 15024  	not	r9
 15025  	add	r9, r10	# r9 = count - 1 - rsi
 15026  	mov	rdi, r10
 15027  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 15028  	je	.LBB2_827
 15029  .LBB2_826:                              # =>This Inner Loop Header: Depth=1
 15030  	movzx	edx, word ptr [rcx + 2*rsi]
 15031  	imul	dx, ax	# dx = low 16 bits of in[i] * scalar
 15032  	mov	word ptr [r8 + 2*rsi], dx
 15033  	add	rsi, 1
 15034  	add	rdi, -1
 15035  	jne	.LBB2_826
 15036  .LBB2_827:
 15037  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 15038  	jb	.LBB2_1069
 15039  .LBB2_828:                              # =>This Inner Loop Header: Depth=1
 15040  	movzx	edx, word ptr [rcx + 2*rsi]
 15041  	imul	dx, ax
 15042  	mov	word ptr [r8 + 2*rsi], dx
 15043  	movzx	edx, word ptr [rcx + 2*rsi + 2]
 15044  	imul	dx, ax
 15045  	mov	word ptr [r8 + 2*rsi + 2], dx
 15046  	movzx	edx, word ptr [rcx + 2*rsi + 4]
 15047  	imul	dx, ax
 15048  	mov	word ptr [r8 + 2*rsi + 4], dx
 15049  	movzx	edx, word ptr [rcx + 2*rsi + 6]
 15050  	imul	dx, ax
 15051  	mov	word ptr [r8 + 2*rsi + 6], dx
 15052  	add	rsi, 4	# four elements per iteration
 15053  	cmp	r10, rsi	# loop until index == count
 15054  	jne	.LBB2_828
 15055  	jmp	.LBB2_1069
 15056  .LBB2_169:	# 2-byte element case: out[i] = in[i] + scalar (16-bit wrap); in = rcx, out = r8, scalar at [rdx]
 15057  	test	r9d, r9d	# r9d = element count; signed <= 0 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 15058  	jle	.LBB2_1069
 15059  # %bb.170:
 15060  	movzx	eax, word ptr [rdx]	# eax = scalar operand, loaded once; rdx is reused as scratch below
 15061  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 15062  	cmp	r9d, 16	# fewer than 16 elements: use the scalar loops directly
 15063  	jb	.LBB2_171
 15064  # %bb.353:
 15065  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input buffer
 15066  	cmp	rdx, r8
 15067  	jbe	.LBB2_531	# input ends at or before output start: no overlap (.LBB2_531 is presumably the vectorized path -- confirm)
 15068  # %bb.354:
 15069  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output buffer
 15070  	cmp	rdx, rcx
 15071  	jbe	.LBB2_531	# output ends at or before input start: no overlap
 15072  .LBB2_171:	# count < 16 or the buffers overlap: process every element with the scalar loops
 15073  	xor	esi, esi	# rsi = element index, starting at 0
 15074  .LBB2_833:	# scalar loops, starting at index rsi (0 on the fall-through path)
 15075  	mov	r9, rsi
 15076  	not	r9
 15077  	add	r9, r10	# r9 = count - 1 - rsi
 15078  	mov	rdi, r10
 15079  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 15080  	je	.LBB2_835
 15081  .LBB2_834:                              # =>This Inner Loop Header: Depth=1
 15082  	movzx	edx, word ptr [rcx + 2*rsi]
 15083  	add	dx, ax	# dx = in[i] + scalar
 15084  	mov	word ptr [r8 + 2*rsi], dx
 15085  	add	rsi, 1
 15086  	add	rdi, -1
 15087  	jne	.LBB2_834
 15088  .LBB2_835:
 15089  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 15090  	jb	.LBB2_1069
 15091  .LBB2_836:                              # =>This Inner Loop Header: Depth=1
 15092  	movzx	edx, word ptr [rcx + 2*rsi]
 15093  	add	dx, ax
 15094  	mov	word ptr [r8 + 2*rsi], dx
 15095  	movzx	edx, word ptr [rcx + 2*rsi + 2]
 15096  	add	dx, ax
 15097  	mov	word ptr [r8 + 2*rsi + 2], dx
 15098  	movzx	edx, word ptr [rcx + 2*rsi + 4]
 15099  	add	dx, ax
 15100  	mov	word ptr [r8 + 2*rsi + 4], dx
 15101  	movzx	edx, word ptr [rcx + 2*rsi + 6]
 15102  	add	dx, ax
 15103  	mov	word ptr [r8 + 2*rsi + 6], dx
 15104  	add	rsi, 4	# four elements per iteration
 15105  	cmp	r10, rsi	# loop until index == count
 15106  	jne	.LBB2_836
 15107  	jmp	.LBB2_1069
 15108  .LBB2_172:	# 2-byte element case: out[i] = in[i] + scalar (16-bit wrap); in = rcx, out = r8, scalar at [rdx]
 15109  	test	r9d, r9d	# r9d = element count; signed <= 0 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 15110  	jle	.LBB2_1069
 15111  # %bb.173:
 15112  	movzx	eax, word ptr [rdx]	# eax = scalar operand, loaded once; rdx is reused as scratch below
 15113  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 15114  	cmp	r9d, 16	# fewer than 16 elements: use the scalar loops directly
 15115  	jb	.LBB2_174
 15116  # %bb.356:
 15117  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input buffer
 15118  	cmp	rdx, r8
 15119  	jbe	.LBB2_534	# input ends at or before output start: no overlap (.LBB2_534 is presumably the vectorized path -- confirm)
 15120  # %bb.357:
 15121  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output buffer
 15122  	cmp	rdx, rcx
 15123  	jbe	.LBB2_534	# output ends at or before input start: no overlap
 15124  .LBB2_174:	# count < 16 or the buffers overlap: process every element with the scalar loops
 15125  	xor	esi, esi	# rsi = element index, starting at 0
 15126  .LBB2_841:	# scalar loops, starting at index rsi (0 on the fall-through path)
 15127  	mov	r9, rsi
 15128  	not	r9
 15129  	add	r9, r10	# r9 = count - 1 - rsi
 15130  	mov	rdi, r10
 15131  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 15132  	je	.LBB2_843
 15133  .LBB2_842:                              # =>This Inner Loop Header: Depth=1
 15134  	movzx	edx, word ptr [rcx + 2*rsi]
 15135  	add	dx, ax	# dx = in[i] + scalar
 15136  	mov	word ptr [r8 + 2*rsi], dx
 15137  	add	rsi, 1
 15138  	add	rdi, -1
 15139  	jne	.LBB2_842
 15140  .LBB2_843:
 15141  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 15142  	jb	.LBB2_1069
 15143  .LBB2_844:                              # =>This Inner Loop Header: Depth=1
 15144  	movzx	edx, word ptr [rcx + 2*rsi]
 15145  	add	dx, ax
 15146  	mov	word ptr [r8 + 2*rsi], dx
 15147  	movzx	edx, word ptr [rcx + 2*rsi + 2]
 15148  	add	dx, ax
 15149  	mov	word ptr [r8 + 2*rsi + 2], dx
 15150  	movzx	edx, word ptr [rcx + 2*rsi + 4]
 15151  	add	dx, ax
 15152  	mov	word ptr [r8 + 2*rsi + 4], dx
 15153  	movzx	edx, word ptr [rcx + 2*rsi + 6]
 15154  	add	dx, ax
 15155  	mov	word ptr [r8 + 2*rsi + 6], dx
 15156  	add	rsi, 4	# four elements per iteration
 15157  	cmp	r10, rsi	# loop until index == count
 15158  	jne	.LBB2_844
 15159  	jmp	.LBB2_1069
 15160  .LBB2_175:	# 2-byte element case: out[i] = scalar - in[i] (16-bit wrap); in = rcx, out = r8, scalar at [rdx]
 15161  	test	r9d, r9d	# r9d = element count; signed <= 0 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 15162  	jle	.LBB2_1069
 15163  # %bb.176:
 15164  	movzx	eax, word ptr [rdx]	# eax = scalar operand, loaded once; rdx is reused as scratch below
 15165  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 15166  	cmp	r9d, 16	# fewer than 16 elements: use the scalar loops directly
 15167  	jb	.LBB2_177
 15168  # %bb.359:
 15169  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input buffer
 15170  	cmp	rdx, r8
 15171  	jbe	.LBB2_537	# input ends at or before output start: no overlap (.LBB2_537 is presumably the vectorized path -- confirm)
 15172  # %bb.360:
 15173  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output buffer
 15174  	cmp	rdx, rcx
 15175  	jbe	.LBB2_537	# output ends at or before input start: no overlap
 15176  .LBB2_177:	# count < 16 or the buffers overlap: process every element with the scalar loops
 15177  	xor	esi, esi	# rsi = element index, starting at 0
 15178  .LBB2_849:	# scalar loops, starting at index rsi (0 on the fall-through path)
 15179  	mov	r9, rsi
 15180  	not	r9
 15181  	add	r9, r10	# r9 = count - 1 - rsi
 15182  	mov	rdi, r10
 15183  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 15184  	je	.LBB2_851
 15185  .LBB2_850:                              # =>This Inner Loop Header: Depth=1
 15186  	mov	edx, eax	# start from the scalar each time: the scalar is the minuend
 15187  	sub	dx, word ptr [rcx + 2*rsi]	# dx = scalar - in[i]
 15188  	mov	word ptr [r8 + 2*rsi], dx
 15189  	add	rsi, 1
 15190  	add	rdi, -1
 15191  	jne	.LBB2_850
 15192  .LBB2_851:
 15193  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 15194  	jb	.LBB2_1069
 15195  .LBB2_852:                              # =>This Inner Loop Header: Depth=1
 15196  	mov	edx, eax
 15197  	sub	dx, word ptr [rcx + 2*rsi]
 15198  	mov	word ptr [r8 + 2*rsi], dx
 15199  	mov	edx, eax
 15200  	sub	dx, word ptr [rcx + 2*rsi + 2]
 15201  	mov	word ptr [r8 + 2*rsi + 2], dx
 15202  	mov	edx, eax
 15203  	sub	dx, word ptr [rcx + 2*rsi + 4]
 15204  	mov	word ptr [r8 + 2*rsi + 4], dx
 15205  	mov	edx, eax
 15206  	sub	dx, word ptr [rcx + 2*rsi + 6]
 15207  	mov	word ptr [r8 + 2*rsi + 6], dx
 15208  	add	rsi, 4	# four elements per iteration
 15209  	cmp	r10, rsi	# loop until index == count
 15210  	jne	.LBB2_852
 15211  	jmp	.LBB2_1069
 15212  .LBB2_178:	# 2-byte element case: out[i] = scalar - in[i] (16-bit wrap); in = rcx, out = r8, scalar at [rdx]
 15213  	test	r9d, r9d	# r9d = element count; signed <= 0 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 15214  	jle	.LBB2_1069
 15215  # %bb.179:
 15216  	movzx	eax, word ptr [rdx]	# eax = scalar operand, loaded once; rdx is reused as scratch below
 15217  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 15218  	cmp	r9d, 16	# fewer than 16 elements: use the scalar loops directly
 15219  	jb	.LBB2_180
 15220  # %bb.362:
 15221  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input buffer
 15222  	cmp	rdx, r8
 15223  	jbe	.LBB2_540	# input ends at or before output start: no overlap (.LBB2_540 is presumably the vectorized path -- confirm)
 15224  # %bb.363:
 15225  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output buffer
 15226  	cmp	rdx, rcx
 15227  	jbe	.LBB2_540	# output ends at or before input start: no overlap
 15228  .LBB2_180:	# count < 16 or the buffers overlap: process every element with the scalar loops
 15229  	xor	esi, esi	# rsi = element index, starting at 0
 15230  .LBB2_857:	# scalar loops, starting at index rsi (0 on the fall-through path)
 15231  	mov	r9, rsi
 15232  	not	r9
 15233  	add	r9, r10	# r9 = count - 1 - rsi
 15234  	mov	rdi, r10
 15235  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 15236  	je	.LBB2_859
 15237  .LBB2_858:                              # =>This Inner Loop Header: Depth=1
 15238  	mov	edx, eax	# start from the scalar each time: the scalar is the minuend
 15239  	sub	dx, word ptr [rcx + 2*rsi]	# dx = scalar - in[i]
 15240  	mov	word ptr [r8 + 2*rsi], dx
 15241  	add	rsi, 1
 15242  	add	rdi, -1
 15243  	jne	.LBB2_858
 15244  .LBB2_859:
 15245  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 15246  	jb	.LBB2_1069
 15247  .LBB2_860:                              # =>This Inner Loop Header: Depth=1
 15248  	mov	edx, eax
 15249  	sub	dx, word ptr [rcx + 2*rsi]
 15250  	mov	word ptr [r8 + 2*rsi], dx
 15251  	mov	edx, eax
 15252  	sub	dx, word ptr [rcx + 2*rsi + 2]
 15253  	mov	word ptr [r8 + 2*rsi + 2], dx
 15254  	mov	edx, eax
 15255  	sub	dx, word ptr [rcx + 2*rsi + 4]
 15256  	mov	word ptr [r8 + 2*rsi + 4], dx
 15257  	mov	edx, eax
 15258  	sub	dx, word ptr [rcx + 2*rsi + 6]
 15259  	mov	word ptr [r8 + 2*rsi + 6], dx
 15260  	add	rsi, 4	# four elements per iteration
 15261  	cmp	r10, rsi	# loop until index == count
 15262  	jne	.LBB2_860
 15263  	jmp	.LBB2_1069
 15264  .LBB2_181:	# 2-byte element case: out[i] = in[i] + scalar (16-bit wrap); in = rcx, out = r8, scalar at [rdx]
 15265  	test	r9d, r9d	# r9d = element count; signed <= 0 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 15266  	jle	.LBB2_1069
 15267  # %bb.182:
 15268  	movzx	eax, word ptr [rdx]	# eax = scalar operand, loaded once; rdx is reused as scratch below
 15269  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 15270  	cmp	r9d, 16	# fewer than 16 elements: use the scalar loops directly
 15271  	jb	.LBB2_183
 15272  # %bb.365:
 15273  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input buffer
 15274  	cmp	rdx, r8
 15275  	jbe	.LBB2_543	# input ends at or before output start: no overlap (.LBB2_543 is presumably the vectorized path -- confirm)
 15276  # %bb.366:
 15277  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output buffer
 15278  	cmp	rdx, rcx
 15279  	jbe	.LBB2_543	# output ends at or before input start: no overlap
 15280  .LBB2_183:	# count < 16 or the buffers overlap: process every element with the scalar loops
 15281  	xor	esi, esi	# rsi = element index, starting at 0
 15282  .LBB2_865:	# scalar loops, starting at index rsi (0 on the fall-through path)
 15283  	mov	r9, rsi
 15284  	not	r9
 15285  	add	r9, r10	# r9 = count - 1 - rsi
 15286  	mov	rdi, r10
 15287  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 15288  	je	.LBB2_867
 15289  .LBB2_866:                              # =>This Inner Loop Header: Depth=1
 15290  	movzx	edx, word ptr [rcx + 2*rsi]
 15291  	add	dx, ax	# dx = in[i] + scalar
 15292  	mov	word ptr [r8 + 2*rsi], dx
 15293  	add	rsi, 1
 15294  	add	rdi, -1
 15295  	jne	.LBB2_866
 15296  .LBB2_867:
 15297  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 15298  	jb	.LBB2_1069
 15299  .LBB2_868:                              # =>This Inner Loop Header: Depth=1
 15300  	movzx	edx, word ptr [rcx + 2*rsi]
 15301  	add	dx, ax
 15302  	mov	word ptr [r8 + 2*rsi], dx
 15303  	movzx	edx, word ptr [rcx + 2*rsi + 2]
 15304  	add	dx, ax
 15305  	mov	word ptr [r8 + 2*rsi + 2], dx
 15306  	movzx	edx, word ptr [rcx + 2*rsi + 4]
 15307  	add	dx, ax
 15308  	mov	word ptr [r8 + 2*rsi + 4], dx
 15309  	movzx	edx, word ptr [rcx + 2*rsi + 6]
 15310  	add	dx, ax
 15311  	mov	word ptr [r8 + 2*rsi + 6], dx
 15312  	add	rsi, 4	# four elements per iteration
 15313  	cmp	r10, rsi	# loop until index == count
 15314  	jne	.LBB2_868
 15315  	jmp	.LBB2_1069
 15316  .LBB2_184:	# 2-byte element case: out[i] = in[i] + scalar (16-bit wrap); in = rcx, out = r8, scalar at [rdx]
 15317  	test	r9d, r9d	# r9d = element count; signed <= 0 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 15318  	jle	.LBB2_1069
 15319  # %bb.185:
 15320  	movzx	eax, word ptr [rdx]	# eax = scalar operand, loaded once; rdx is reused as scratch below
 15321  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 15322  	cmp	r9d, 16	# fewer than 16 elements: use the scalar loops directly
 15323  	jb	.LBB2_186
 15324  # %bb.368:
 15325  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input buffer
 15326  	cmp	rdx, r8
 15327  	jbe	.LBB2_546	# input ends at or before output start: no overlap (.LBB2_546 is presumably the vectorized path -- confirm)
 15328  # %bb.369:
 15329  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output buffer
 15330  	cmp	rdx, rcx
 15331  	jbe	.LBB2_546	# output ends at or before input start: no overlap
 15332  .LBB2_186:	# count < 16 or the buffers overlap: process every element with the scalar loops
 15333  	xor	esi, esi	# rsi = element index, starting at 0
 15334  .LBB2_873:	# scalar loops, starting at index rsi (0 on the fall-through path)
 15335  	mov	r9, rsi
 15336  	not	r9
 15337  	add	r9, r10	# r9 = count - 1 - rsi
 15338  	mov	rdi, r10
 15339  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 15340  	je	.LBB2_875
 15341  .LBB2_874:                              # =>This Inner Loop Header: Depth=1
 15342  	movzx	edx, word ptr [rcx + 2*rsi]
 15343  	add	dx, ax	# dx = in[i] + scalar
 15344  	mov	word ptr [r8 + 2*rsi], dx
 15345  	add	rsi, 1
 15346  	add	rdi, -1
 15347  	jne	.LBB2_874
 15348  .LBB2_875:
 15349  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 15350  	jb	.LBB2_1069
 15351  .LBB2_876:                              # =>This Inner Loop Header: Depth=1
 15352  	movzx	edx, word ptr [rcx + 2*rsi]
 15353  	add	dx, ax
 15354  	mov	word ptr [r8 + 2*rsi], dx
 15355  	movzx	edx, word ptr [rcx + 2*rsi + 2]
 15356  	add	dx, ax
 15357  	mov	word ptr [r8 + 2*rsi + 2], dx
 15358  	movzx	edx, word ptr [rcx + 2*rsi + 4]
 15359  	add	dx, ax
 15360  	mov	word ptr [r8 + 2*rsi + 4], dx
 15361  	movzx	edx, word ptr [rcx + 2*rsi + 6]
 15362  	add	dx, ax
 15363  	mov	word ptr [r8 + 2*rsi + 6], dx
 15364  	add	rsi, 4	# four elements per iteration
 15365  	cmp	r10, rsi	# loop until index == count
 15366  	jne	.LBB2_876
 15367  	jmp	.LBB2_1069
 15368  .LBB2_187:	# 2-byte element case: out[i] = scalar - in[i] (16-bit wrap); in = rcx, out = r8, scalar at [rdx]
 15369  	test	r9d, r9d	# r9d = element count; signed <= 0 goes to .LBB2_1069 (presumably the common exit -- not visible here)
 15370  	jle	.LBB2_1069
 15371  # %bb.188:
 15372  	movzx	eax, word ptr [rdx]	# eax = scalar operand, loaded once; rdx is reused as scratch below
 15373  	mov	r10d, r9d	# r10 = count, zero-extended to 64 bits
 15374  	cmp	r9d, 16	# fewer than 16 elements: use the scalar loops directly
 15375  	jb	.LBB2_189
 15376  # %bb.371:
 15377  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input buffer
 15378  	cmp	rdx, r8
 15379  	jbe	.LBB2_549	# input ends at or before output start: no overlap (.LBB2_549 is presumably the vectorized path -- confirm)
 15380  # %bb.372:
 15381  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output buffer
 15382  	cmp	rdx, rcx
 15383  	jbe	.LBB2_549	# output ends at or before input start: no overlap
 15384  .LBB2_189:	# count < 16 or the buffers overlap: process every element with the scalar loops
 15385  	xor	esi, esi	# rsi = element index, starting at 0
 15386  .LBB2_881:	# scalar loops, starting at index rsi (0 on the fall-through path)
 15387  	mov	r9, rsi
 15388  	not	r9
 15389  	add	r9, r10	# r9 = count - 1 - rsi
 15390  	mov	rdi, r10
 15391  	and	rdi, 3	# rdi = count & 3 = iterations of the one-element loop below
 15392  	je	.LBB2_883
 15393  .LBB2_882:                              # =>This Inner Loop Header: Depth=1
 15394  	mov	edx, eax	# start from the scalar each time: the scalar is the minuend
 15395  	sub	dx, word ptr [rcx + 2*rsi]	# dx = scalar - in[i]
 15396  	mov	word ptr [r8 + 2*rsi], dx
 15397  	add	rsi, 1
 15398  	add	rdi, -1
 15399  	jne	.LBB2_882
 15400  .LBB2_883:
 15401  	cmp	r9, 3	# r9 < 3: fewer than 4 elements had to be processed, so skip the unrolled loop
 15402  	jb	.LBB2_1069
 15403  .LBB2_884:                              # =>This Inner Loop Header: Depth=1
 15404  	mov	edx, eax
 15405  	sub	dx, word ptr [rcx + 2*rsi]
 15406  	mov	word ptr [r8 + 2*rsi], dx
 15407  	mov	edx, eax
 15408  	sub	dx, word ptr [rcx + 2*rsi + 2]
 15409  	mov	word ptr [r8 + 2*rsi + 2], dx
 15410  	mov	edx, eax
 15411  	sub	dx, word ptr [rcx + 2*rsi + 4]
 15412  	mov	word ptr [r8 + 2*rsi + 4], dx
 15413  	mov	edx, eax
 15414  	sub	dx, word ptr [rcx + 2*rsi + 6]
 15415  	mov	word ptr [r8 + 2*rsi + 6], dx
 15416  	add	rsi, 4	# four elements per iteration
 15417  	cmp	r10, rsi	# loop until index == count
 15418  	jne	.LBB2_884
 15419  	jmp	.LBB2_1069
 15420  .LBB2_190:	# 16-bit case: out[i] = scalar - in[i]; in = rcx, out = r8, scalar at [rdx], count = r9d
 15421  	test	r9d, r9d
 15422  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15423  # %bb.191:
 15424  	movzx	eax, word ptr [rdx]	# eax = 16-bit scalar, zero-extended
 15425  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 15426  	cmp	r9d, 16
 15427  	jb	.LBB2_192	# fewer than 16 elements: skip the overlap test, use the scalar loops
 15428  # %bb.374:
 15429  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input range
 15430  	cmp	rdx, r8
 15431  	jbe	.LBB2_552	# input ends at or before the output start: disjoint; .LBB2_552 is outside this excerpt
 15432  # %bb.375:
 15433  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output range
 15434  	cmp	rdx, rcx
 15435  	jbe	.LBB2_552	# output ends at or before the input start: disjoint
 15436  .LBB2_192:	# reached when count < 16 or the two ranges overlap
 15437  	xor	esi, esi	# rsi = element index, starting at 0
 15438  .LBB2_889:	# scalar path; rsi = index of the first element still to process
 15439  	mov	r9, rsi
 15440  	not	r9
 15441  	add	r9, r10	# r9 = count - rsi - 1
 15442  	mov	rdi, r10
 15443  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 15444  	je	.LBB2_891
 15445  .LBB2_890:                              # =>This Inner Loop Header: Depth=1
 15446  	mov	edx, eax	# one element per iteration: dx = scalar
 15447  	sub	dx, word ptr [rcx + 2*rsi]	# dx = scalar - in[rsi]
 15448  	mov	word ptr [r8 + 2*rsi], dx	# out[rsi] = dx
 15449  	add	rsi, 1
 15450  	add	rdi, -1
 15451  	jne	.LBB2_890
 15452  .LBB2_891:
 15453  	cmp	r9, 3
 15454  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_889: skip the 4-wide loop
 15455  .LBB2_892:                              # =>This Inner Loop Header: Depth=1
 15456  	mov	edx, eax	# four elements per iteration, same operation as above
 15457  	sub	dx, word ptr [rcx + 2*rsi]
 15458  	mov	word ptr [r8 + 2*rsi], dx
 15459  	mov	edx, eax
 15460  	sub	dx, word ptr [rcx + 2*rsi + 2]
 15461  	mov	word ptr [r8 + 2*rsi + 2], dx
 15462  	mov	edx, eax
 15463  	sub	dx, word ptr [rcx + 2*rsi + 4]
 15464  	mov	word ptr [r8 + 2*rsi + 4], dx
 15465  	mov	edx, eax
 15466  	sub	dx, word ptr [rcx + 2*rsi + 6]
 15467  	mov	word ptr [r8 + 2*rsi + 6], dx
 15468  	add	rsi, 4
 15469  	cmp	r10, rsi
 15470  	jne	.LBB2_892	# repeat until rsi == count
 15471  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 15472  .LBB2_193:	# setup for a case with a 64-bit scalar; its loops (.LBB2_377 / .LBB2_379) are outside this excerpt
 15473  	test	r9d, r9d
 15474  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15475  # %bb.194:
 15476  	mov	rax, qword ptr [rdx]	# rax = 64-bit scalar operand
 15477  	mov	esi, r9d	# rsi = count, zero-extended
 15478  	lea	rdi, [rsi - 1]	# rdi = count - 1
 15479  	mov	r9d, esi
 15480  	and	r9d, 3	# r9 = count mod 4
 15481  	cmp	rdi, 3
 15482  	jae	.LBB2_377	# at least 4 elements: continue at .LBB2_377
 15483  # %bb.195:
 15484  	xor	edi, edi	# fewer than 4 elements: rdi = 0 before continuing at .LBB2_379
 15485  	jmp	.LBB2_379
 15486  .LBB2_196:	# float32 case: out[i] = in[i] * scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 15487  	test	r9d, r9d
 15488  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15489  # %bb.197:
 15490  	movss	xmm0, dword ptr [rdx]           # xmm0 = mem[0],zero,zero,zero
 15491  	mov	eax, r9d	# rax = count (32-bit write clears the upper half)
 15492  	cmp	r9d, 8
 15493  	jb	.LBB2_198	# fewer than 8 elements: skip the overlap test, use the scalar loops
 15494  # %bb.382:
 15495  	lea	rdx, [rcx + 4*rax]	# rdx = one past the end of the input range
 15496  	cmp	rdx, r8
 15497  	jbe	.LBB2_555	# input ends at or before the output start: disjoint; .LBB2_555 is outside this excerpt
 15498  # %bb.383:
 15499  	lea	rdx, [r8 + 4*rax]	# rdx = one past the end of the output range
 15500  	cmp	rdx, rcx
 15501  	jbe	.LBB2_555	# output ends at or before the input start: disjoint
 15502  .LBB2_198:	# reached when count < 8 or the two ranges overlap
 15503  	xor	edx, edx	# rdx = element index, starting at 0
 15504  .LBB2_897:	# scalar path; rdx = index of the first element still to process
 15505  	mov	rsi, rdx
 15506  	not	rsi
 15507  	add	rsi, rax	# rsi = count - rdx - 1
 15508  	mov	rdi, rax
 15509  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 15510  	je	.LBB2_899
 15511  .LBB2_898:                              # =>This Inner Loop Header: Depth=1
 15512  	movss	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 15513  	mulss	xmm1, xmm0	# xmm1 = in[rdx] * scalar
 15514  	movss	dword ptr [r8 + 4*rdx], xmm1	# out[rdx] = xmm1
 15515  	add	rdx, 1
 15516  	add	rdi, -1
 15517  	jne	.LBB2_898
 15518  .LBB2_899:
 15519  	cmp	rsi, 3
 15520  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_897: skip the 4-wide loop
 15521  .LBB2_900:                              # =>This Inner Loop Header: Depth=1
 15522  	movss	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 15523  	mulss	xmm1, xmm0	# four elements per iteration, same operation as above
 15524  	movss	dword ptr [r8 + 4*rdx], xmm1
 15525  	movss	xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero
 15526  	mulss	xmm1, xmm0
 15527  	movss	dword ptr [r8 + 4*rdx + 4], xmm1
 15528  	movss	xmm1, dword ptr [rcx + 4*rdx + 8] # xmm1 = mem[0],zero,zero,zero
 15529  	mulss	xmm1, xmm0
 15530  	movss	dword ptr [r8 + 4*rdx + 8], xmm1
 15531  	movss	xmm1, dword ptr [rcx + 4*rdx + 12] # xmm1 = mem[0],zero,zero,zero
 15532  	mulss	xmm1, xmm0
 15533  	movss	dword ptr [r8 + 4*rdx + 12], xmm1
 15534  	add	rdx, 4
 15535  	cmp	rax, rdx
 15536  	jne	.LBB2_900	# repeat until rdx == count
 15537  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 15538  .LBB2_199:	# setup for a case with a 64-bit scalar; its loops (.LBB2_385 / .LBB2_387) are outside this excerpt
 15539  	test	r9d, r9d
 15540  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15541  # %bb.200:
 15542  	mov	rax, qword ptr [rdx]	# rax = 64-bit scalar operand
 15543  	mov	esi, r9d	# rsi = count, zero-extended
 15544  	lea	rdi, [rsi - 1]	# rdi = count - 1
 15545  	mov	r9d, esi
 15546  	and	r9d, 3	# r9 = count mod 4
 15547  	cmp	rdi, 3
 15548  	jae	.LBB2_385	# at least 4 elements: continue at .LBB2_385
 15549  # %bb.201:
 15550  	xor	edi, edi	# fewer than 4 elements: rdi = 0 before continuing at .LBB2_387
 15551  	jmp	.LBB2_387
 15552  .LBB2_202:	# float32 case: out[i] = in[i] * scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 15553  	test	r9d, r9d
 15554  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15555  # %bb.203:
 15556  	movss	xmm0, dword ptr [rdx]           # xmm0 = mem[0],zero,zero,zero
 15557  	mov	eax, r9d	# rax = count (32-bit write clears the upper half)
 15558  	cmp	r9d, 8
 15559  	jb	.LBB2_204	# fewer than 8 elements: skip the overlap test, use the scalar loops
 15560  # %bb.390:
 15561  	lea	rdx, [rcx + 4*rax]	# rdx = one past the end of the input range
 15562  	cmp	rdx, r8
 15563  	jbe	.LBB2_558	# input ends at or before the output start: disjoint; .LBB2_558 is outside this excerpt
 15564  # %bb.391:
 15565  	lea	rdx, [r8 + 4*rax]	# rdx = one past the end of the output range
 15566  	cmp	rdx, rcx
 15567  	jbe	.LBB2_558	# output ends at or before the input start: disjoint
 15568  .LBB2_204:	# reached when count < 8 or the two ranges overlap
 15569  	xor	edx, edx	# rdx = element index, starting at 0
 15570  .LBB2_905:	# scalar path; rdx = index of the first element still to process
 15571  	mov	rsi, rdx
 15572  	not	rsi
 15573  	add	rsi, rax	# rsi = count - rdx - 1
 15574  	mov	rdi, rax
 15575  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 15576  	je	.LBB2_907
 15577  .LBB2_906:                              # =>This Inner Loop Header: Depth=1
 15578  	movss	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 15579  	mulss	xmm1, xmm0	# xmm1 = in[rdx] * scalar
 15580  	movss	dword ptr [r8 + 4*rdx], xmm1	# out[rdx] = xmm1
 15581  	add	rdx, 1
 15582  	add	rdi, -1
 15583  	jne	.LBB2_906
 15584  .LBB2_907:
 15585  	cmp	rsi, 3
 15586  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_905: skip the 4-wide loop
 15587  .LBB2_908:                              # =>This Inner Loop Header: Depth=1
 15588  	movss	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 15589  	mulss	xmm1, xmm0	# four elements per iteration, same operation as above
 15590  	movss	dword ptr [r8 + 4*rdx], xmm1
 15591  	movss	xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero
 15592  	mulss	xmm1, xmm0
 15593  	movss	dword ptr [r8 + 4*rdx + 4], xmm1
 15594  	movss	xmm1, dword ptr [rcx + 4*rdx + 8] # xmm1 = mem[0],zero,zero,zero
 15595  	mulss	xmm1, xmm0
 15596  	movss	dword ptr [r8 + 4*rdx + 8], xmm1
 15597  	movss	xmm1, dword ptr [rcx + 4*rdx + 12] # xmm1 = mem[0],zero,zero,zero
 15598  	mulss	xmm1, xmm0
 15599  	movss	dword ptr [r8 + 4*rdx + 12], xmm1
 15600  	add	rdx, 4
 15601  	cmp	rax, rdx
 15602  	jne	.LBB2_908	# repeat until rdx == count
 15603  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 15604  .LBB2_205:	# 64-bit case: out[i] = in[i] + scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 15605  	test	r9d, r9d
 15606  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15607  # %bb.206:
 15608  	mov	rax, qword ptr [rdx]	# rax = 64-bit scalar operand
 15609  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 15610  	cmp	r9d, 4
 15611  	jb	.LBB2_207	# fewer than 4 elements: skip the overlap test, use the scalar loops
 15612  # %bb.393:
 15613  	lea	rdx, [rcx + 8*r10]	# rdx = one past the end of the input range
 15614  	cmp	rdx, r8
 15615  	jbe	.LBB2_561	# input ends at or before the output start: disjoint; .LBB2_561 is outside this excerpt
 15616  # %bb.394:
 15617  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output range
 15618  	cmp	rdx, rcx
 15619  	jbe	.LBB2_561	# output ends at or before the input start: disjoint
 15620  .LBB2_207:	# reached when count < 4 or the two ranges overlap
 15621  	xor	esi, esi	# rsi = element index, starting at 0
 15622  .LBB2_913:	# scalar path; rsi = index of the first element still to process
 15623  	mov	r9, rsi
 15624  	not	r9
 15625  	add	r9, r10	# r9 = count - rsi - 1
 15626  	mov	rdi, r10
 15627  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 15628  	je	.LBB2_915
 15629  .LBB2_914:                              # =>This Inner Loop Header: Depth=1
 15630  	mov	rdx, qword ptr [rcx + 8*rsi]	# one element per iteration: rdx = in[rsi]
 15631  	add	rdx, rax	# rdx = in[rsi] + scalar
 15632  	mov	qword ptr [r8 + 8*rsi], rdx	# out[rsi] = rdx
 15633  	add	rsi, 1
 15634  	add	rdi, -1
 15635  	jne	.LBB2_914
 15636  .LBB2_915:
 15637  	cmp	r9, 3
 15638  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_913: skip the 4-wide loop
 15639  .LBB2_916:                              # =>This Inner Loop Header: Depth=1
 15640  	mov	rdx, qword ptr [rcx + 8*rsi]	# four elements per iteration, same operation as above
 15641  	add	rdx, rax
 15642  	mov	qword ptr [r8 + 8*rsi], rdx
 15643  	mov	rdx, qword ptr [rcx + 8*rsi + 8]
 15644  	add	rdx, rax
 15645  	mov	qword ptr [r8 + 8*rsi + 8], rdx
 15646  	mov	rdx, qword ptr [rcx + 8*rsi + 16]
 15647  	add	rdx, rax
 15648  	mov	qword ptr [r8 + 8*rsi + 16], rdx
 15649  	mov	rdx, qword ptr [rcx + 8*rsi + 24]
 15650  	add	rdx, rax
 15651  	mov	qword ptr [r8 + 8*rsi + 24], rdx
 15652  	add	rsi, 4
 15653  	cmp	r10, rsi
 15654  	jne	.LBB2_916	# repeat until rsi == count
 15655  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 15656  .LBB2_208:	# float32 case: out[i] = in[i] + scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 15657  	test	r9d, r9d
 15658  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15659  # %bb.209:
 15660  	movss	xmm0, dword ptr [rdx]           # xmm0 = mem[0],zero,zero,zero
 15661  	mov	eax, r9d	# rax = count (32-bit write clears the upper half)
 15662  	cmp	r9d, 8
 15663  	jb	.LBB2_210	# fewer than 8 elements: skip the overlap test, use the scalar loops
 15664  # %bb.396:
 15665  	lea	rdx, [rcx + 4*rax]	# rdx = one past the end of the input range
 15666  	cmp	rdx, r8
 15667  	jbe	.LBB2_564	# input ends at or before the output start: disjoint; .LBB2_564 is outside this excerpt
 15668  # %bb.397:
 15669  	lea	rdx, [r8 + 4*rax]	# rdx = one past the end of the output range
 15670  	cmp	rdx, rcx
 15671  	jbe	.LBB2_564	# output ends at or before the input start: disjoint
 15672  .LBB2_210:	# reached when count < 8 or the two ranges overlap
 15673  	xor	edx, edx	# rdx = element index, starting at 0
 15674  .LBB2_921:	# scalar path; rdx = index of the first element still to process
 15675  	mov	rsi, rdx
 15676  	not	rsi
 15677  	add	rsi, rax	# rsi = count - rdx - 1
 15678  	mov	rdi, rax
 15679  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 15680  	je	.LBB2_923
 15681  .LBB2_922:                              # =>This Inner Loop Header: Depth=1
 15682  	movss	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 15683  	addss	xmm1, xmm0	# xmm1 = in[rdx] + scalar
 15684  	movss	dword ptr [r8 + 4*rdx], xmm1	# out[rdx] = xmm1
 15685  	add	rdx, 1
 15686  	add	rdi, -1
 15687  	jne	.LBB2_922
 15688  .LBB2_923:
 15689  	cmp	rsi, 3
 15690  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_921: skip the 4-wide loop
 15691  .LBB2_924:                              # =>This Inner Loop Header: Depth=1
 15692  	movss	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 15693  	addss	xmm1, xmm0	# four elements per iteration, same operation as above
 15694  	movss	dword ptr [r8 + 4*rdx], xmm1
 15695  	movss	xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero
 15696  	addss	xmm1, xmm0
 15697  	movss	dword ptr [r8 + 4*rdx + 4], xmm1
 15698  	movss	xmm1, dword ptr [rcx + 4*rdx + 8] # xmm1 = mem[0],zero,zero,zero
 15699  	addss	xmm1, xmm0
 15700  	movss	dword ptr [r8 + 4*rdx + 8], xmm1
 15701  	movss	xmm1, dword ptr [rcx + 4*rdx + 12] # xmm1 = mem[0],zero,zero,zero
 15702  	addss	xmm1, xmm0
 15703  	movss	dword ptr [r8 + 4*rdx + 12], xmm1
 15704  	add	rdx, 4
 15705  	cmp	rax, rdx
 15706  	jne	.LBB2_924	# repeat until rdx == count
 15707  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 15708  .LBB2_211:	# 64-bit case: out[i] = scalar - in[i]; in = rcx, out = r8, scalar at [rdx], count = r9d
 15709  	test	r9d, r9d
 15710  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15711  # %bb.212:
 15712  	mov	r11, qword ptr [rdx]	# r11 = 64-bit scalar operand (rax is the per-element work register)
 15713  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 15714  	cmp	r9d, 4
 15715  	jb	.LBB2_213	# fewer than 4 elements: skip the overlap test, use the scalar loops
 15716  # %bb.399:
 15717  	lea	rdx, [rcx + 8*r10]	# rdx = one past the end of the input range
 15718  	cmp	rdx, r8
 15719  	jbe	.LBB2_567	# input ends at or before the output start: disjoint; .LBB2_567 is outside this excerpt
 15720  # %bb.400:
 15721  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output range
 15722  	cmp	rdx, rcx
 15723  	jbe	.LBB2_567	# output ends at or before the input start: disjoint
 15724  .LBB2_213:	# reached when count < 4 or the two ranges overlap
 15725  	xor	esi, esi	# rsi = element index, starting at 0
 15726  .LBB2_929:	# scalar path; rsi = index of the first element still to process
 15727  	mov	rdx, rsi
 15728  	not	rdx
 15729  	add	rdx, r10	# rdx = count - rsi - 1
 15730  	mov	rdi, r10
 15731  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 15732  	je	.LBB2_931
 15733  .LBB2_930:                              # =>This Inner Loop Header: Depth=1
 15734  	mov	rax, r11	# one element per iteration: rax = scalar
 15735  	sub	rax, qword ptr [rcx + 8*rsi]	# rax = scalar - in[rsi]
 15736  	mov	qword ptr [r8 + 8*rsi], rax	# out[rsi] = rax
 15737  	add	rsi, 1
 15738  	add	rdi, -1
 15739  	jne	.LBB2_930
 15740  .LBB2_931:
 15741  	cmp	rdx, 3
 15742  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_929: skip the 4-wide loop
 15743  .LBB2_932:                              # =>This Inner Loop Header: Depth=1
 15744  	mov	rax, r11	# four elements per iteration, same operation as above
 15745  	sub	rax, qword ptr [rcx + 8*rsi]
 15746  	mov	qword ptr [r8 + 8*rsi], rax
 15747  	mov	rax, r11
 15748  	sub	rax, qword ptr [rcx + 8*rsi + 8]
 15749  	mov	qword ptr [r8 + 8*rsi + 8], rax
 15750  	mov	rax, r11
 15751  	sub	rax, qword ptr [rcx + 8*rsi + 16]
 15752  	mov	qword ptr [r8 + 8*rsi + 16], rax
 15753  	mov	rax, r11
 15754  	sub	rax, qword ptr [rcx + 8*rsi + 24]
 15755  	mov	qword ptr [r8 + 8*rsi + 24], rax
 15756  	add	rsi, 4
 15757  	cmp	r10, rsi
 15758  	jne	.LBB2_932	# repeat until rsi == count
 15759  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 15760  .LBB2_214:	# float32 case: out[i] = scalar - in[i]; in = rcx, out = r8, scalar at [rdx], count = r9d
 15761  	test	r9d, r9d
 15762  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15763  # %bb.215:
 15764  	movss	xmm0, dword ptr [rdx]           # xmm0 = mem[0],zero,zero,zero
 15765  	mov	eax, r9d	# rax = count (32-bit write clears the upper half)
 15766  	cmp	r9d, 8
 15767  	jb	.LBB2_216	# fewer than 8 elements: skip the overlap test, use the scalar loops
 15768  # %bb.402:
 15769  	lea	rdx, [rcx + 4*rax]	# rdx = one past the end of the input range
 15770  	cmp	rdx, r8
 15771  	jbe	.LBB2_570	# input ends at or before the output start: disjoint; .LBB2_570 is outside this excerpt
 15772  # %bb.403:
 15773  	lea	rdx, [r8 + 4*rax]	# rdx = one past the end of the output range
 15774  	cmp	rdx, rcx
 15775  	jbe	.LBB2_570	# output ends at or before the input start: disjoint
 15776  .LBB2_216:	# reached when count < 8 or the two ranges overlap
 15777  	xor	edx, edx	# rdx = element index, starting at 0
 15778  .LBB2_937:	# scalar path; rdx = index of the first element still to process
 15779  	mov	rsi, rdx
 15780  	not	rsi
 15781  	add	rsi, rax	# rsi = count - rdx - 1
 15782  	mov	rdi, rax
 15783  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 15784  	je	.LBB2_939
 15785  .LBB2_938:                              # =>This Inner Loop Header: Depth=1
 15786  	movaps	xmm1, xmm0	# one element per iteration: xmm1 = scalar
 15787  	subss	xmm1, dword ptr [rcx + 4*rdx]	# xmm1 = scalar - in[rdx]
 15788  	movss	dword ptr [r8 + 4*rdx], xmm1	# out[rdx] = xmm1
 15789  	add	rdx, 1
 15790  	add	rdi, -1
 15791  	jne	.LBB2_938
 15792  .LBB2_939:
 15793  	cmp	rsi, 3
 15794  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_937: skip the 4-wide loop
 15795  .LBB2_940:                              # =>This Inner Loop Header: Depth=1
 15796  	movaps	xmm1, xmm0	# four elements per iteration, same operation as above
 15797  	subss	xmm1, dword ptr [rcx + 4*rdx]
 15798  	movss	dword ptr [r8 + 4*rdx], xmm1
 15799  	movaps	xmm1, xmm0
 15800  	subss	xmm1, dword ptr [rcx + 4*rdx + 4]
 15801  	movss	dword ptr [r8 + 4*rdx + 4], xmm1
 15802  	movaps	xmm1, xmm0
 15803  	subss	xmm1, dword ptr [rcx + 4*rdx + 8]
 15804  	movss	dword ptr [r8 + 4*rdx + 8], xmm1
 15805  	movaps	xmm1, xmm0
 15806  	subss	xmm1, dword ptr [rcx + 4*rdx + 12]
 15807  	movss	dword ptr [r8 + 4*rdx + 12], xmm1
 15808  	add	rdx, 4
 15809  	cmp	rax, rdx
 15810  	jne	.LBB2_940	# repeat until rdx == count
 15811  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 15812  .LBB2_217:	# 64-bit case: out[i] = in[i] + scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 15813  	test	r9d, r9d
 15814  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15815  # %bb.218:
 15816  	mov	rax, qword ptr [rdx]	# rax = 64-bit scalar operand
 15817  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 15818  	cmp	r9d, 4
 15819  	jb	.LBB2_219	# fewer than 4 elements: skip the overlap test, use the scalar loops
 15820  # %bb.405:
 15821  	lea	rdx, [rcx + 8*r10]	# rdx = one past the end of the input range
 15822  	cmp	rdx, r8
 15823  	jbe	.LBB2_573	# input ends at or before the output start: disjoint; .LBB2_573 is outside this excerpt
 15824  # %bb.406:
 15825  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output range
 15826  	cmp	rdx, rcx
 15827  	jbe	.LBB2_573	# output ends at or before the input start: disjoint
 15828  .LBB2_219:	# reached when count < 4 or the two ranges overlap
 15829  	xor	esi, esi	# rsi = element index, starting at 0
 15830  .LBB2_945:	# scalar path; rsi = index of the first element still to process
 15831  	mov	r9, rsi
 15832  	not	r9
 15833  	add	r9, r10	# r9 = count - rsi - 1
 15834  	mov	rdi, r10
 15835  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 15836  	je	.LBB2_947
 15837  .LBB2_946:                              # =>This Inner Loop Header: Depth=1
 15838  	mov	rdx, qword ptr [rcx + 8*rsi]	# one element per iteration: rdx = in[rsi]
 15839  	add	rdx, rax	# rdx = in[rsi] + scalar
 15840  	mov	qword ptr [r8 + 8*rsi], rdx	# out[rsi] = rdx
 15841  	add	rsi, 1
 15842  	add	rdi, -1
 15843  	jne	.LBB2_946
 15844  .LBB2_947:
 15845  	cmp	r9, 3
 15846  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_945: skip the 4-wide loop
 15847  .LBB2_948:                              # =>This Inner Loop Header: Depth=1
 15848  	mov	rdx, qword ptr [rcx + 8*rsi]	# four elements per iteration, same operation as above
 15849  	add	rdx, rax
 15850  	mov	qword ptr [r8 + 8*rsi], rdx
 15851  	mov	rdx, qword ptr [rcx + 8*rsi + 8]
 15852  	add	rdx, rax
 15853  	mov	qword ptr [r8 + 8*rsi + 8], rdx
 15854  	mov	rdx, qword ptr [rcx + 8*rsi + 16]
 15855  	add	rdx, rax
 15856  	mov	qword ptr [r8 + 8*rsi + 16], rdx
 15857  	mov	rdx, qword ptr [rcx + 8*rsi + 24]
 15858  	add	rdx, rax
 15859  	mov	qword ptr [r8 + 8*rsi + 24], rdx
 15860  	add	rsi, 4
 15861  	cmp	r10, rsi
 15862  	jne	.LBB2_948	# repeat until rsi == count
 15863  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 15864  .LBB2_220:	# float32 case: out[i] = in[i] + scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 15865  	test	r9d, r9d
 15866  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15867  # %bb.221:
 15868  	movss	xmm0, dword ptr [rdx]           # xmm0 = mem[0],zero,zero,zero
 15869  	mov	eax, r9d	# rax = count (32-bit write clears the upper half)
 15870  	cmp	r9d, 8
 15871  	jb	.LBB2_222	# fewer than 8 elements: skip the overlap test, use the scalar loops
 15872  # %bb.408:
 15873  	lea	rdx, [rcx + 4*rax]	# rdx = one past the end of the input range
 15874  	cmp	rdx, r8
 15875  	jbe	.LBB2_576	# input ends at or before the output start: disjoint; .LBB2_576 is outside this excerpt
 15876  # %bb.409:
 15877  	lea	rdx, [r8 + 4*rax]	# rdx = one past the end of the output range
 15878  	cmp	rdx, rcx
 15879  	jbe	.LBB2_576	# output ends at or before the input start: disjoint
 15880  .LBB2_222:	# reached when count < 8 or the two ranges overlap
 15881  	xor	edx, edx	# rdx = element index, starting at 0
 15882  .LBB2_953:	# scalar path; rdx = index of the first element still to process
 15883  	mov	rsi, rdx
 15884  	not	rsi
 15885  	add	rsi, rax	# rsi = count - rdx - 1
 15886  	mov	rdi, rax
 15887  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 15888  	je	.LBB2_955
 15889  .LBB2_954:                              # =>This Inner Loop Header: Depth=1
 15890  	movss	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 15891  	addss	xmm1, xmm0	# xmm1 = in[rdx] + scalar
 15892  	movss	dword ptr [r8 + 4*rdx], xmm1	# out[rdx] = xmm1
 15893  	add	rdx, 1
 15894  	add	rdi, -1
 15895  	jne	.LBB2_954
 15896  .LBB2_955:
 15897  	cmp	rsi, 3
 15898  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_953: skip the 4-wide loop
 15899  .LBB2_956:                              # =>This Inner Loop Header: Depth=1
 15900  	movss	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 15901  	addss	xmm1, xmm0	# four elements per iteration, same operation as above
 15902  	movss	dword ptr [r8 + 4*rdx], xmm1
 15903  	movss	xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero
 15904  	addss	xmm1, xmm0
 15905  	movss	dword ptr [r8 + 4*rdx + 4], xmm1
 15906  	movss	xmm1, dword ptr [rcx + 4*rdx + 8] # xmm1 = mem[0],zero,zero,zero
 15907  	addss	xmm1, xmm0
 15908  	movss	dword ptr [r8 + 4*rdx + 8], xmm1
 15909  	movss	xmm1, dword ptr [rcx + 4*rdx + 12] # xmm1 = mem[0],zero,zero,zero
 15910  	addss	xmm1, xmm0
 15911  	movss	dword ptr [r8 + 4*rdx + 12], xmm1
 15912  	add	rdx, 4
 15913  	cmp	rax, rdx
 15914  	jne	.LBB2_956	# repeat until rdx == count
 15915  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 15916  .LBB2_223:	# 64-bit case: out[i] = scalar - in[i]; in = rcx, out = r8, scalar at [rdx], count = r9d
 15917  	test	r9d, r9d
 15918  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15919  # %bb.224:
 15920  	mov	r11, qword ptr [rdx]	# r11 = 64-bit scalar operand (rax is the per-element work register)
 15921  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 15922  	cmp	r9d, 4
 15923  	jb	.LBB2_225	# fewer than 4 elements: skip the overlap test, use the scalar loops
 15924  # %bb.411:
 15925  	lea	rdx, [rcx + 8*r10]	# rdx = one past the end of the input range
 15926  	cmp	rdx, r8
 15927  	jbe	.LBB2_579	# input ends at or before the output start: disjoint; .LBB2_579 is outside this excerpt
 15928  # %bb.412:
 15929  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output range
 15930  	cmp	rdx, rcx
 15931  	jbe	.LBB2_579	# output ends at or before the input start: disjoint
 15932  .LBB2_225:	# reached when count < 4 or the two ranges overlap
 15933  	xor	esi, esi	# rsi = element index, starting at 0
 15934  .LBB2_961:	# scalar path; rsi = index of the first element still to process
 15935  	mov	rdx, rsi
 15936  	not	rdx
 15937  	add	rdx, r10	# rdx = count - rsi - 1
 15938  	mov	rdi, r10
 15939  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 15940  	je	.LBB2_963
 15941  .LBB2_962:                              # =>This Inner Loop Header: Depth=1
 15942  	mov	rax, r11	# one element per iteration: rax = scalar
 15943  	sub	rax, qword ptr [rcx + 8*rsi]	# rax = scalar - in[rsi]
 15944  	mov	qword ptr [r8 + 8*rsi], rax	# out[rsi] = rax
 15945  	add	rsi, 1
 15946  	add	rdi, -1
 15947  	jne	.LBB2_962
 15948  .LBB2_963:
 15949  	cmp	rdx, 3
 15950  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_961: skip the 4-wide loop
 15951  .LBB2_964:                              # =>This Inner Loop Header: Depth=1
 15952  	mov	rax, r11	# four elements per iteration, same operation as above
 15953  	sub	rax, qword ptr [rcx + 8*rsi]
 15954  	mov	qword ptr [r8 + 8*rsi], rax
 15955  	mov	rax, r11
 15956  	sub	rax, qword ptr [rcx + 8*rsi + 8]
 15957  	mov	qword ptr [r8 + 8*rsi + 8], rax
 15958  	mov	rax, r11
 15959  	sub	rax, qword ptr [rcx + 8*rsi + 16]
 15960  	mov	qword ptr [r8 + 8*rsi + 16], rax
 15961  	mov	rax, r11
 15962  	sub	rax, qword ptr [rcx + 8*rsi + 24]
 15963  	mov	qword ptr [r8 + 8*rsi + 24], rax
 15964  	add	rsi, 4
 15965  	cmp	r10, rsi
 15966  	jne	.LBB2_964	# repeat until rsi == count
 15967  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 15968  .LBB2_226:	# float32 case: out[i] = scalar - in[i]; in = rcx, out = r8, scalar at [rdx], count = r9d
 15969  	test	r9d, r9d
 15970  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 15971  # %bb.227:
 15972  	movss	xmm0, dword ptr [rdx]           # xmm0 = mem[0],zero,zero,zero
 15973  	mov	eax, r9d	# rax = count (32-bit write clears the upper half)
 15974  	cmp	r9d, 8
 15975  	jb	.LBB2_228	# fewer than 8 elements: skip the overlap test, use the scalar loops
 15976  # %bb.414:
 15977  	lea	rdx, [rcx + 4*rax]	# rdx = one past the end of the input range
 15978  	cmp	rdx, r8
 15979  	jbe	.LBB2_582	# input ends at or before the output start: disjoint; .LBB2_582 is outside this excerpt
 15980  # %bb.415:
 15981  	lea	rdx, [r8 + 4*rax]	# rdx = one past the end of the output range
 15982  	cmp	rdx, rcx
 15983  	jbe	.LBB2_582	# output ends at or before the input start: disjoint
 15984  .LBB2_228:	# reached when count < 8 or the two ranges overlap
 15985  	xor	edx, edx	# rdx = element index, starting at 0
 15986  .LBB2_969:	# scalar path; rdx = index of the first element still to process
 15987  	mov	rsi, rdx
 15988  	not	rsi
 15989  	add	rsi, rax	# rsi = count - rdx - 1
 15990  	mov	rdi, rax
 15991  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 15992  	je	.LBB2_971
 15993  .LBB2_970:                              # =>This Inner Loop Header: Depth=1
 15994  	movaps	xmm1, xmm0	# one element per iteration: xmm1 = scalar
 15995  	subss	xmm1, dword ptr [rcx + 4*rdx]	# xmm1 = scalar - in[rdx]
 15996  	movss	dword ptr [r8 + 4*rdx], xmm1	# out[rdx] = xmm1
 15997  	add	rdx, 1
 15998  	add	rdi, -1
 15999  	jne	.LBB2_970
 16000  .LBB2_971:
 16001  	cmp	rsi, 3
 16002  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_969: skip the 4-wide loop
 16003  .LBB2_972:                              # =>This Inner Loop Header: Depth=1
 16004  	movaps	xmm1, xmm0	# four elements per iteration, same operation as above
 16005  	subss	xmm1, dword ptr [rcx + 4*rdx]
 16006  	movss	dword ptr [r8 + 4*rdx], xmm1
 16007  	movaps	xmm1, xmm0
 16008  	subss	xmm1, dword ptr [rcx + 4*rdx + 4]
 16009  	movss	dword ptr [r8 + 4*rdx + 4], xmm1
 16010  	movaps	xmm1, xmm0
 16011  	subss	xmm1, dword ptr [rcx + 4*rdx + 8]
 16012  	movss	dword ptr [r8 + 4*rdx + 8], xmm1
 16013  	movaps	xmm1, xmm0
 16014  	subss	xmm1, dword ptr [rcx + 4*rdx + 12]
 16015  	movss	dword ptr [r8 + 4*rdx + 12], xmm1
 16016  	add	rdx, 4
 16017  	cmp	rax, rdx
 16018  	jne	.LBB2_972	# repeat until rdx == count
 16019  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 16020  .LBB2_229:	# 8-bit case: out[i] = low byte of in[i] * scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 16021  	test	r9d, r9d
 16022  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 16023  # %bb.230:
 16024  	mov	dl, byte ptr [rdx]	# dl = 8-bit scalar operand (al is needed by mul, so the scalar stays in dl)
 16025  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 16026  	cmp	r9d, 32
 16027  	jb	.LBB2_231	# fewer than 32 elements: skip the overlap test, use the scalar loops
 16028  # %bb.417:
 16029  	lea	rax, [rcx + r10]	# rax = one past the end of the input range
 16030  	cmp	rax, r8
 16031  	jbe	.LBB2_585	# input ends at or before the output start: disjoint; .LBB2_585 is outside this excerpt
 16032  # %bb.418:
 16033  	lea	rax, [r8 + r10]	# rax = one past the end of the output range
 16034  	cmp	rax, rcx
 16035  	jbe	.LBB2_585	# output ends at or before the input start: disjoint
 16036  .LBB2_231:	# reached when count < 32 or the two ranges overlap
 16037  	xor	edi, edi	# rdi = element index, starting at 0
 16038  .LBB2_977:	# scalar path; rdi = index of the first element still to process
 16039  	mov	r9, rdi
 16040  	not	r9
 16041  	add	r9, r10	# r9 = count - rdi - 1
 16042  	mov	rsi, r10
 16043  	and	rsi, 3	# rsi = count mod 4 = iterations of the one-element loop
 16044  	je	.LBB2_979
 16045  .LBB2_978:                              # =>This Inner Loop Header: Depth=1
 16046  	movzx	eax, byte ptr [rcx + rdi]	# one element per iteration: al = in[rdi]
 16047  	mul	dl	# ax = al * dl (unsigned 8x8 -> 16)
 16048  	mov	byte ptr [r8 + rdi], al	# out[rdi] = low 8 bits of the product
 16049  	add	rdi, 1
 16050  	add	rsi, -1
 16051  	jne	.LBB2_978
 16052  .LBB2_979:
 16053  	cmp	r9, 3
 16054  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_977: skip the 4-wide loop
 16055  .LBB2_980:                              # =>This Inner Loop Header: Depth=1
 16056  	movzx	eax, byte ptr [rcx + rdi]	# four elements per iteration, same operation as above
 16057  	mul	dl
 16058  	mov	byte ptr [r8 + rdi], al
 16059  	movzx	eax, byte ptr [rcx + rdi + 1]
 16060  	mul	dl
 16061  	mov	byte ptr [r8 + rdi + 1], al
 16062  	movzx	eax, byte ptr [rcx + rdi + 2]
 16063  	mul	dl
 16064  	mov	byte ptr [r8 + rdi + 2], al
 16065  	movzx	eax, byte ptr [rcx + rdi + 3]
 16066  	mul	dl
 16067  	mov	byte ptr [r8 + rdi + 3], al
 16068  	add	rdi, 4
 16069  	cmp	r10, rdi
 16070  	jne	.LBB2_980	# repeat until rdi == count
 16071  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 16072  .LBB2_232:	# 8-bit case: out[i] = low byte of in[i] * scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 16073  	test	r9d, r9d
 16074  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 16075  # %bb.233:
 16076  	mov	dl, byte ptr [rdx]	# dl = 8-bit scalar operand (al is needed by mul, so the scalar stays in dl)
 16077  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 16078  	cmp	r9d, 32
 16079  	jb	.LBB2_234	# fewer than 32 elements: skip the overlap test, use the scalar loops
 16080  # %bb.420:
 16081  	lea	rax, [rcx + r10]	# rax = one past the end of the input range
 16082  	cmp	rax, r8
 16083  	jbe	.LBB2_588	# input ends at or before the output start: disjoint; .LBB2_588 is outside this excerpt
 16084  # %bb.421:
 16085  	lea	rax, [r8 + r10]	# rax = one past the end of the output range
 16086  	cmp	rax, rcx
 16087  	jbe	.LBB2_588	# output ends at or before the input start: disjoint
 16088  .LBB2_234:	# reached when count < 32 or the two ranges overlap
 16089  	xor	edi, edi	# rdi = element index, starting at 0
 16090  .LBB2_985:	# scalar path; rdi = index of the first element still to process
 16091  	mov	r9, rdi
 16092  	not	r9
 16093  	add	r9, r10	# r9 = count - rdi - 1
 16094  	mov	rsi, r10
 16095  	and	rsi, 3	# rsi = count mod 4 = iterations of the one-element loop
 16096  	je	.LBB2_987
 16097  .LBB2_986:                              # =>This Inner Loop Header: Depth=1
 16098  	movzx	eax, byte ptr [rcx + rdi]	# one element per iteration: al = in[rdi]
 16099  	mul	dl	# ax = al * dl (unsigned 8x8 -> 16)
 16100  	mov	byte ptr [r8 + rdi], al	# out[rdi] = low 8 bits of the product
 16101  	add	rdi, 1
 16102  	add	rsi, -1
 16103  	jne	.LBB2_986
 16104  .LBB2_987:
 16105  	cmp	r9, 3
 16106  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_985: skip the 4-wide loop
 16107  .LBB2_988:                              # =>This Inner Loop Header: Depth=1
 16108  	movzx	eax, byte ptr [rcx + rdi]	# four elements per iteration, same operation as above
 16109  	mul	dl
 16110  	mov	byte ptr [r8 + rdi], al
 16111  	movzx	eax, byte ptr [rcx + rdi + 1]
 16112  	mul	dl
 16113  	mov	byte ptr [r8 + rdi + 1], al
 16114  	movzx	eax, byte ptr [rcx + rdi + 2]
 16115  	mul	dl
 16116  	mov	byte ptr [r8 + rdi + 2], al
 16117  	movzx	eax, byte ptr [rcx + rdi + 3]
 16118  	mul	dl
 16119  	mov	byte ptr [r8 + rdi + 3], al
 16120  	add	rdi, 4
 16121  	cmp	r10, rdi
 16122  	jne	.LBB2_988	# repeat until rdi == count
 16123  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 16124  .LBB2_235:	# 8-bit case: out[i] = in[i] + scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 16125  	test	r9d, r9d
 16126  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 16127  # %bb.236:
 16128  	mov	al, byte ptr [rdx]	# al = 8-bit scalar operand
 16129  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 16130  	cmp	r9d, 32
 16131  	jb	.LBB2_237	# fewer than 32 elements: skip the overlap test, use the scalar loops
 16132  # %bb.423:
 16133  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input range
 16134  	cmp	rdx, r8
 16135  	jbe	.LBB2_591	# input ends at or before the output start: disjoint; .LBB2_591 is outside this excerpt
 16136  # %bb.424:
 16137  	lea	rdx, [r8 + r10]	# rdx = one past the end of the output range
 16138  	cmp	rdx, rcx
 16139  	jbe	.LBB2_591	# output ends at or before the input start: disjoint
 16140  .LBB2_237:	# reached when count < 32 or the two ranges overlap
 16141  	xor	esi, esi	# rsi = element index, starting at 0
 16142  .LBB2_993:	# scalar path; rsi = index of the first element still to process
 16143  	mov	r9, rsi
 16144  	not	r9
 16145  	add	r9, r10	# r9 = count - rsi - 1
 16146  	mov	rdi, r10
 16147  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 16148  	je	.LBB2_995
 16149  .LBB2_994:                              # =>This Inner Loop Header: Depth=1
 16150  	movzx	edx, byte ptr [rcx + rsi]	# one element per iteration: dl = in[rsi]
 16151  	add	dl, al	# dl = in[rsi] + scalar (8-bit wraparound)
 16152  	mov	byte ptr [r8 + rsi], dl	# out[rsi] = dl
 16153  	add	rsi, 1
 16154  	add	rdi, -1
 16155  	jne	.LBB2_994
 16156  .LBB2_995:
 16157  	cmp	r9, 3
 16158  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_993: skip the 4-wide loop
 16159  .LBB2_996:                              # =>This Inner Loop Header: Depth=1
 16160  	movzx	edx, byte ptr [rcx + rsi]	# four elements per iteration, same operation as above
 16161  	add	dl, al
 16162  	mov	byte ptr [r8 + rsi], dl
 16163  	movzx	edx, byte ptr [rcx + rsi + 1]
 16164  	add	dl, al
 16165  	mov	byte ptr [r8 + rsi + 1], dl
 16166  	movzx	edx, byte ptr [rcx + rsi + 2]
 16167  	add	dl, al
 16168  	mov	byte ptr [r8 + rsi + 2], dl
 16169  	movzx	edx, byte ptr [rcx + rsi + 3]
 16170  	add	dl, al
 16171  	mov	byte ptr [r8 + rsi + 3], dl
 16172  	add	rsi, 4
 16173  	cmp	r10, rsi
 16174  	jne	.LBB2_996	# repeat until rsi == count
 16175  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 16176  .LBB2_238:	# 8-bit case: out[i] = scalar - in[i]; in = rcx, out = r8, scalar at [rdx], count = r9d
 16177  	test	r9d, r9d
 16178  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 16179  # %bb.239:
 16180  	mov	r11b, byte ptr [rdx]	# r11b = 8-bit scalar operand (al is the per-element work register)
 16181  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 16182  	cmp	r9d, 32
 16183  	jb	.LBB2_240	# fewer than 32 elements: skip the overlap test, use the scalar loops
 16184  # %bb.426:
 16185  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input range
 16186  	cmp	rdx, r8
 16187  	jbe	.LBB2_594	# input ends at or before the output start: disjoint; .LBB2_594 is outside this excerpt
 16188  # %bb.427:
 16189  	lea	rdx, [r8 + r10]	# rdx = one past the end of the output range
 16190  	cmp	rdx, rcx
 16191  	jbe	.LBB2_594	# output ends at or before the input start: disjoint
 16192  .LBB2_240:	# reached when count < 32 or the two ranges overlap
 16193  	xor	esi, esi	# rsi = element index, starting at 0
 16194  .LBB2_1001:	# scalar path; rsi = index of the first element still to process
 16195  	mov	rdx, rsi
 16196  	not	rdx
 16197  	add	rdx, r10	# rdx = count - rsi - 1
 16198  	mov	rdi, r10
 16199  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 16200  	je	.LBB2_1003
 16201  .LBB2_1002:                             # =>This Inner Loop Header: Depth=1
 16202  	mov	eax, r11d	# one element per iteration: al = scalar (only the low byte is used)
 16203  	sub	al, byte ptr [rcx + rsi]	# al = scalar - in[rsi] (8-bit wraparound)
 16204  	mov	byte ptr [r8 + rsi], al	# out[rsi] = al
 16205  	add	rsi, 1
 16206  	add	rdi, -1
 16207  	jne	.LBB2_1002
 16208  .LBB2_1003:
 16209  	cmp	rdx, 3
 16210  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_1001: skip the 4-wide loop
 16211  .LBB2_1004:                             # =>This Inner Loop Header: Depth=1
 16212  	mov	eax, r11d	# four elements per iteration, same operation as above
 16213  	sub	al, byte ptr [rcx + rsi]
 16214  	mov	byte ptr [r8 + rsi], al
 16215  	mov	eax, r11d
 16216  	sub	al, byte ptr [rcx + rsi + 1]
 16217  	mov	byte ptr [r8 + rsi + 1], al
 16218  	mov	eax, r11d
 16219  	sub	al, byte ptr [rcx + rsi + 2]
 16220  	mov	byte ptr [r8 + rsi + 2], al
 16221  	mov	eax, r11d
 16222  	sub	al, byte ptr [rcx + rsi + 3]
 16223  	mov	byte ptr [r8 + rsi + 3], al
 16224  	add	rsi, 4
 16225  	cmp	r10, rsi
 16226  	jne	.LBB2_1004	# repeat until rsi == count
 16227  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 16228  .LBB2_241:	# 8-bit case: out[i] = in[i] + scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 16229  	test	r9d, r9d
 16230  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 16231  # %bb.242:
 16232  	mov	al, byte ptr [rdx]	# al = 8-bit scalar operand
 16233  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 16234  	cmp	r9d, 32
 16235  	jb	.LBB2_243	# fewer than 32 elements: skip the overlap test, use the scalar loops
 16236  # %bb.429:
 16237  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input range
 16238  	cmp	rdx, r8
 16239  	jbe	.LBB2_597	# input ends at or before the output start: disjoint; .LBB2_597 is outside this excerpt
 16240  # %bb.430:
 16241  	lea	rdx, [r8 + r10]	# rdx = one past the end of the output range
 16242  	cmp	rdx, rcx
 16243  	jbe	.LBB2_597	# output ends at or before the input start: disjoint
 16244  .LBB2_243:	# reached when count < 32 or the two ranges overlap
 16245  	xor	esi, esi	# rsi = element index, starting at 0
 16246  .LBB2_1009:	# scalar path; rsi = index of the first element still to process
 16247  	mov	r9, rsi
 16248  	not	r9
 16249  	add	r9, r10	# r9 = count - rsi - 1
 16250  	mov	rdi, r10
 16251  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 16252  	je	.LBB2_1011
 16253  .LBB2_1010:                             # =>This Inner Loop Header: Depth=1
 16254  	movzx	edx, byte ptr [rcx + rsi]	# one element per iteration: dl = in[rsi]
 16255  	add	dl, al	# dl = in[rsi] + scalar (8-bit wraparound)
 16256  	mov	byte ptr [r8 + rsi], dl	# out[rsi] = dl
 16257  	add	rsi, 1
 16258  	add	rdi, -1
 16259  	jne	.LBB2_1010
 16260  .LBB2_1011:
 16261  	cmp	r9, 3
 16262  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_1009: skip the 4-wide loop
 16263  .LBB2_1012:                             # =>This Inner Loop Header: Depth=1
 16264  	movzx	edx, byte ptr [rcx + rsi]	# four elements per iteration, same operation as above
 16265  	add	dl, al
 16266  	mov	byte ptr [r8 + rsi], dl
 16267  	movzx	edx, byte ptr [rcx + rsi + 1]
 16268  	add	dl, al
 16269  	mov	byte ptr [r8 + rsi + 1], dl
 16270  	movzx	edx, byte ptr [rcx + rsi + 2]
 16271  	add	dl, al
 16272  	mov	byte ptr [r8 + rsi + 2], dl
 16273  	movzx	edx, byte ptr [rcx + rsi + 3]
 16274  	add	dl, al
 16275  	mov	byte ptr [r8 + rsi + 3], dl
 16276  	add	rsi, 4
 16277  	cmp	r10, rsi
 16278  	jne	.LBB2_1012	# repeat until rsi == count
 16279  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 16280  .LBB2_244:	# 8-bit case: out[i] = scalar - in[i]; in = rcx, out = r8, scalar at [rdx], count = r9d
 16281  	test	r9d, r9d
 16282  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 16283  # %bb.245:
 16284  	mov	r11b, byte ptr [rdx]	# r11b = 8-bit scalar operand (al is the per-element work register)
 16285  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 16286  	cmp	r9d, 32
 16287  	jb	.LBB2_246	# fewer than 32 elements: skip the overlap test, use the scalar loops
 16288  # %bb.432:
 16289  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input range
 16290  	cmp	rdx, r8
 16291  	jbe	.LBB2_600	# input ends at or before the output start: disjoint; .LBB2_600 is outside this excerpt
 16292  # %bb.433:
 16293  	lea	rdx, [r8 + r10]	# rdx = one past the end of the output range
 16294  	cmp	rdx, rcx
 16295  	jbe	.LBB2_600	# output ends at or before the input start: disjoint
 16296  .LBB2_246:	# reached when count < 32 or the two ranges overlap
 16297  	xor	esi, esi	# rsi = element index, starting at 0
 16298  .LBB2_1017:	# scalar path; rsi = index of the first element still to process
 16299  	mov	rdx, rsi
 16300  	not	rdx
 16301  	add	rdx, r10	# rdx = count - rsi - 1
 16302  	mov	rdi, r10
 16303  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 16304  	je	.LBB2_1019
 16305  .LBB2_1018:                             # =>This Inner Loop Header: Depth=1
 16306  	mov	eax, r11d	# one element per iteration: al = scalar (only the low byte is used)
 16307  	sub	al, byte ptr [rcx + rsi]	# al = scalar - in[rsi] (8-bit wraparound)
 16308  	mov	byte ptr [r8 + rsi], al	# out[rsi] = al
 16309  	add	rsi, 1
 16310  	add	rdi, -1
 16311  	jne	.LBB2_1018
 16312  .LBB2_1019:
 16313  	cmp	rdx, 3
 16314  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_1017: skip the 4-wide loop
 16315  .LBB2_1020:                             # =>This Inner Loop Header: Depth=1
 16316  	mov	eax, r11d	# four elements per iteration, same operation as above
 16317  	sub	al, byte ptr [rcx + rsi]
 16318  	mov	byte ptr [r8 + rsi], al
 16319  	mov	eax, r11d
 16320  	sub	al, byte ptr [rcx + rsi + 1]
 16321  	mov	byte ptr [r8 + rsi + 1], al
 16322  	mov	eax, r11d
 16323  	sub	al, byte ptr [rcx + rsi + 2]
 16324  	mov	byte ptr [r8 + rsi + 2], al
 16325  	mov	eax, r11d
 16326  	sub	al, byte ptr [rcx + rsi + 3]
 16327  	mov	byte ptr [r8 + rsi + 3], al
 16328  	add	rsi, 4
 16329  	cmp	r10, rsi
 16330  	jne	.LBB2_1020	# repeat until rsi == count
 16331  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 16332  .LBB2_247:	# 32-bit case: out[i] = in[i] * scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 16333  	test	r9d, r9d
 16334  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 16335  # %bb.248:
 16336  	mov	eax, dword ptr [rdx]	# eax = 32-bit scalar operand
 16337  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 16338  	cmp	r9d, 8
 16339  	jb	.LBB2_249	# fewer than 8 elements: skip the overlap test, use the scalar loops
 16340  # %bb.435:
 16341  	lea	rdx, [rcx + 4*r10]	# rdx = one past the end of the input range
 16342  	cmp	rdx, r8
 16343  	jbe	.LBB2_603	# input ends at or before the output start: disjoint; .LBB2_603 is outside this excerpt
 16344  # %bb.436:
 16345  	lea	rdx, [r8 + 4*r10]	# rdx = one past the end of the output range
 16346  	cmp	rdx, rcx
 16347  	jbe	.LBB2_603	# output ends at or before the input start: disjoint
 16348  .LBB2_249:	# reached when count < 8 or the two ranges overlap
 16349  	xor	esi, esi	# rsi = element index, starting at 0
 16350  .LBB2_1025:	# scalar path; rsi = index of the first element still to process
 16351  	mov	r9, rsi
 16352  	not	r9
 16353  	add	r9, r10	# r9 = count - rsi - 1
 16354  	mov	rdi, r10
 16355  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 16356  	je	.LBB2_1027
 16357  .LBB2_1026:                             # =>This Inner Loop Header: Depth=1
 16358  	mov	edx, dword ptr [rcx + 4*rsi]	# one element per iteration: edx = in[rsi]
 16359  	imul	edx, eax	# edx = low 32 bits of in[rsi] * scalar
 16360  	mov	dword ptr [r8 + 4*rsi], edx	# out[rsi] = edx
 16361  	add	rsi, 1
 16362  	add	rdi, -1
 16363  	jne	.LBB2_1026
 16364  .LBB2_1027:
 16365  	cmp	r9, 3
 16366  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_1025: skip the 4-wide loop
 16367  .LBB2_1028:                             # =>This Inner Loop Header: Depth=1
 16368  	mov	edx, dword ptr [rcx + 4*rsi]	# four elements per iteration, same operation as above
 16369  	imul	edx, eax
 16370  	mov	dword ptr [r8 + 4*rsi], edx
 16371  	mov	edx, dword ptr [rcx + 4*rsi + 4]
 16372  	imul	edx, eax
 16373  	mov	dword ptr [r8 + 4*rsi + 4], edx
 16374  	mov	edx, dword ptr [rcx + 4*rsi + 8]
 16375  	imul	edx, eax
 16376  	mov	dword ptr [r8 + 4*rsi + 8], edx
 16377  	mov	edx, dword ptr [rcx + 4*rsi + 12]
 16378  	imul	edx, eax
 16379  	mov	dword ptr [r8 + 4*rsi + 12], edx
 16380  	add	rsi, 4
 16381  	cmp	r10, rsi
 16382  	jne	.LBB2_1028	# repeat until rsi == count
 16383  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 16384  .LBB2_250:	# 32-bit case: out[i] = in[i] * scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 16385  	test	r9d, r9d
 16386  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 16387  # %bb.251:
 16388  	mov	eax, dword ptr [rdx]	# eax = 32-bit scalar operand
 16389  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 16390  	cmp	r9d, 8
 16391  	jb	.LBB2_252	# fewer than 8 elements: skip the overlap test, use the scalar loops
 16392  # %bb.438:
 16393  	lea	rdx, [rcx + 4*r10]	# rdx = one past the end of the input range
 16394  	cmp	rdx, r8
 16395  	jbe	.LBB2_606	# input ends at or before the output start: disjoint; .LBB2_606 is outside this excerpt
 16396  # %bb.439:
 16397  	lea	rdx, [r8 + 4*r10]	# rdx = one past the end of the output range
 16398  	cmp	rdx, rcx
 16399  	jbe	.LBB2_606	# output ends at or before the input start: disjoint
 16400  .LBB2_252:	# reached when count < 8 or the two ranges overlap
 16401  	xor	esi, esi	# rsi = element index, starting at 0
 16402  .LBB2_1033:	# scalar path; rsi = index of the first element still to process
 16403  	mov	r9, rsi
 16404  	not	r9
 16405  	add	r9, r10	# r9 = count - rsi - 1
 16406  	mov	rdi, r10
 16407  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 16408  	je	.LBB2_1035
 16409  .LBB2_1034:                             # =>This Inner Loop Header: Depth=1
 16410  	mov	edx, dword ptr [rcx + 4*rsi]	# one element per iteration: edx = in[rsi]
 16411  	imul	edx, eax	# edx = low 32 bits of in[rsi] * scalar
 16412  	mov	dword ptr [r8 + 4*rsi], edx	# out[rsi] = edx
 16413  	add	rsi, 1
 16414  	add	rdi, -1
 16415  	jne	.LBB2_1034
 16416  .LBB2_1035:
 16417  	cmp	r9, 3
 16418  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_1033: skip the 4-wide loop
 16419  .LBB2_1036:                             # =>This Inner Loop Header: Depth=1
 16420  	mov	edx, dword ptr [rcx + 4*rsi]	# four elements per iteration, same operation as above
 16421  	imul	edx, eax
 16422  	mov	dword ptr [r8 + 4*rsi], edx
 16423  	mov	edx, dword ptr [rcx + 4*rsi + 4]
 16424  	imul	edx, eax
 16425  	mov	dword ptr [r8 + 4*rsi + 4], edx
 16426  	mov	edx, dword ptr [rcx + 4*rsi + 8]
 16427  	imul	edx, eax
 16428  	mov	dword ptr [r8 + 4*rsi + 8], edx
 16429  	mov	edx, dword ptr [rcx + 4*rsi + 12]
 16430  	imul	edx, eax
 16431  	mov	dword ptr [r8 + 4*rsi + 12], edx
 16432  	add	rsi, 4
 16433  	cmp	r10, rsi
 16434  	jne	.LBB2_1036	# repeat until rsi == count
 16435  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 16436  .LBB2_253:	# 32-bit case: out[i] = in[i] + scalar; in = rcx, out = r8, scalar at [rdx], count = r9d
 16437  	test	r9d, r9d
 16438  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 16439  # %bb.254:
 16440  	mov	eax, dword ptr [rdx]	# eax = 32-bit scalar operand
 16441  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 16442  	cmp	r9d, 8
 16443  	jb	.LBB2_255	# fewer than 8 elements: skip the overlap test, use the scalar loops
 16444  # %bb.441:
 16445  	lea	rdx, [rcx + 4*r10]	# rdx = one past the end of the input range
 16446  	cmp	rdx, r8
 16447  	jbe	.LBB2_609	# input ends at or before the output start: disjoint; .LBB2_609 is outside this excerpt
 16448  # %bb.442:
 16449  	lea	rdx, [r8 + 4*r10]	# rdx = one past the end of the output range
 16450  	cmp	rdx, rcx
 16451  	jbe	.LBB2_609	# output ends at or before the input start: disjoint
 16452  .LBB2_255:	# reached when count < 8 or the two ranges overlap
 16453  	xor	esi, esi	# rsi = element index, starting at 0
 16454  .LBB2_1041:	# scalar path; rsi = index of the first element still to process
 16455  	mov	r9, rsi
 16456  	not	r9
 16457  	add	r9, r10	# r9 = count - rsi - 1
 16458  	mov	rdi, r10
 16459  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 16460  	je	.LBB2_1043
 16461  .LBB2_1042:                             # =>This Inner Loop Header: Depth=1
 16462  	mov	edx, dword ptr [rcx + 4*rsi]	# one element per iteration: edx = in[rsi]
 16463  	add	edx, eax	# edx = in[rsi] + scalar (32-bit wraparound)
 16464  	mov	dword ptr [r8 + 4*rsi], edx	# out[rsi] = edx
 16465  	add	rsi, 1
 16466  	add	rdi, -1
 16467  	jne	.LBB2_1042
 16468  .LBB2_1043:
 16469  	cmp	r9, 3
 16470  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_1041: skip the 4-wide loop
 16471  .LBB2_1044:                             # =>This Inner Loop Header: Depth=1
 16472  	mov	edx, dword ptr [rcx + 4*rsi]	# four elements per iteration, same operation as above
 16473  	add	edx, eax
 16474  	mov	dword ptr [r8 + 4*rsi], edx
 16475  	mov	edx, dword ptr [rcx + 4*rsi + 4]
 16476  	add	edx, eax
 16477  	mov	dword ptr [r8 + 4*rsi + 4], edx
 16478  	mov	edx, dword ptr [rcx + 4*rsi + 8]
 16479  	add	edx, eax
 16480  	mov	dword ptr [r8 + 4*rsi + 8], edx
 16481  	mov	edx, dword ptr [rcx + 4*rsi + 12]
 16482  	add	edx, eax
 16483  	mov	dword ptr [r8 + 4*rsi + 12], edx
 16484  	add	rsi, 4
 16485  	cmp	r10, rsi
 16486  	jne	.LBB2_1044	# repeat until rsi == count
 16487  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 16488  .LBB2_256:	# 32-bit case: out[i] = scalar - in[i]; in = rcx, out = r8, scalar at [rdx], count = r9d
 16489  	test	r9d, r9d
 16490  	jle	.LBB2_1069	# count <= 0 (signed): nothing to process
 16491  # %bb.257:
 16492  	mov	r11d, dword ptr [rdx]	# r11d = 32-bit scalar operand (eax is the per-element work register)
 16493  	mov	r10d, r9d	# r10 = count (32-bit write clears the upper half)
 16494  	cmp	r9d, 8
 16495  	jb	.LBB2_258	# fewer than 8 elements: skip the overlap test, use the scalar loops
 16496  # %bb.444:
 16497  	lea	rdx, [rcx + 4*r10]	# rdx = one past the end of the input range
 16498  	cmp	rdx, r8
 16499  	jbe	.LBB2_612	# input ends at or before the output start: disjoint; .LBB2_612 is outside this excerpt
 16500  # %bb.445:
 16501  	lea	rdx, [r8 + 4*r10]	# rdx = one past the end of the output range
 16502  	cmp	rdx, rcx
 16503  	jbe	.LBB2_612	# output ends at or before the input start: disjoint
 16504  .LBB2_258:	# reached when count < 8 or the two ranges overlap
 16505  	xor	esi, esi	# rsi = element index, starting at 0
 16506  .LBB2_1049:	# scalar path; rsi = index of the first element still to process
 16507  	mov	rdx, rsi
 16508  	not	rdx
 16509  	add	rdx, r10	# rdx = count - rsi - 1
 16510  	mov	rdi, r10
 16511  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-element loop
 16512  	je	.LBB2_1051
 16513  .LBB2_1050:                             # =>This Inner Loop Header: Depth=1
 16514  	mov	eax, r11d	# one element per iteration: eax = scalar
 16515  	sub	eax, dword ptr [rcx + 4*rsi]	# eax = scalar - in[rsi] (32-bit wraparound)
 16516  	mov	dword ptr [r8 + 4*rsi], eax	# out[rsi] = eax
 16517  	add	rsi, 1
 16518  	add	rdi, -1
 16519  	jne	.LBB2_1050
 16520  .LBB2_1051:
 16521  	cmp	rdx, 3
 16522  	jb	.LBB2_1069	# fewer than 4 elements were pending at .LBB2_1049: skip the 4-wide loop
 16523  .LBB2_1052:                             # =>This Inner Loop Header: Depth=1
 16524  	mov	eax, r11d	# four elements per iteration, same operation as above
 16525  	sub	eax, dword ptr [rcx + 4*rsi]
 16526  	mov	dword ptr [r8 + 4*rsi], eax
 16527  	mov	eax, r11d
 16528  	sub	eax, dword ptr [rcx + 4*rsi + 4]
 16529  	mov	dword ptr [r8 + 4*rsi + 4], eax
 16530  	mov	eax, r11d
 16531  	sub	eax, dword ptr [rcx + 4*rsi + 8]
 16532  	mov	dword ptr [r8 + 4*rsi + 8], eax
 16533  	mov	eax, r11d
 16534  	sub	eax, dword ptr [rcx + 4*rsi + 12]
 16535  	mov	dword ptr [r8 + 4*rsi + 12], eax
 16536  	add	rsi, 4
 16537  	cmp	r10, rsi
 16538  	jne	.LBB2_1052	# repeat until rsi == count
 16539  	jmp	.LBB2_1069	# join at .LBB2_1069 (outside this excerpt)
 16540  .LBB2_259:                              # 32-bit case: [r8 + 4*i] = [rcx + 4*i] + scalar read from [rdx]
 16541  	test	r9d, r9d
 16542  	jle	.LBB2_1069                      # r9d (signed count) <= 0: nothing to do, take the shared exit
 16543  # %bb.260:
 16544  	mov	eax, dword ptr [rdx]            # eax = 32-bit scalar operand
 16545  	mov	r10d, r9d                       # r10 = count n, zero-extended to 64 bits
 16546  	cmp	r9d, 8
 16547  	jb	.LBB2_261                       # fewer than 8 elements: go straight to the scalar loops
 16548  # %bb.447:                              # overlap test: input [rcx, rcx+4n) against output [r8, r8+4n)
 16549  	lea	rdx, [rcx + 4*r10]
 16550  	cmp	rdx, r8
 16551  	jbe	.LBB2_615                       # input ends at/before output start: disjoint (target is outside this chunk; presumably the SIMD path)
 16552  # %bb.448:
 16553  	lea	rdx, [r8 + 4*r10]
 16554  	cmp	rdx, rcx
 16555  	jbe	.LBB2_615                       # output ends at/before input start: disjoint as well
 16556  .LBB2_261:                              # buffers overlap or n < 8
 16557  	xor	esi, esi                        # rsi = element index, starting at 0
 16558  .LBB2_1057:                             # scalar processing from index rsi (label is also a jump target from outside this chunk)
 16559  	mov	r9, rsi
 16560  	not	r9
 16561  	add	r9, r10                         # r9 = n - rsi - 1
 16562  	mov	rdi, r10
 16563  	and	rdi, 3                          # rdi = n & 3 = iterations of the one-element loop
 16564  	je	.LBB2_1059
 16565  .LBB2_1058:                             # =>This Inner Loop Header: Depth=1; one element per iteration
 16566  	mov	edx, dword ptr [rcx + 4*rsi]
 16567  	add	edx, eax
 16568  	mov	dword ptr [r8 + 4*rsi], edx
 16569  	add	rsi, 1
 16570  	add	rdi, -1
 16571  	jne	.LBB2_1058
 16572  .LBB2_1059:
 16573  	cmp	r9, 3
 16574  	jb	.LBB2_1069                      # n - start - 1 < 3: fewer than 4 elements existed, all handled above
 16575  .LBB2_1060:                             # =>This Inner Loop Header: Depth=1; unrolled, four elements per iteration
 16576  	mov	edx, dword ptr [rcx + 4*rsi]
 16577  	add	edx, eax
 16578  	mov	dword ptr [r8 + 4*rsi], edx
 16579  	mov	edx, dword ptr [rcx + 4*rsi + 4]
 16580  	add	edx, eax
 16581  	mov	dword ptr [r8 + 4*rsi + 4], edx
 16582  	mov	edx, dword ptr [rcx + 4*rsi + 8]
 16583  	add	edx, eax
 16584  	mov	dword ptr [r8 + 4*rsi + 8], edx
 16585  	mov	edx, dword ptr [rcx + 4*rsi + 12]
 16586  	add	edx, eax
 16587  	mov	dword ptr [r8 + 4*rsi + 12], edx
 16588  	add	rsi, 4
 16589  	cmp	r10, rsi
 16590  	jne	.LBB2_1060                      # loop until the index reaches n
 16591  	jmp	.LBB2_1069
 16592  .LBB2_262:                              # 32-bit case: [r8 + 4*i] = scalar read from [rdx] - [rcx + 4*i]
 16593  	test	r9d, r9d
 16594  	jle	.LBB2_1069                      # r9d (signed count) <= 0: nothing to do, take the shared exit
 16595  # %bb.263:
 16596  	mov	r11d, dword ptr [rdx]           # r11d = 32-bit scalar operand (the minuend)
 16597  	mov	r10d, r9d                       # r10 = count n, zero-extended to 64 bits
 16598  	cmp	r9d, 8
 16599  	jb	.LBB2_264                       # fewer than 8 elements: go straight to the scalar loops
 16600  # %bb.450:                              # overlap test: input [rcx, rcx+4n) against output [r8, r8+4n)
 16601  	lea	rdx, [rcx + 4*r10]
 16602  	cmp	rdx, r8
 16603  	jbe	.LBB2_618                       # input ends at/before output start: disjoint (target is outside this chunk; presumably the SIMD path)
 16604  # %bb.451:
 16605  	lea	rdx, [r8 + 4*r10]
 16606  	cmp	rdx, rcx
 16607  	jbe	.LBB2_618                       # output ends at/before input start: disjoint as well
 16608  .LBB2_264:                              # buffers overlap or n < 8
 16609  	xor	esi, esi                        # rsi = element index, starting at 0
 16610  .LBB2_1065:                             # scalar processing from index rsi (label is also a jump target from outside this chunk)
 16611  	mov	rdx, rsi
 16612  	not	rdx
 16613  	add	rdx, r10                        # rdx = n - rsi - 1
 16614  	mov	rdi, r10
 16615  	and	rdi, 3                          # rdi = n & 3 = iterations of the one-element loop
 16616  	je	.LBB2_1067
 16617  .LBB2_1066:                             # =>This Inner Loop Header: Depth=1; one element per iteration
 16618  	mov	eax, r11d                       # fresh copy of the scalar, because sub overwrites its destination
 16619  	sub	eax, dword ptr [rcx + 4*rsi]
 16620  	mov	dword ptr [r8 + 4*rsi], eax
 16621  	add	rsi, 1
 16622  	add	rdi, -1
 16623  	jne	.LBB2_1066
 16624  .LBB2_1067:
 16625  	cmp	rdx, 3
 16626  	jb	.LBB2_1069                      # n - start - 1 < 3: fewer than 4 elements existed, all handled above
 16627  .LBB2_1068:                             # =>This Inner Loop Header: Depth=1; unrolled, four elements per iteration
 16628  	mov	eax, r11d
 16629  	sub	eax, dword ptr [rcx + 4*rsi]
 16630  	mov	dword ptr [r8 + 4*rsi], eax
 16631  	mov	eax, r11d
 16632  	sub	eax, dword ptr [rcx + 4*rsi + 4]
 16633  	mov	dword ptr [r8 + 4*rsi + 4], eax
 16634  	mov	eax, r11d
 16635  	sub	eax, dword ptr [rcx + 4*rsi + 8]
 16636  	mov	dword ptr [r8 + 4*rsi + 8], eax
 16637  	mov	eax, r11d
 16638  	sub	eax, dword ptr [rcx + 4*rsi + 12]
 16639  	mov	dword ptr [r8 + 4*rsi + 12], eax
 16640  	add	rsi, 4
 16641  	cmp	r10, rsi
 16642  	jne	.LBB2_1068                      # loop until the index reaches n
 16643  	jmp	.LBB2_1069
 16644  .LBB2_319:                              # 64-bit multiply: [r8 + 8*i] = [rcx + 8*i] * rax (rax, esi and r9 are set before this block, outside this chunk)
 16645  	and	esi, -4                         # round esi down to a multiple of 4: end index of the unrolled loop
 16646  	xor	edi, edi                        # rdi = element index
 16647  .LBB2_320:                              # =>This Inner Loop Header: Depth=1; four 64-bit elements per iteration
 16648  	mov	rdx, qword ptr [rcx + 8*rdi]
 16649  	imul	rdx, rax                        # keeps the low 64 bits of the product
 16650  	mov	qword ptr [r8 + 8*rdi], rdx
 16651  	mov	rdx, qword ptr [rcx + 8*rdi + 8]
 16652  	imul	rdx, rax
 16653  	mov	qword ptr [r8 + 8*rdi + 8], rdx
 16654  	mov	rdx, qword ptr [rcx + 8*rdi + 16]
 16655  	imul	rdx, rax
 16656  	mov	qword ptr [r8 + 8*rdi + 16], rdx
 16657  	mov	rdx, qword ptr [rcx + 8*rdi + 24]
 16658  	imul	rdx, rax
 16659  	mov	qword ptr [r8 + 8*rdi + 24], rdx
 16660  	add	rdi, 4
 16661  	cmp	rsi, rdi
 16662  	jne	.LBB2_320
 16663  .LBB2_321:                              # tail: r9 elements left, starting at index rdi
 16664  	test	r9, r9
 16665  	je	.LBB2_1069                      # no leftover elements: take the shared exit
 16666  # %bb.322:
 16667  	lea	rsi, [r8 + 8*rdi]               # rsi = output pointer advanced past the elements already written
 16668  	lea	rcx, [rcx + 8*rdi]              # rcx = input pointer advanced by the same amount
 16669  	xor	edi, edi                        # restart the index at 0 for the tail
 16670  .LBB2_323:                              # =>This Inner Loop Header: Depth=1; one element per iteration, r9 iterations
 16671  	mov	rdx, qword ptr [rcx + 8*rdi]
 16672  	imul	rdx, rax
 16673  	mov	qword ptr [rsi + 8*rdi], rdx
 16674  	add	rdi, 1
 16675  	cmp	r9, rdi
 16676  	jne	.LBB2_323
 16677  	jmp	.LBB2_1069
 16678  .LBB2_324:                              # 64-bit multiply: [r8 + 8*i] = [rcx + 8*i] * rax (rax, esi and r9 are set before this block, outside this chunk)
 16679  	and	esi, -4                         # round esi down to a multiple of 4: end index of the unrolled loop
 16680  	xor	edi, edi                        # rdi = element index
 16681  .LBB2_325:                              # =>This Inner Loop Header: Depth=1; four 64-bit elements per iteration
 16682  	mov	rdx, qword ptr [rcx + 8*rdi]
 16683  	imul	rdx, rax                        # keeps the low 64 bits of the product
 16684  	mov	qword ptr [r8 + 8*rdi], rdx
 16685  	mov	rdx, qword ptr [rcx + 8*rdi + 8]
 16686  	imul	rdx, rax
 16687  	mov	qword ptr [r8 + 8*rdi + 8], rdx
 16688  	mov	rdx, qword ptr [rcx + 8*rdi + 16]
 16689  	imul	rdx, rax
 16690  	mov	qword ptr [r8 + 8*rdi + 16], rdx
 16691  	mov	rdx, qword ptr [rcx + 8*rdi + 24]
 16692  	imul	rdx, rax
 16693  	mov	qword ptr [r8 + 8*rdi + 24], rdx
 16694  	add	rdi, 4
 16695  	cmp	rsi, rdi
 16696  	jne	.LBB2_325
 16697  .LBB2_326:                              # tail: r9 elements left, starting at index rdi
 16698  	test	r9, r9
 16699  	je	.LBB2_1069                      # no leftover elements: take the shared exit
 16700  # %bb.327:
 16701  	lea	rsi, [r8 + 8*rdi]               # rsi = output pointer advanced past the elements already written
 16702  	lea	rcx, [rcx + 8*rdi]              # rcx = input pointer advanced by the same amount
 16703  	xor	edi, edi                        # restart the index at 0 for the tail
 16704  .LBB2_328:                              # =>This Inner Loop Header: Depth=1; one element per iteration, r9 iterations
 16705  	mov	rdx, qword ptr [rcx + 8*rdi]
 16706  	imul	rdx, rax
 16707  	mov	qword ptr [rsi + 8*rdi], rdx
 16708  	add	rdi, 1
 16709  	cmp	r9, rdi
 16710  	jne	.LBB2_328
 16711  	jmp	.LBB2_1069
 16712  .LBB2_377:                              # 64-bit multiply: [r8 + 8*i] = [rcx + 8*i] * rax (rax, esi and r9 are set before this block, outside this chunk)
 16713  	and	esi, -4                         # round esi down to a multiple of 4: end index of the unrolled loop
 16714  	xor	edi, edi                        # rdi = element index
 16715  .LBB2_378:                              # =>This Inner Loop Header: Depth=1; four 64-bit elements per iteration
 16716  	mov	rdx, qword ptr [rcx + 8*rdi]
 16717  	imul	rdx, rax                        # keeps the low 64 bits of the product
 16718  	mov	qword ptr [r8 + 8*rdi], rdx
 16719  	mov	rdx, qword ptr [rcx + 8*rdi + 8]
 16720  	imul	rdx, rax
 16721  	mov	qword ptr [r8 + 8*rdi + 8], rdx
 16722  	mov	rdx, qword ptr [rcx + 8*rdi + 16]
 16723  	imul	rdx, rax
 16724  	mov	qword ptr [r8 + 8*rdi + 16], rdx
 16725  	mov	rdx, qword ptr [rcx + 8*rdi + 24]
 16726  	imul	rdx, rax
 16727  	mov	qword ptr [r8 + 8*rdi + 24], rdx
 16728  	add	rdi, 4
 16729  	cmp	rsi, rdi
 16730  	jne	.LBB2_378
 16731  .LBB2_379:                              # tail: r9 elements left, starting at index rdi
 16732  	test	r9, r9
 16733  	je	.LBB2_1069                      # no leftover elements: take the shared exit
 16734  # %bb.380:
 16735  	lea	rsi, [r8 + 8*rdi]               # rsi = output pointer advanced past the elements already written
 16736  	lea	rcx, [rcx + 8*rdi]              # rcx = input pointer advanced by the same amount
 16737  	xor	edi, edi                        # restart the index at 0 for the tail
 16738  .LBB2_381:                              # =>This Inner Loop Header: Depth=1; one element per iteration, r9 iterations
 16739  	mov	rdx, qword ptr [rcx + 8*rdi]
 16740  	imul	rdx, rax
 16741  	mov	qword ptr [rsi + 8*rdi], rdx
 16742  	add	rdi, 1
 16743  	cmp	r9, rdi
 16744  	jne	.LBB2_381
 16745  	jmp	.LBB2_1069
 16746  .LBB2_385:                              # 64-bit multiply: [r8 + 8*i] = [rcx + 8*i] * rax (rax, esi and r9 are set before this block, outside this chunk)
 16747  	and	esi, -4                         # round esi down to a multiple of 4: end index of the unrolled loop
 16748  	xor	edi, edi                        # rdi = element index
 16749  .LBB2_386:                              # =>This Inner Loop Header: Depth=1; four 64-bit elements per iteration
 16750  	mov	rdx, qword ptr [rcx + 8*rdi]
 16751  	imul	rdx, rax                        # keeps the low 64 bits of the product
 16752  	mov	qword ptr [r8 + 8*rdi], rdx
 16753  	mov	rdx, qword ptr [rcx + 8*rdi + 8]
 16754  	imul	rdx, rax
 16755  	mov	qword ptr [r8 + 8*rdi + 8], rdx
 16756  	mov	rdx, qword ptr [rcx + 8*rdi + 16]
 16757  	imul	rdx, rax
 16758  	mov	qword ptr [r8 + 8*rdi + 16], rdx
 16759  	mov	rdx, qword ptr [rcx + 8*rdi + 24]
 16760  	imul	rdx, rax
 16761  	mov	qword ptr [r8 + 8*rdi + 24], rdx
 16762  	add	rdi, 4
 16763  	cmp	rsi, rdi
 16764  	jne	.LBB2_386
 16765  .LBB2_387:                              # tail: r9 elements left, starting at index rdi
 16766  	test	r9, r9
 16767  	je	.LBB2_1069                      # no leftover elements: take the shared exit
 16768  # %bb.388:
 16769  	lea	rsi, [r8 + 8*rdi]               # rsi = output pointer advanced past the elements already written
 16770  	lea	rcx, [rcx + 8*rdi]              # rcx = input pointer advanced by the same amount
 16771  	xor	edi, edi                        # restart the index at 0 for the tail
 16772  .LBB2_389:                              # =>This Inner Loop Header: Depth=1; one element per iteration, r9 iterations
 16773  	mov	rdx, qword ptr [rcx + 8*rdi]
 16774  	imul	rdx, rax
 16775  	mov	qword ptr [rsi + 8*rdi], rdx
 16776  	add	rdi, 1
 16777  	cmp	r9, rdi
 16778  	jne	.LBB2_389                       # when the loop ends, execution falls through into the exit label that follows
 16779  .LBB2_1069:                             # shared exit reached by the loops in this function
 16780  	mov	rsp, rbp                        # restore the stack pointer from the frame pointer
 16781  	pop	rbp                             # restore the caller's rbp
 16782  	ret
 16783  .LBB2_453:                              # SSE4.1 32-bit multiply: [r8 + 4*i] = [rcx + 4*i] * eax (eax and r10d are set outside this chunk)
 16784  	mov	esi, r10d
 16785  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 (8 elements = two xmm vectors)
 16786  	movd	xmm0, eax
 16787  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 16788  	lea	rdx, [rsi - 8]
 16789  	mov	r9, rdx
 16790  	shr	r9, 3
 16791  	add	r9, 1                           # r9 = (rsi - 8)/8 + 1 = number of 8-element groups
 16792  	test	rdx, rdx
 16793  	je	.LBB2_621                       # exactly one group: skip the two-group loop (target is outside this chunk)
 16794  # %bb.454:
 16795  	mov	rdx, r9
 16796  	and	rdx, -2
 16797  	neg	rdx                             # rdx = -(group count rounded down to even); counts up to 0 in steps of 2
 16798  	xor	edi, edi                        # rdi = element index
 16799  .LBB2_455:                              # =>This Inner Loop Header: Depth=1; 16 elements (two groups) per iteration
 16800  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 16801  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 16802  	pmulld	xmm1, xmm0
 16803  	pmulld	xmm2, xmm0
 16804  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 16805  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 16806  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi + 32]
 16807  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
 16808  	pmulld	xmm1, xmm0
 16809  	pmulld	xmm2, xmm0
 16810  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 16811  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 16812  	add	rdi, 16
 16813  	add	rdx, 2
 16814  	jne	.LBB2_455
 16815  	jmp	.LBB2_622                       # continues outside this chunk
 16816  .LBB2_456:                              # SSE4.1 32-bit multiply: [r8 + 4*i] = [rcx + 4*i] * eax (eax and r10d are set outside this chunk)
 16817  	mov	esi, r10d
 16818  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 (8 elements = two xmm vectors)
 16819  	movd	xmm0, eax
 16820  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 16821  	lea	rdx, [rsi - 8]
 16822  	mov	r9, rdx
 16823  	shr	r9, 3
 16824  	add	r9, 1                           # r9 = (rsi - 8)/8 + 1 = number of 8-element groups
 16825  	test	rdx, rdx
 16826  	je	.LBB2_629                       # exactly one group: skip the two-group loop (target is outside this chunk)
 16827  # %bb.457:
 16828  	mov	rdx, r9
 16829  	and	rdx, -2
 16830  	neg	rdx                             # rdx = -(group count rounded down to even); counts up to 0 in steps of 2
 16831  	xor	edi, edi                        # rdi = element index
 16832  .LBB2_458:                              # =>This Inner Loop Header: Depth=1; 16 elements (two groups) per iteration
 16833  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 16834  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 16835  	pmulld	xmm1, xmm0
 16836  	pmulld	xmm2, xmm0
 16837  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 16838  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 16839  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi + 32]
 16840  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
 16841  	pmulld	xmm1, xmm0
 16842  	pmulld	xmm2, xmm0
 16843  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 16844  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 16845  	add	rdi, 16
 16846  	add	rdx, 2
 16847  	jne	.LBB2_458
 16848  	jmp	.LBB2_630                       # continues outside this chunk
 16849  .LBB2_459:                              # SSE 32-bit add: [r8 + 4*i] = [rcx + 4*i] + eax (eax and r10d are set outside this chunk)
 16850  	mov	esi, r10d
 16851  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 (8 elements = two xmm vectors)
 16852  	movd	xmm0, eax
 16853  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 16854  	lea	rdx, [rsi - 8]
 16855  	mov	r9, rdx
 16856  	shr	r9, 3
 16857  	add	r9, 1                           # r9 = (rsi - 8)/8 + 1 = number of 8-element groups
 16858  	test	rdx, rdx
 16859  	je	.LBB2_637                       # exactly one group: skip the two-group loop (target is outside this chunk)
 16860  # %bb.460:
 16861  	mov	rdx, r9
 16862  	and	rdx, -2
 16863  	neg	rdx                             # rdx = -(group count rounded down to even); counts up to 0 in steps of 2
 16864  	xor	edi, edi                        # rdi = element index
 16865  .LBB2_461:                              # =>This Inner Loop Header: Depth=1; 16 elements (two groups) per iteration
 16866  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 16867  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 16868  	paddd	xmm1, xmm0
 16869  	paddd	xmm2, xmm0
 16870  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 16871  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 16872  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi + 32]
 16873  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
 16874  	paddd	xmm1, xmm0
 16875  	paddd	xmm2, xmm0
 16876  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 16877  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 16878  	add	rdi, 16
 16879  	add	rdx, 2
 16880  	jne	.LBB2_461
 16881  	jmp	.LBB2_638                       # continues outside this chunk
 16882  .LBB2_462:                              # SSE 32-bit reversed subtract: [r8 + 4*i] = r11d - [rcx + 4*i] (r11d and r10d are set outside this chunk)
 16883  	mov	esi, r10d
 16884  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 (8 elements = two xmm vectors)
 16885  	movd	xmm0, r11d
 16886  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 16887  	lea	rdx, [rsi - 8]
 16888  	mov	r9, rdx
 16889  	shr	r9, 3
 16890  	add	r9, 1                           # r9 = (rsi - 8)/8 + 1 = number of 8-element groups
 16891  	test	rdx, rdx
 16892  	je	.LBB2_645                       # exactly one group: skip the two-group loop (target is outside this chunk)
 16893  # %bb.463:
 16894  	mov	rdx, r9
 16895  	and	rdx, -2
 16896  	neg	rdx                             # rdx = -(group count rounded down to even); counts up to 0 in steps of 2
 16897  	xor	edi, edi                        # rdi = element index
 16898  .LBB2_464:                              # =>This Inner Loop Header: Depth=1; 16 elements (two groups) per iteration
 16899  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 16900  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 16901  	movdqa	xmm3, xmm0                      # copy the broadcast scalar, because psubd overwrites its destination
 16902  	psubd	xmm3, xmm1
 16903  	movdqa	xmm1, xmm0
 16904  	psubd	xmm1, xmm2
 16905  	movdqu	xmmword ptr [r8 + 4*rdi], xmm3
 16906  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm1
 16907  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi + 32]
 16908  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
 16909  	movdqa	xmm3, xmm0
 16910  	psubd	xmm3, xmm1
 16911  	movdqa	xmm1, xmm0
 16912  	psubd	xmm1, xmm2
 16913  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm3
 16914  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm1
 16915  	add	rdi, 16
 16916  	add	rdx, 2
 16917  	jne	.LBB2_464
 16918  	jmp	.LBB2_646                       # continues outside this chunk
 16919  .LBB2_465:                              # SSE 32-bit add: [r8 + 4*i] = [rcx + 4*i] + eax (eax and r10d are set outside this chunk)
 16920  	mov	esi, r10d
 16921  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 (8 elements = two xmm vectors)
 16922  	movd	xmm0, eax
 16923  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 16924  	lea	rdx, [rsi - 8]
 16925  	mov	r9, rdx
 16926  	shr	r9, 3
 16927  	add	r9, 1                           # r9 = (rsi - 8)/8 + 1 = number of 8-element groups
 16928  	test	rdx, rdx
 16929  	je	.LBB2_653                       # exactly one group: skip the two-group loop (target is outside this chunk)
 16930  # %bb.466:
 16931  	mov	rdx, r9
 16932  	and	rdx, -2
 16933  	neg	rdx                             # rdx = -(group count rounded down to even); counts up to 0 in steps of 2
 16934  	xor	edi, edi                        # rdi = element index
 16935  .LBB2_467:                              # =>This Inner Loop Header: Depth=1; 16 elements (two groups) per iteration
 16936  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 16937  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 16938  	paddd	xmm1, xmm0
 16939  	paddd	xmm2, xmm0
 16940  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 16941  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 16942  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi + 32]
 16943  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
 16944  	paddd	xmm1, xmm0
 16945  	paddd	xmm2, xmm0
 16946  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 16947  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 16948  	add	rdi, 16
 16949  	add	rdx, 2
 16950  	jne	.LBB2_467
 16951  	jmp	.LBB2_654                       # continues outside this chunk
 16952  .LBB2_468:                              # SSE 32-bit reversed subtract: [r8 + 4*i] = r11d - [rcx + 4*i] (r11d and r10d are set outside this chunk)
 16953  	mov	esi, r10d
 16954  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 (8 elements = two xmm vectors)
 16955  	movd	xmm0, r11d
 16956  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 16957  	lea	rdx, [rsi - 8]
 16958  	mov	r9, rdx
 16959  	shr	r9, 3
 16960  	add	r9, 1                           # r9 = (rsi - 8)/8 + 1 = number of 8-element groups
 16961  	test	rdx, rdx
 16962  	je	.LBB2_661                       # exactly one group: skip the two-group loop (target is outside this chunk)
 16963  # %bb.469:
 16964  	mov	rdx, r9
 16965  	and	rdx, -2
 16966  	neg	rdx                             # rdx = -(group count rounded down to even); counts up to 0 in steps of 2
 16967  	xor	edi, edi                        # rdi = element index
 16968  .LBB2_470:                              # =>This Inner Loop Header: Depth=1; 16 elements (two groups) per iteration
 16969  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 16970  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 16971  	movdqa	xmm3, xmm0                      # copy the broadcast scalar, because psubd overwrites its destination
 16972  	psubd	xmm3, xmm1
 16973  	movdqa	xmm1, xmm0
 16974  	psubd	xmm1, xmm2
 16975  	movdqu	xmmword ptr [r8 + 4*rdi], xmm3
 16976  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm1
 16977  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi + 32]
 16978  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
 16979  	movdqa	xmm3, xmm0
 16980  	psubd	xmm3, xmm1
 16981  	movdqa	xmm1, xmm0
 16982  	psubd	xmm1, xmm2
 16983  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm3
 16984  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm1
 16985  	add	rdi, 16
 16986  	add	rdx, 2
 16987  	jne	.LBB2_470
 16988  	jmp	.LBB2_662                       # continues outside this chunk
 16989  .LBB2_471:                              # SSE2 double multiply: [r8 + 8*i] = [rcx + 8*i] * low double of xmm0 (xmm0 and eax are set outside this chunk)
 16990  	mov	edx, eax
 16991  	and	edx, -4                         # rdx = eax rounded down to a multiple of 4 (4 doubles = two xmm vectors)
 16992  	movddup	xmm1, xmm0                      # xmm1 = xmm0[0,0]
 16993  	lea	rsi, [rdx - 4]
 16994  	mov	r9, rsi
 16995  	shr	r9, 2
 16996  	add	r9, 1                           # r9 = (rdx - 4)/4 + 1 = number of 4-element groups
 16997  	test	rsi, rsi
 16998  	je	.LBB2_669                       # exactly one group: skip the two-group loop (target is outside this chunk)
 16999  # %bb.472:
 17000  	mov	rsi, r9
 17001  	and	rsi, -2
 17002  	neg	rsi                             # rsi = -(group count rounded down to even); counts up to 0 in steps of 2
 17003  	xor	edi, edi                        # rdi = element index
 17004  .LBB2_473:                              # =>This Inner Loop Header: Depth=1; 8 doubles (two groups) per iteration
 17005  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 17006  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 17007  	mulpd	xmm2, xmm1
 17008  	mulpd	xmm3, xmm1
 17009  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 17010  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 17011  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 32]
 17012  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 48]
 17013  	mulpd	xmm2, xmm1
 17014  	mulpd	xmm3, xmm1
 17015  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
 17016  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm3
 17017  	add	rdi, 8
 17018  	add	rsi, 2
 17019  	jne	.LBB2_473
 17020  	jmp	.LBB2_670                       # continues outside this chunk
 17021  .LBB2_474:                              # SSE2 double multiply: [r8 + 8*i] = [rcx + 8*i] * low double of xmm0 (xmm0 and eax are set outside this chunk)
 17022  	mov	edx, eax
 17023  	and	edx, -4                         # rdx = eax rounded down to a multiple of 4 (4 doubles = two xmm vectors)
 17024  	movddup	xmm1, xmm0                      # xmm1 = xmm0[0,0]
 17025  	lea	rsi, [rdx - 4]
 17026  	mov	r9, rsi
 17027  	shr	r9, 2
 17028  	add	r9, 1                           # r9 = (rdx - 4)/4 + 1 = number of 4-element groups
 17029  	test	rsi, rsi
 17030  	je	.LBB2_677                       # exactly one group: skip the two-group loop (target is outside this chunk)
 17031  # %bb.475:
 17032  	mov	rsi, r9
 17033  	and	rsi, -2
 17034  	neg	rsi                             # rsi = -(group count rounded down to even); counts up to 0 in steps of 2
 17035  	xor	edi, edi                        # rdi = element index
 17036  .LBB2_476:                              # =>This Inner Loop Header: Depth=1; 8 doubles (two groups) per iteration
 17037  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 17038  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 17039  	mulpd	xmm2, xmm1
 17040  	mulpd	xmm3, xmm1
 17041  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 17042  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 17043  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 32]
 17044  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 48]
 17045  	mulpd	xmm2, xmm1
 17046  	mulpd	xmm3, xmm1
 17047  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
 17048  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm3
 17049  	add	rdi, 8
 17050  	add	rsi, 2
 17051  	jne	.LBB2_476
 17052  	jmp	.LBB2_678                       # continues outside this chunk
 17053  .LBB2_477:                              # SSE2 double add: [r8 + 8*i] = [rcx + 8*i] + low double of xmm0 (xmm0 and eax are set outside this chunk)
 17054  	mov	edx, eax
 17055  	and	edx, -4                         # rdx = eax rounded down to a multiple of 4 (4 doubles = two xmm vectors)
 17056  	movddup	xmm1, xmm0                      # xmm1 = xmm0[0,0]
 17057  	lea	rsi, [rdx - 4]
 17058  	mov	r9, rsi
 17059  	shr	r9, 2
 17060  	add	r9, 1                           # r9 = (rdx - 4)/4 + 1 = number of 4-element groups
 17061  	test	rsi, rsi
 17062  	je	.LBB2_685                       # exactly one group: skip the two-group loop (target is outside this chunk)
 17063  # %bb.478:
 17064  	mov	rsi, r9
 17065  	and	rsi, -2
 17066  	neg	rsi                             # rsi = -(group count rounded down to even); counts up to 0 in steps of 2
 17067  	xor	edi, edi                        # rdi = element index
 17068  .LBB2_479:                              # =>This Inner Loop Header: Depth=1; 8 doubles (two groups) per iteration
 17069  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 17070  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 17071  	addpd	xmm2, xmm1
 17072  	addpd	xmm3, xmm1
 17073  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 17074  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 17075  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 32]
 17076  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 48]
 17077  	addpd	xmm2, xmm1
 17078  	addpd	xmm3, xmm1
 17079  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
 17080  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm3
 17081  	add	rdi, 8
 17082  	add	rsi, 2
 17083  	jne	.LBB2_479
 17084  	jmp	.LBB2_686                       # continues outside this chunk
 17085  .LBB2_480:                              # SSE2 double reversed subtract: [r8 + 8*i] = low double of xmm0 - [rcx + 8*i] (xmm0 and eax are set outside this chunk)
 17086  	mov	edx, eax
 17087  	and	edx, -4                         # rdx = eax rounded down to a multiple of 4 (4 doubles = two xmm vectors)
 17088  	movddup	xmm1, xmm0                      # xmm1 = xmm0[0,0]
 17089  	lea	rsi, [rdx - 4]
 17090  	mov	r9, rsi
 17091  	shr	r9, 2
 17092  	add	r9, 1                           # r9 = (rdx - 4)/4 + 1 = number of 4-element groups
 17093  	test	rsi, rsi
 17094  	je	.LBB2_693                       # exactly one group: skip the two-group loop (target is outside this chunk)
 17095  # %bb.481:
 17096  	mov	rsi, r9
 17097  	and	rsi, -2
 17098  	neg	rsi                             # rsi = -(group count rounded down to even); counts up to 0 in steps of 2
 17099  	xor	edi, edi                        # rdi = element index
 17100  .LBB2_482:                              # =>This Inner Loop Header: Depth=1; 8 doubles (two groups) per iteration
 17101  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 17102  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 17103  	movapd	xmm4, xmm1                      # copy the broadcast scalar, because subpd overwrites its destination
 17104  	subpd	xmm4, xmm2
 17105  	movapd	xmm2, xmm1
 17106  	subpd	xmm2, xmm3
 17107  	movupd	xmmword ptr [r8 + 8*rdi], xmm4
 17108  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm2
 17109  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 32]
 17110  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 48]
 17111  	movapd	xmm4, xmm1
 17112  	subpd	xmm4, xmm2
 17113  	movapd	xmm2, xmm1
 17114  	subpd	xmm2, xmm3
 17115  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm4
 17116  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm2
 17117  	add	rdi, 8
 17118  	add	rsi, 2
 17119  	jne	.LBB2_482
 17120  	jmp	.LBB2_694                       # continues outside this chunk
 17121  .LBB2_483:                              # SSE2 double add: [r8 + 8*i] = [rcx + 8*i] + low double of xmm0 (xmm0 and eax are set outside this chunk)
 17122  	mov	edx, eax
 17123  	and	edx, -4                         # rdx = eax rounded down to a multiple of 4 (4 doubles = two xmm vectors)
 17124  	movddup	xmm1, xmm0                      # xmm1 = xmm0[0,0]
 17125  	lea	rsi, [rdx - 4]
 17126  	mov	r9, rsi
 17127  	shr	r9, 2
 17128  	add	r9, 1                           # r9 = (rdx - 4)/4 + 1 = number of 4-element groups
 17129  	test	rsi, rsi
 17130  	je	.LBB2_701                       # exactly one group: skip the two-group loop (target is outside this chunk)
 17131  # %bb.484:
 17132  	mov	rsi, r9
 17133  	and	rsi, -2
 17134  	neg	rsi                             # rsi = -(group count rounded down to even); counts up to 0 in steps of 2
 17135  	xor	edi, edi                        # rdi = element index
 17136  .LBB2_485:                              # =>This Inner Loop Header: Depth=1; 8 doubles (two groups) per iteration
 17137  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 17138  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 17139  	addpd	xmm2, xmm1
 17140  	addpd	xmm3, xmm1
 17141  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 17142  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 17143  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 32]
 17144  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 48]
 17145  	addpd	xmm2, xmm1
 17146  	addpd	xmm3, xmm1
 17147  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm2
 17148  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm3
 17149  	add	rdi, 8
 17150  	add	rsi, 2
 17151  	jne	.LBB2_485
 17152  	jmp	.LBB2_702                       # continues outside this chunk
 17153  .LBB2_486:                              # SSE2 double reversed subtract: [r8 + 8*i] = low double of xmm0 - [rcx + 8*i] (xmm0 and eax are set outside this chunk)
 17154  	mov	edx, eax
 17155  	and	edx, -4                         # rdx = eax rounded down to a multiple of 4 (4 doubles = two xmm vectors)
 17156  	movddup	xmm1, xmm0                      # xmm1 = xmm0[0,0]
 17157  	lea	rsi, [rdx - 4]
 17158  	mov	r9, rsi
 17159  	shr	r9, 2
 17160  	add	r9, 1                           # r9 = (rdx - 4)/4 + 1 = number of 4-element groups
 17161  	test	rsi, rsi
 17162  	je	.LBB2_709                       # exactly one group: skip the two-group loop (target is outside this chunk)
 17163  # %bb.487:
 17164  	mov	rsi, r9
 17165  	and	rsi, -2
 17166  	neg	rsi                             # rsi = -(group count rounded down to even); counts up to 0 in steps of 2
 17167  	xor	edi, edi                        # rdi = element index
 17168  .LBB2_488:                              # =>This Inner Loop Header: Depth=1; 8 doubles (two groups) per iteration
 17169  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 17170  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 17171  	movapd	xmm4, xmm1                      # copy the broadcast scalar, because subpd overwrites its destination
 17172  	subpd	xmm4, xmm2
 17173  	movapd	xmm2, xmm1
 17174  	subpd	xmm2, xmm3
 17175  	movupd	xmmword ptr [r8 + 8*rdi], xmm4
 17176  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm2
 17177  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 32]
 17178  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 48]
 17179  	movapd	xmm4, xmm1
 17180  	subpd	xmm4, xmm2
 17181  	movapd	xmm2, xmm1
 17182  	subpd	xmm2, xmm3
 17183  	movupd	xmmword ptr [r8 + 8*rdi + 32], xmm4
 17184  	movupd	xmmword ptr [r8 + 8*rdi + 48], xmm2
 17185  	add	rdi, 8
 17186  	add	rsi, 2
 17187  	jne	.LBB2_488
 17188  	jmp	.LBB2_710                       # continues outside this chunk
 17189  .LBB2_489:                              # byte multiply: [r8 + i] = low 8 bits of ([rcx + i] * dl), done in 16-bit lanes (dl and r10d are set outside this chunk)
 17190  	mov	edi, r10d
 17191  	and	edi, -32                        # rdi = r10d rounded down to a multiple of 32 (32 bytes = two xmm vectors)
 17192  	movzx	eax, dl
 17193  	movd	xmm0, eax
 17194  	pxor	xmm1, xmm1
 17195  	pshufb	xmm0, xmm1                      # all-zero shuffle mask: broadcast byte 0 (the scalar) to all 16 bytes of xmm0
 17196  	lea	rax, [rdi - 32]
 17197  	mov	r9, rax
 17198  	shr	r9, 5
 17199  	add	r9, 1                           # r9 = (rdi - 32)/32 + 1 = number of 32-byte groups
 17200  	pmovzxbw	xmm1, xmm0                      # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 17201  	test	rax, rax
 17202  	je	.LBB2_717                       # exactly one group: skip the two-group loop (target is outside this chunk)
 17203  # %bb.490:
 17204  	mov	rsi, r9
 17205  	and	rsi, -2
 17206  	neg	rsi                             # rsi = -(group count rounded down to even); counts up to 0 in steps of 2
 17207  	xor	eax, eax                        # rax = byte offset
 17208  	movdqa	xmm2, xmm0
 17209  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 17210  	movdqa	xmm3, xmmword ptr [rip + .LCPI2_0] # xmm3 = [255,255,255,255,255,255,255,255]
 17211  	movdqa	xmm4, xmm0
 17212  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 17213  .LBB2_491:                              # =>This Inner Loop Header: Depth=1; 64 bytes (two groups) per iteration
 17214  	movdqu	xmm5, xmmword ptr [rcx + rax]
 17215  	movdqu	xmm6, xmmword ptr [rcx + rax + 16]
 17216  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 17217  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 17218  	pmullw	xmm5, xmm2                      # high 8 bytes: 16-bit products; only the low byte of each word is kept below
 17219  	pand	xmm5, xmm3                      # keep the low byte of every word
 17220  	pmullw	xmm7, xmm1                      # low 8 bytes: 16-bit products
 17221  	pand	xmm7, xmm3
 17222  	packuswb	xmm7, xmm5              # repack 16 words to 16 bytes (values are already <= 255, so no saturation occurs)
 17223  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 17224  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 17225  	pmullw	xmm6, xmm4
 17226  	pand	xmm6, xmm3
 17227  	pmullw	xmm5, xmm1
 17228  	pand	xmm5, xmm3
 17229  	packuswb	xmm5, xmm6
 17230  	movdqu	xmmword ptr [r8 + rax], xmm7
 17231  	movdqu	xmmword ptr [r8 + rax + 16], xmm5
 17232  	movdqu	xmm5, xmmword ptr [rcx + rax + 32]
 17233  	movdqu	xmm6, xmmword ptr [rcx + rax + 48]
 17234  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 17235  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 17236  	pmullw	xmm5, xmm2
 17237  	pand	xmm5, xmm3
 17238  	pmullw	xmm7, xmm1
 17239  	pand	xmm7, xmm3
 17240  	packuswb	xmm7, xmm5
 17241  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 17242  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 17243  	pmullw	xmm6, xmm4
 17244  	pand	xmm6, xmm3
 17245  	pmullw	xmm5, xmm1
 17246  	pand	xmm5, xmm3
 17247  	packuswb	xmm5, xmm6
 17248  	movdqu	xmmword ptr [r8 + rax + 32], xmm7
 17249  	movdqu	xmmword ptr [r8 + rax + 48], xmm5
 17250  	add	rax, 64
 17251  	add	rsi, 2
 17252  	jne	.LBB2_491
 17253  	jmp	.LBB2_718                       # continues outside this chunk
 17254  .LBB2_492:                              # byte multiply: [r8 + i] = low 8 bits of ([rcx + i] * dl), done in 16-bit lanes (dl and r10d are set outside this chunk)
 17255  	mov	edi, r10d
 17256  	and	edi, -32                        # rdi = r10d rounded down to a multiple of 32 (32 bytes = two xmm vectors)
 17257  	movzx	eax, dl
 17258  	movd	xmm0, eax
 17259  	pxor	xmm1, xmm1
 17260  	pshufb	xmm0, xmm1                      # all-zero shuffle mask: broadcast byte 0 (the scalar) to all 16 bytes of xmm0
 17261  	lea	rax, [rdi - 32]
 17262  	mov	r9, rax
 17263  	shr	r9, 5
 17264  	add	r9, 1                           # r9 = (rdi - 32)/32 + 1 = number of 32-byte groups
 17265  	pmovzxbw	xmm1, xmm0                      # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 17266  	test	rax, rax
 17267  	je	.LBB2_725                       # exactly one group: skip the two-group loop (target is outside this chunk)
 17268  # %bb.493:
 17269  	mov	rsi, r9
 17270  	and	rsi, -2
 17271  	neg	rsi                             # rsi = -(group count rounded down to even); counts up to 0 in steps of 2
 17272  	xor	eax, eax                        # rax = byte offset
 17273  	movdqa	xmm2, xmm0
 17274  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 17275  	movdqa	xmm3, xmmword ptr [rip + .LCPI2_0] # xmm3 = [255,255,255,255,255,255,255,255]
 17276  	movdqa	xmm4, xmm0
 17277  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 17278  .LBB2_494:                              # =>This Inner Loop Header: Depth=1; 64 bytes (two groups) per iteration
 17279  	movdqu	xmm5, xmmword ptr [rcx + rax]
 17280  	movdqu	xmm6, xmmword ptr [rcx + rax + 16]
 17281  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 17282  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 17283  	pmullw	xmm5, xmm2                      # high 8 bytes: 16-bit products; only the low byte of each word is kept below
 17284  	pand	xmm5, xmm3                      # keep the low byte of every word
 17285  	pmullw	xmm7, xmm1                      # low 8 bytes: 16-bit products
 17286  	pand	xmm7, xmm3
 17287  	packuswb	xmm7, xmm5              # repack 16 words to 16 bytes (values are already <= 255, so no saturation occurs)
 17288  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 17289  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 17290  	pmullw	xmm6, xmm4
 17291  	pand	xmm6, xmm3
 17292  	pmullw	xmm5, xmm1
 17293  	pand	xmm5, xmm3
 17294  	packuswb	xmm5, xmm6
 17295  	movdqu	xmmword ptr [r8 + rax], xmm7
 17296  	movdqu	xmmword ptr [r8 + rax + 16], xmm5
 17297  	movdqu	xmm5, xmmword ptr [rcx + rax + 32]
 17298  	movdqu	xmm6, xmmword ptr [rcx + rax + 48]
 17299  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 17300  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 17301  	pmullw	xmm5, xmm2
 17302  	pand	xmm5, xmm3
 17303  	pmullw	xmm7, xmm1
 17304  	pand	xmm7, xmm3
 17305  	packuswb	xmm7, xmm5
 17306  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 17307  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 17308  	pmullw	xmm6, xmm4
 17309  	pand	xmm6, xmm3
 17310  	pmullw	xmm5, xmm1
 17311  	pand	xmm5, xmm3
 17312  	packuswb	xmm5, xmm6
 17313  	movdqu	xmmword ptr [r8 + rax + 32], xmm7
 17314  	movdqu	xmmword ptr [r8 + rax + 48], xmm5
 17315  	add	rax, 64
 17316  	add	rsi, 2
 17317  	jne	.LBB2_494
 17318  	jmp	.LBB2_726                       # continues outside this chunk
 17319  .LBB2_495:                              # byte add: [r8 + i] = [rcx + i] + al (al and r10d are set outside this chunk)
 17320  	mov	esi, r10d
 17321  	and	esi, -32                        # rsi = r10d rounded down to a multiple of 32 (32 bytes = two xmm vectors)
 17322  	movzx	edx, al
 17323  	movd	xmm0, edx
 17324  	pxor	xmm1, xmm1
 17325  	pshufb	xmm0, xmm1                      # all-zero shuffle mask: broadcast byte 0 (the scalar) to all 16 bytes of xmm0
 17326  	lea	rdx, [rsi - 32]
 17327  	mov	r9, rdx
 17328  	shr	r9, 5
 17329  	add	r9, 1                           # r9 = (rsi - 32)/32 + 1 = number of 32-byte groups
 17330  	test	rdx, rdx
 17331  	je	.LBB2_733                       # exactly one group: skip the two-group loop (target is outside this chunk)
 17332  # %bb.496:
 17333  	mov	rdx, r9
 17334  	and	rdx, -2
 17335  	neg	rdx                             # rdx = -(group count rounded down to even); counts up to 0 in steps of 2
 17336  	xor	edi, edi                        # rdi = byte offset
 17337  .LBB2_497:                              # =>This Inner Loop Header: Depth=1; 64 bytes (two groups) per iteration
 17338  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 17339  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 17340  	paddb	xmm1, xmm0
 17341  	paddb	xmm2, xmm0
 17342  	movdqu	xmmword ptr [r8 + rdi], xmm1
 17343  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 17344  	movdqu	xmm1, xmmword ptr [rcx + rdi + 32]
 17345  	movdqu	xmm2, xmmword ptr [rcx + rdi + 48]
 17346  	paddb	xmm1, xmm0
 17347  	paddb	xmm2, xmm0
 17348  	movdqu	xmmword ptr [r8 + rdi + 32], xmm1
 17349  	movdqu	xmmword ptr [r8 + rdi + 48], xmm2
 17350  	add	rdi, 64
 17351  	add	rdx, 2
 17352  	jne	.LBB2_497
 17353  	jmp	.LBB2_734                       # continues outside this chunk
 17354  .LBB2_498:                              # byte reversed subtract: [r8 + i] = r11b - [rcx + i] (r11b and r10d are set outside this chunk)
 17355  	mov	esi, r10d
 17356  	and	esi, -32                        # rsi = r10d rounded down to a multiple of 32 (32 bytes = two xmm vectors)
 17357  	movzx	edx, r11b
 17358  	movd	xmm0, edx
 17359  	pxor	xmm1, xmm1
 17360  	pshufb	xmm0, xmm1                      # all-zero shuffle mask: broadcast byte 0 (the scalar) to all 16 bytes of xmm0
 17361  	lea	rdx, [rsi - 32]
 17362  	mov	r9, rdx
 17363  	shr	r9, 5
 17364  	add	r9, 1                           # r9 = (rsi - 32)/32 + 1 = number of 32-byte groups
 17365  	test	rdx, rdx
 17366  	je	.LBB2_741                       # exactly one group: skip the two-group loop (target is outside this chunk)
 17367  # %bb.499:
 17368  	mov	rdx, r9
 17369  	and	rdx, -2
 17370  	neg	rdx                             # rdx = -(group count rounded down to even); counts up to 0 in steps of 2
 17371  	xor	edi, edi                        # rdi = byte offset
 17372  .LBB2_500:                              # =>This Inner Loop Header: Depth=1; 64 bytes (two groups) per iteration
 17373  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 17374  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 17375  	movdqa	xmm3, xmm0                      # copy the broadcast scalar, because psubb overwrites its destination
 17376  	psubb	xmm3, xmm1
 17377  	movdqa	xmm1, xmm0
 17378  	psubb	xmm1, xmm2
 17379  	movdqu	xmmword ptr [r8 + rdi], xmm3
 17380  	movdqu	xmmword ptr [r8 + rdi + 16], xmm1
 17381  	movdqu	xmm1, xmmword ptr [rcx + rdi + 32]
 17382  	movdqu	xmm2, xmmword ptr [rcx + rdi + 48]
 17383  	movdqa	xmm3, xmm0
 17384  	psubb	xmm3, xmm1
 17385  	movdqa	xmm1, xmm0
 17386  	psubb	xmm1, xmm2
 17387  	movdqu	xmmword ptr [r8 + rdi + 32], xmm3
 17388  	movdqu	xmmword ptr [r8 + rdi + 48], xmm1
 17389  	add	rdi, 64
 17390  	add	rdx, 2
 17391  	jne	.LBB2_500
 17392  	jmp	.LBB2_742                       # continues outside this chunk
 17393  .LBB2_501:                              # byte add: [r8 + i] = [rcx + i] + al (al and r10d are set outside this chunk)
 17394  	mov	esi, r10d
 17395  	and	esi, -32                        # rsi = r10d rounded down to a multiple of 32 (32 bytes = two xmm vectors)
 17396  	movzx	edx, al
 17397  	movd	xmm0, edx
 17398  	pxor	xmm1, xmm1
 17399  	pshufb	xmm0, xmm1                      # all-zero shuffle mask: broadcast byte 0 (the scalar) to all 16 bytes of xmm0
 17400  	lea	rdx, [rsi - 32]
 17401  	mov	r9, rdx
 17402  	shr	r9, 5
 17403  	add	r9, 1                           # r9 = (rsi - 32)/32 + 1 = number of 32-byte groups
 17404  	test	rdx, rdx
 17405  	je	.LBB2_749                       # exactly one group: skip the two-group loop (target is outside this chunk)
 17406  # %bb.502:
 17407  	mov	rdx, r9
 17408  	and	rdx, -2
 17409  	neg	rdx                             # rdx = -(group count rounded down to even); counts up to 0 in steps of 2
 17410  	xor	edi, edi                        # rdi = byte offset
 17411  .LBB2_503:                              # =>This Inner Loop Header: Depth=1; 64 bytes (two groups) per iteration
 17412  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 17413  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 17414  	paddb	xmm1, xmm0
 17415  	paddb	xmm2, xmm0
 17416  	movdqu	xmmword ptr [r8 + rdi], xmm1
 17417  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 17418  	movdqu	xmm1, xmmword ptr [rcx + rdi + 32]
 17419  	movdqu	xmm2, xmmword ptr [rcx + rdi + 48]
 17420  	paddb	xmm1, xmm0
 17421  	paddb	xmm2, xmm0
 17422  	movdqu	xmmword ptr [r8 + rdi + 32], xmm1
 17423  	movdqu	xmmword ptr [r8 + rdi + 48], xmm2
 17424  	add	rdi, 64
 17425  	add	rdx, 2
 17426  	jne	.LBB2_503
 17427  	jmp	.LBB2_750                       # continues outside this chunk
 17428  .LBB2_504:                              # byte reversed subtract: [r8 + i] = r11b - [rcx + i] (r11b and r10d are set outside this chunk)
 17429  	mov	esi, r10d
 17430  	and	esi, -32                        # rsi = r10d rounded down to a multiple of 32 (32 bytes = two xmm vectors)
 17431  	movzx	edx, r11b
 17432  	movd	xmm0, edx
 17433  	pxor	xmm1, xmm1
 17434  	pshufb	xmm0, xmm1                      # all-zero shuffle mask: broadcast byte 0 (the scalar) to all 16 bytes of xmm0
 17435  	lea	rdx, [rsi - 32]
 17436  	mov	r9, rdx
 17437  	shr	r9, 5
 17438  	add	r9, 1                           # r9 = (rsi - 32)/32 + 1 = number of 32-byte groups
 17439  	test	rdx, rdx
 17440  	je	.LBB2_757                       # exactly one group: skip the two-group loop (target is outside this chunk)
 17441  # %bb.505:
 17442  	mov	rdx, r9
 17443  	and	rdx, -2
 17444  	neg	rdx                             # rdx = -(group count rounded down to even); counts up to 0 in steps of 2
 17445  	xor	edi, edi                        # rdi = byte offset
 17446  .LBB2_506:                              # =>This Inner Loop Header: Depth=1; 64 bytes (two groups) per iteration
 17447  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 17448  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 17449  	movdqa	xmm3, xmm0                      # copy the broadcast scalar, because psubb overwrites its destination
 17450  	psubb	xmm3, xmm1
 17451  	movdqa	xmm1, xmm0
 17452  	psubb	xmm1, xmm2
 17453  	movdqu	xmmword ptr [r8 + rdi], xmm3
 17454  	movdqu	xmmword ptr [r8 + rdi + 16], xmm1
 17455  	movdqu	xmm1, xmmword ptr [rcx + rdi + 32]
 17456  	movdqu	xmm2, xmmword ptr [rcx + rdi + 48]
 17457  	movdqa	xmm3, xmm0
 17458  	psubb	xmm3, xmm1
 17459  	movdqa	xmm1, xmm0
 17460  	psubb	xmm1, xmm2
 17461  	movdqu	xmmword ptr [r8 + rdi + 32], xmm3
 17462  	movdqu	xmmword ptr [r8 + rdi + 48], xmm1
 17463  	add	rdi, 64
 17464  	add	rdx, 2
 17465  	jne	.LBB2_506
 17466  	jmp	.LBB2_758                       # continues outside this chunk
 17467  .LBB2_507:                              # 64-bit lanes: out[i] = in[i] + scalar (rax); rcx = input, r8 = output
 17468  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17469  	and	esi, -4                         # rsi = r10d rounded down to a multiple of 4 (4 qwords = 32 bytes per step)
 17470  	movq	xmm0, rax
 17471  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 17472  	lea	rdx, [rsi - 4]
 17473  	mov	r9, rdx
 17474  	shr	r9, 2
 17475  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 = number of 32-byte steps
 17476  	test	rdx, rdx
 17477  	je	.LBB2_765                       # rsi == 4 (single step): skip the 2x-unrolled loop
 17478  # %bb.508:
 17479  	mov	rdx, r9
 17480  	and	rdx, -2
 17481  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17482  	xor	edi, edi                        # rdi = element index (scaled by 8 in the addresses)
 17483  .LBB2_509:                              # =>This Inner Loop Header: Depth=1
 17484  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi] # first 32-byte step: load, add scalar, store
 17485  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 17486  	paddq	xmm1, xmm0
 17487  	paddq	xmm2, xmm0
 17488  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 17489  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 17490  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi + 32] # second 32-byte step (unroll factor 2)
 17491  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 48]
 17492  	paddq	xmm1, xmm0
 17493  	paddq	xmm2, xmm0
 17494  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm1
 17495  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm2
 17496  	add	rdi, 8                          # 8 qwords = 64 bytes per iteration
 17497  	add	rdx, 2
 17498  	jne	.LBB2_509
 17499  	jmp	.LBB2_766                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17500  .LBB2_510:                              # 64-bit lanes: out[i] = scalar (r11) - in[i]; rcx = input, r8 = output
 17501  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17502  	and	esi, -4                         # rsi = r10d rounded down to a multiple of 4 (4 qwords = 32 bytes per step)
 17503  	movq	xmm0, r11
 17504  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 17505  	lea	rdx, [rsi - 4]
 17506  	mov	r9, rdx
 17507  	shr	r9, 2
 17508  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 = number of 32-byte steps
 17509  	test	rdx, rdx
 17510  	je	.LBB2_773                       # rsi == 4 (single step): skip the 2x-unrolled loop
 17511  # %bb.511:
 17512  	mov	rdx, r9
 17513  	and	rdx, -2
 17514  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17515  	xor	edi, edi                        # rdi = element index (scaled by 8 in the addresses)
 17516  .LBB2_512:                              # =>This Inner Loop Header: Depth=1
 17517  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi]
 17518  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 17519  	movdqa	xmm3, xmm0                      # copy the scalar vector: psubq overwrites its destination
 17520  	psubq	xmm3, xmm1                      # xmm3 = scalar - in
 17521  	movdqa	xmm1, xmm0
 17522  	psubq	xmm1, xmm2                      # xmm1 = scalar - in (next 16 bytes)
 17523  	movdqu	xmmword ptr [r8 + 8*rdi], xmm3
 17524  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm1
 17525  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi + 32] # second 32-byte step (unroll factor 2)
 17526  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 48]
 17527  	movdqa	xmm3, xmm0
 17528  	psubq	xmm3, xmm1
 17529  	movdqa	xmm1, xmm0
 17530  	psubq	xmm1, xmm2
 17531  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm3
 17532  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm1
 17533  	add	rdi, 8                          # 8 qwords = 64 bytes per iteration
 17534  	add	rdx, 2
 17535  	jne	.LBB2_512
 17536  	jmp	.LBB2_774                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17537  .LBB2_513:                              # 64-bit lanes: out[i] = in[i] + scalar (rax); rcx = input, r8 = output
 17538  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17539  	and	esi, -4                         # rsi = r10d rounded down to a multiple of 4 (4 qwords = 32 bytes per step)
 17540  	movq	xmm0, rax
 17541  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 17542  	lea	rdx, [rsi - 4]
 17543  	mov	r9, rdx
 17544  	shr	r9, 2
 17545  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 = number of 32-byte steps
 17546  	test	rdx, rdx
 17547  	je	.LBB2_781                       # rsi == 4 (single step): skip the 2x-unrolled loop
 17548  # %bb.514:
 17549  	mov	rdx, r9
 17550  	and	rdx, -2
 17551  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17552  	xor	edi, edi                        # rdi = element index (scaled by 8 in the addresses)
 17553  .LBB2_515:                              # =>This Inner Loop Header: Depth=1
 17554  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi] # first 32-byte step: load, add scalar, store
 17555  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 17556  	paddq	xmm1, xmm0
 17557  	paddq	xmm2, xmm0
 17558  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 17559  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 17560  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi + 32] # second 32-byte step (unroll factor 2)
 17561  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 48]
 17562  	paddq	xmm1, xmm0
 17563  	paddq	xmm2, xmm0
 17564  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm1
 17565  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm2
 17566  	add	rdi, 8                          # 8 qwords = 64 bytes per iteration
 17567  	add	rdx, 2
 17568  	jne	.LBB2_515
 17569  	jmp	.LBB2_782                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17570  .LBB2_516:                              # 64-bit lanes: out[i] = scalar (r11) - in[i]; rcx = input, r8 = output
 17571  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17572  	and	esi, -4                         # rsi = r10d rounded down to a multiple of 4 (4 qwords = 32 bytes per step)
 17573  	movq	xmm0, r11
 17574  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 17575  	lea	rdx, [rsi - 4]
 17576  	mov	r9, rdx
 17577  	shr	r9, 2
 17578  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 = number of 32-byte steps
 17579  	test	rdx, rdx
 17580  	je	.LBB2_789                       # rsi == 4 (single step): skip the 2x-unrolled loop
 17581  # %bb.517:
 17582  	mov	rdx, r9
 17583  	and	rdx, -2
 17584  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17585  	xor	edi, edi                        # rdi = element index (scaled by 8 in the addresses)
 17586  .LBB2_518:                              # =>This Inner Loop Header: Depth=1
 17587  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi]
 17588  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 17589  	movdqa	xmm3, xmm0                      # copy the scalar vector: psubq overwrites its destination
 17590  	psubq	xmm3, xmm1                      # xmm3 = scalar - in
 17591  	movdqa	xmm1, xmm0
 17592  	psubq	xmm1, xmm2                      # xmm1 = scalar - in (next 16 bytes)
 17593  	movdqu	xmmword ptr [r8 + 8*rdi], xmm3
 17594  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm1
 17595  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi + 32] # second 32-byte step (unroll factor 2)
 17596  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 48]
 17597  	movdqa	xmm3, xmm0
 17598  	psubq	xmm3, xmm1
 17599  	movdqa	xmm1, xmm0
 17600  	psubq	xmm1, xmm2
 17601  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm3
 17602  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm1
 17603  	add	rdi, 8                          # 8 qwords = 64 bytes per iteration
 17604  	add	rdx, 2
 17605  	jne	.LBB2_518
 17606  	jmp	.LBB2_790                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17607  .LBB2_519:                              # 16-bit lanes: out[i] = low 16 bits of in[i] * scalar (low word of eax); rcx = input, r8 = output
 17608  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17609  	and	esi, -16                        # rsi = r10d rounded down to a multiple of 16 (16 words = 32 bytes per step)
 17610  	movd	xmm0, eax
 17611  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 17612  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 17613  	lea	rdx, [rsi - 16]
 17614  	mov	r9, rdx
 17615  	shr	r9, 4
 17616  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1 = number of 32-byte steps
 17617  	test	rdx, rdx
 17618  	je	.LBB2_797                       # rsi == 16 (single step): skip the 2x-unrolled loop
 17619  # %bb.520:
 17620  	mov	rdx, r9
 17621  	and	rdx, -2
 17622  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17623  	xor	edi, edi                        # rdi = element index (scaled by 2 in the addresses)
 17624  .LBB2_521:                              # =>This Inner Loop Header: Depth=1
 17625  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi] # first 32-byte step: load, multiply by scalar, store
 17626  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 17627  	pmullw	xmm1, xmm0
 17628  	pmullw	xmm2, xmm0
 17629  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 17630  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 17631  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi + 32] # second 32-byte step (unroll factor 2)
 17632  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
 17633  	pmullw	xmm1, xmm0
 17634  	pmullw	xmm2, xmm0
 17635  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 17636  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 17637  	add	rdi, 32                         # 32 words = 64 bytes per iteration
 17638  	add	rdx, 2
 17639  	jne	.LBB2_521
 17640  	jmp	.LBB2_798                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17641  .LBB2_522:                              # 16-bit lanes: out[i] = low 16 bits of in[i] * scalar (low word of eax); rcx = input, r8 = output
 17642  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17643  	and	esi, -16                        # rsi = r10d rounded down to a multiple of 16 (16 words = 32 bytes per step)
 17644  	movd	xmm0, eax
 17645  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 17646  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 17647  	lea	rdx, [rsi - 16]
 17648  	mov	r9, rdx
 17649  	shr	r9, 4
 17650  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1 = number of 32-byte steps
 17651  	test	rdx, rdx
 17652  	je	.LBB2_805                       # rsi == 16 (single step): skip the 2x-unrolled loop
 17653  # %bb.523:
 17654  	mov	rdx, r9
 17655  	and	rdx, -2
 17656  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17657  	xor	edi, edi                        # rdi = element index (scaled by 2 in the addresses)
 17658  .LBB2_524:                              # =>This Inner Loop Header: Depth=1
 17659  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi] # first 32-byte step: load, multiply by scalar, store
 17660  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 17661  	pmullw	xmm1, xmm0
 17662  	pmullw	xmm2, xmm0
 17663  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 17664  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 17665  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi + 32] # second 32-byte step (unroll factor 2)
 17666  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
 17667  	pmullw	xmm1, xmm0
 17668  	pmullw	xmm2, xmm0
 17669  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 17670  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 17671  	add	rdi, 32                         # 32 words = 64 bytes per iteration
 17672  	add	rdx, 2
 17673  	jne	.LBB2_524
 17674  	jmp	.LBB2_806                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17675  .LBB2_525:                              # 16-bit lanes: out[i] = low 16 bits of in[i] * scalar (low word of eax); rcx = input, r8 = output
 17676  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17677  	and	esi, -16                        # rsi = r10d rounded down to a multiple of 16 (16 words = 32 bytes per step)
 17678  	movd	xmm0, eax
 17679  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 17680  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 17681  	lea	rdx, [rsi - 16]
 17682  	mov	r9, rdx
 17683  	shr	r9, 4
 17684  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1 = number of 32-byte steps
 17685  	test	rdx, rdx
 17686  	je	.LBB2_813                       # rsi == 16 (single step): skip the 2x-unrolled loop
 17687  # %bb.526:
 17688  	mov	rdx, r9
 17689  	and	rdx, -2
 17690  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17691  	xor	edi, edi                        # rdi = element index (scaled by 2 in the addresses)
 17692  .LBB2_527:                              # =>This Inner Loop Header: Depth=1
 17693  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi] # first 32-byte step: load, multiply by scalar, store
 17694  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 17695  	pmullw	xmm1, xmm0
 17696  	pmullw	xmm2, xmm0
 17697  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 17698  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 17699  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi + 32] # second 32-byte step (unroll factor 2)
 17700  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
 17701  	pmullw	xmm1, xmm0
 17702  	pmullw	xmm2, xmm0
 17703  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 17704  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 17705  	add	rdi, 32                         # 32 words = 64 bytes per iteration
 17706  	add	rdx, 2
 17707  	jne	.LBB2_527
 17708  	jmp	.LBB2_814                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17709  .LBB2_528:                              # 16-bit lanes: out[i] = low 16 bits of in[i] * scalar (low word of eax); rcx = input, r8 = output
 17710  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17711  	and	esi, -16                        # rsi = r10d rounded down to a multiple of 16 (16 words = 32 bytes per step)
 17712  	movd	xmm0, eax
 17713  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 17714  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 17715  	lea	rdx, [rsi - 16]
 17716  	mov	r9, rdx
 17717  	shr	r9, 4
 17718  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1 = number of 32-byte steps
 17719  	test	rdx, rdx
 17720  	je	.LBB2_821                       # rsi == 16 (single step): skip the 2x-unrolled loop
 17721  # %bb.529:
 17722  	mov	rdx, r9
 17723  	and	rdx, -2
 17724  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17725  	xor	edi, edi                        # rdi = element index (scaled by 2 in the addresses)
 17726  .LBB2_530:                              # =>This Inner Loop Header: Depth=1
 17727  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi] # first 32-byte step: load, multiply by scalar, store
 17728  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 17729  	pmullw	xmm1, xmm0
 17730  	pmullw	xmm2, xmm0
 17731  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 17732  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 17733  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi + 32] # second 32-byte step (unroll factor 2)
 17734  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
 17735  	pmullw	xmm1, xmm0
 17736  	pmullw	xmm2, xmm0
 17737  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 17738  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 17739  	add	rdi, 32                         # 32 words = 64 bytes per iteration
 17740  	add	rdx, 2
 17741  	jne	.LBB2_530
 17742  	jmp	.LBB2_822                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17743  .LBB2_531:                              # 16-bit lanes: out[i] = in[i] + scalar (low word of eax); rcx = input, r8 = output
 17744  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17745  	and	esi, -16                        # rsi = r10d rounded down to a multiple of 16 (16 words = 32 bytes per step)
 17746  	movd	xmm0, eax
 17747  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 17748  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 17749  	lea	rdx, [rsi - 16]
 17750  	mov	r9, rdx
 17751  	shr	r9, 4
 17752  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1 = number of 32-byte steps
 17753  	test	rdx, rdx
 17754  	je	.LBB2_829                       # rsi == 16 (single step): skip the 2x-unrolled loop
 17755  # %bb.532:
 17756  	mov	rdx, r9
 17757  	and	rdx, -2
 17758  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17759  	xor	edi, edi                        # rdi = element index (scaled by 2 in the addresses)
 17760  .LBB2_533:                              # =>This Inner Loop Header: Depth=1
 17761  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi] # first 32-byte step: load, add scalar, store
 17762  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 17763  	paddw	xmm1, xmm0
 17764  	paddw	xmm2, xmm0
 17765  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 17766  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 17767  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi + 32] # second 32-byte step (unroll factor 2)
 17768  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
 17769  	paddw	xmm1, xmm0
 17770  	paddw	xmm2, xmm0
 17771  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 17772  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 17773  	add	rdi, 32                         # 32 words = 64 bytes per iteration
 17774  	add	rdx, 2
 17775  	jne	.LBB2_533
 17776  	jmp	.LBB2_830                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17777  .LBB2_534:                              # 16-bit lanes: out[i] = in[i] + scalar (low word of eax); rcx = input, r8 = output
 17778  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17779  	and	esi, -16                        # rsi = r10d rounded down to a multiple of 16 (16 words = 32 bytes per step)
 17780  	movd	xmm0, eax
 17781  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 17782  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 17783  	lea	rdx, [rsi - 16]
 17784  	mov	r9, rdx
 17785  	shr	r9, 4
 17786  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1 = number of 32-byte steps
 17787  	test	rdx, rdx
 17788  	je	.LBB2_837                       # rsi == 16 (single step): skip the 2x-unrolled loop
 17789  # %bb.535:
 17790  	mov	rdx, r9
 17791  	and	rdx, -2
 17792  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17793  	xor	edi, edi                        # rdi = element index (scaled by 2 in the addresses)
 17794  .LBB2_536:                              # =>This Inner Loop Header: Depth=1
 17795  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi] # first 32-byte step: load, add scalar, store
 17796  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 17797  	paddw	xmm1, xmm0
 17798  	paddw	xmm2, xmm0
 17799  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 17800  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 17801  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi + 32] # second 32-byte step (unroll factor 2)
 17802  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
 17803  	paddw	xmm1, xmm0
 17804  	paddw	xmm2, xmm0
 17805  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 17806  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 17807  	add	rdi, 32                         # 32 words = 64 bytes per iteration
 17808  	add	rdx, 2
 17809  	jne	.LBB2_536
 17810  	jmp	.LBB2_838                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17811  .LBB2_537:                              # 16-bit lanes: out[i] = scalar (low word of eax) - in[i]; rcx = input, r8 = output
 17812  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17813  	and	esi, -16                        # rsi = r10d rounded down to a multiple of 16 (16 words = 32 bytes per step)
 17814  	movd	xmm0, eax
 17815  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 17816  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 17817  	lea	rdx, [rsi - 16]
 17818  	mov	r9, rdx
 17819  	shr	r9, 4
 17820  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1 = number of 32-byte steps
 17821  	test	rdx, rdx
 17822  	je	.LBB2_845                       # rsi == 16 (single step): skip the 2x-unrolled loop
 17823  # %bb.538:
 17824  	mov	rdx, r9
 17825  	and	rdx, -2
 17826  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17827  	xor	edi, edi                        # rdi = element index (scaled by 2 in the addresses)
 17828  .LBB2_539:                              # =>This Inner Loop Header: Depth=1
 17829  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 17830  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 17831  	movdqa	xmm3, xmm0                      # copy the scalar vector: psubw overwrites its destination
 17832  	psubw	xmm3, xmm1                      # xmm3 = scalar - in
 17833  	movdqa	xmm1, xmm0
 17834  	psubw	xmm1, xmm2                      # xmm1 = scalar - in (next 16 bytes)
 17835  	movdqu	xmmword ptr [r8 + 2*rdi], xmm3
 17836  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm1
 17837  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi + 32] # second 32-byte step (unroll factor 2)
 17838  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
 17839  	movdqa	xmm3, xmm0
 17840  	psubw	xmm3, xmm1
 17841  	movdqa	xmm1, xmm0
 17842  	psubw	xmm1, xmm2
 17843  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm3
 17844  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm1
 17845  	add	rdi, 32                         # 32 words = 64 bytes per iteration
 17846  	add	rdx, 2
 17847  	jne	.LBB2_539
 17848  	jmp	.LBB2_846                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17849  .LBB2_540:                              # 16-bit lanes: out[i] = scalar (low word of eax) - in[i]; rcx = input, r8 = output
 17850  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17851  	and	esi, -16                        # rsi = r10d rounded down to a multiple of 16 (16 words = 32 bytes per step)
 17852  	movd	xmm0, eax
 17853  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 17854  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 17855  	lea	rdx, [rsi - 16]
 17856  	mov	r9, rdx
 17857  	shr	r9, 4
 17858  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1 = number of 32-byte steps
 17859  	test	rdx, rdx
 17860  	je	.LBB2_853                       # rsi == 16 (single step): skip the 2x-unrolled loop
 17861  # %bb.541:
 17862  	mov	rdx, r9
 17863  	and	rdx, -2
 17864  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17865  	xor	edi, edi                        # rdi = element index (scaled by 2 in the addresses)
 17866  .LBB2_542:                              # =>This Inner Loop Header: Depth=1
 17867  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 17868  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 17869  	movdqa	xmm3, xmm0                      # copy the scalar vector: psubw overwrites its destination
 17870  	psubw	xmm3, xmm1                      # xmm3 = scalar - in
 17871  	movdqa	xmm1, xmm0
 17872  	psubw	xmm1, xmm2                      # xmm1 = scalar - in (next 16 bytes)
 17873  	movdqu	xmmword ptr [r8 + 2*rdi], xmm3
 17874  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm1
 17875  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi + 32] # second 32-byte step (unroll factor 2)
 17876  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
 17877  	movdqa	xmm3, xmm0
 17878  	psubw	xmm3, xmm1
 17879  	movdqa	xmm1, xmm0
 17880  	psubw	xmm1, xmm2
 17881  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm3
 17882  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm1
 17883  	add	rdi, 32                         # 32 words = 64 bytes per iteration
 17884  	add	rdx, 2
 17885  	jne	.LBB2_542
 17886  	jmp	.LBB2_854                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17887  .LBB2_543:                              # 16-bit lanes: out[i] = in[i] + scalar (low word of eax); rcx = input, r8 = output
 17888  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17889  	and	esi, -16                        # rsi = r10d rounded down to a multiple of 16 (16 words = 32 bytes per step)
 17890  	movd	xmm0, eax
 17891  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 17892  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 17893  	lea	rdx, [rsi - 16]
 17894  	mov	r9, rdx
 17895  	shr	r9, 4
 17896  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1 = number of 32-byte steps
 17897  	test	rdx, rdx
 17898  	je	.LBB2_861                       # rsi == 16 (single step): skip the 2x-unrolled loop
 17899  # %bb.544:
 17900  	mov	rdx, r9
 17901  	and	rdx, -2
 17902  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17903  	xor	edi, edi                        # rdi = element index (scaled by 2 in the addresses)
 17904  .LBB2_545:                              # =>This Inner Loop Header: Depth=1
 17905  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi] # first 32-byte step: load, add scalar, store
 17906  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 17907  	paddw	xmm1, xmm0
 17908  	paddw	xmm2, xmm0
 17909  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 17910  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 17911  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi + 32] # second 32-byte step (unroll factor 2)
 17912  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
 17913  	paddw	xmm1, xmm0
 17914  	paddw	xmm2, xmm0
 17915  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 17916  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 17917  	add	rdi, 32                         # 32 words = 64 bytes per iteration
 17918  	add	rdx, 2
 17919  	jne	.LBB2_545
 17920  	jmp	.LBB2_862                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17921  .LBB2_546:                              # 16-bit lanes: out[i] = in[i] + scalar (low word of eax); rcx = input, r8 = output
 17922  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17923  	and	esi, -16                        # rsi = r10d rounded down to a multiple of 16 (16 words = 32 bytes per step)
 17924  	movd	xmm0, eax
 17925  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 17926  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 17927  	lea	rdx, [rsi - 16]
 17928  	mov	r9, rdx
 17929  	shr	r9, 4
 17930  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1 = number of 32-byte steps
 17931  	test	rdx, rdx
 17932  	je	.LBB2_869                       # rsi == 16 (single step): skip the 2x-unrolled loop
 17933  # %bb.547:
 17934  	mov	rdx, r9
 17935  	and	rdx, -2
 17936  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17937  	xor	edi, edi                        # rdi = element index (scaled by 2 in the addresses)
 17938  .LBB2_548:                              # =>This Inner Loop Header: Depth=1
 17939  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi] # first 32-byte step: load, add scalar, store
 17940  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 17941  	paddw	xmm1, xmm0
 17942  	paddw	xmm2, xmm0
 17943  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 17944  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 17945  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi + 32] # second 32-byte step (unroll factor 2)
 17946  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
 17947  	paddw	xmm1, xmm0
 17948  	paddw	xmm2, xmm0
 17949  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm1
 17950  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm2
 17951  	add	rdi, 32                         # 32 words = 64 bytes per iteration
 17952  	add	rdx, 2
 17953  	jne	.LBB2_548
 17954  	jmp	.LBB2_870                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17955  .LBB2_549:                              # 16-bit lanes: out[i] = scalar (low word of eax) - in[i]; rcx = input, r8 = output
 17956  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17957  	and	esi, -16                        # rsi = r10d rounded down to a multiple of 16 (16 words = 32 bytes per step)
 17958  	movd	xmm0, eax
 17959  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 17960  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 17961  	lea	rdx, [rsi - 16]
 17962  	mov	r9, rdx
 17963  	shr	r9, 4
 17964  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1 = number of 32-byte steps
 17965  	test	rdx, rdx
 17966  	je	.LBB2_877                       # rsi == 16 (single step): skip the 2x-unrolled loop
 17967  # %bb.550:
 17968  	mov	rdx, r9
 17969  	and	rdx, -2
 17970  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 17971  	xor	edi, edi                        # rdi = element index (scaled by 2 in the addresses)
 17972  .LBB2_551:                              # =>This Inner Loop Header: Depth=1
 17973  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 17974  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 17975  	movdqa	xmm3, xmm0                      # copy the scalar vector: psubw overwrites its destination
 17976  	psubw	xmm3, xmm1                      # xmm3 = scalar - in
 17977  	movdqa	xmm1, xmm0
 17978  	psubw	xmm1, xmm2                      # xmm1 = scalar - in (next 16 bytes)
 17979  	movdqu	xmmword ptr [r8 + 2*rdi], xmm3
 17980  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm1
 17981  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi + 32] # second 32-byte step (unroll factor 2)
 17982  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
 17983  	movdqa	xmm3, xmm0
 17984  	psubw	xmm3, xmm1
 17985  	movdqa	xmm1, xmm0
 17986  	psubw	xmm1, xmm2
 17987  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm3
 17988  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm1
 17989  	add	rdi, 32                         # 32 words = 64 bytes per iteration
 17990  	add	rdx, 2
 17991  	jne	.LBB2_551
 17992  	jmp	.LBB2_878                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 17993  .LBB2_552:                              # 16-bit lanes: out[i] = scalar (low word of eax) - in[i]; rcx = input, r8 = output
 17994  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 17995  	and	esi, -16                        # rsi = r10d rounded down to a multiple of 16 (16 words = 32 bytes per step)
 17996  	movd	xmm0, eax
 17997  	pshuflw	xmm0, xmm0, 224                 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
 17998  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 17999  	lea	rdx, [rsi - 16]
 18000  	mov	r9, rdx
 18001  	shr	r9, 4
 18002  	add	r9, 1                           # r9 = (rsi - 16) / 16 + 1 = number of 32-byte steps
 18003  	test	rdx, rdx
 18004  	je	.LBB2_885                       # rsi == 16 (single step): skip the 2x-unrolled loop
 18005  # %bb.553:
 18006  	mov	rdx, r9
 18007  	and	rdx, -2
 18008  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 18009  	xor	edi, edi                        # rdi = element index (scaled by 2 in the addresses)
 18010  .LBB2_554:                              # =>This Inner Loop Header: Depth=1
 18011  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 18012  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 18013  	movdqa	xmm3, xmm0                      # copy the scalar vector: psubw overwrites its destination
 18014  	psubw	xmm3, xmm1                      # xmm3 = scalar - in
 18015  	movdqa	xmm1, xmm0
 18016  	psubw	xmm1, xmm2                      # xmm1 = scalar - in (next 16 bytes)
 18017  	movdqu	xmmword ptr [r8 + 2*rdi], xmm3
 18018  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm1
 18019  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi + 32] # second 32-byte step (unroll factor 2)
 18020  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 48]
 18021  	movdqa	xmm3, xmm0
 18022  	psubw	xmm3, xmm1
 18023  	movdqa	xmm1, xmm0
 18024  	psubw	xmm1, xmm2
 18025  	movdqu	xmmword ptr [r8 + 2*rdi + 32], xmm3
 18026  	movdqu	xmmword ptr [r8 + 2*rdi + 48], xmm1
 18027  	add	rdi, 32                         # 32 words = 64 bytes per iteration
 18028  	add	rdx, 2
 18029  	jne	.LBB2_554
 18030  	jmp	.LBB2_886                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 18031  .LBB2_555:                              # packed float32: out[i] = in[i] * scalar (lane 0 of xmm0); rcx = input, r8 = output
 18032  	mov	edx, eax                        # NOTE(review): eax is presumably the element count here (set outside this view)
 18033  	and	edx, -8                         # rdx = eax rounded down to a multiple of 8 (8 floats = 32 bytes per step)
 18034  	movaps	xmm1, xmm0
 18035  	shufps	xmm1, xmm0, 0                   # xmm1 = xmm1[0,0],xmm0[0,0]
 18036  	lea	rsi, [rdx - 8]
 18037  	mov	r9, rsi
 18038  	shr	r9, 3
 18039  	add	r9, 1                           # r9 = (rdx - 8) / 8 + 1 = number of 32-byte steps
 18040  	test	rsi, rsi
 18041  	je	.LBB2_893                       # rdx == 8 (single step): skip the 2x-unrolled loop
 18042  # %bb.556:
 18043  	mov	rsi, r9
 18044  	and	rsi, -2
 18045  	neg	rsi                             # rsi = -(r9 & ~1); counts up to zero by 2
 18046  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 18047  .LBB2_557:                              # =>This Inner Loop Header: Depth=1
 18048  	movups	xmm2, xmmword ptr [rcx + 4*rdi] # first 32-byte step: load, multiply by scalar, store
 18049  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 16]
 18050  	mulps	xmm2, xmm1
 18051  	mulps	xmm3, xmm1
 18052  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 18053  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 18054  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 32] # second 32-byte step (unroll factor 2)
 18055  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 48]
 18056  	mulps	xmm2, xmm1
 18057  	mulps	xmm3, xmm1
 18058  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
 18059  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm3
 18060  	add	rdi, 16                         # 16 floats = 64 bytes per iteration
 18061  	add	rsi, 2
 18062  	jne	.LBB2_557
 18063  	jmp	.LBB2_894                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 18064  .LBB2_558:                              # packed float32: out[i] = in[i] * scalar (lane 0 of xmm0); rcx = input, r8 = output
 18065  	mov	edx, eax                        # NOTE(review): eax is presumably the element count here (set outside this view)
 18066  	and	edx, -8                         # rdx = eax rounded down to a multiple of 8 (8 floats = 32 bytes per step)
 18067  	movaps	xmm1, xmm0
 18068  	shufps	xmm1, xmm0, 0                   # xmm1 = xmm1[0,0],xmm0[0,0]
 18069  	lea	rsi, [rdx - 8]
 18070  	mov	r9, rsi
 18071  	shr	r9, 3
 18072  	add	r9, 1                           # r9 = (rdx - 8) / 8 + 1 = number of 32-byte steps
 18073  	test	rsi, rsi
 18074  	je	.LBB2_901                       # rdx == 8 (single step): skip the 2x-unrolled loop
 18075  # %bb.559:
 18076  	mov	rsi, r9
 18077  	and	rsi, -2
 18078  	neg	rsi                             # rsi = -(r9 & ~1); counts up to zero by 2
 18079  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 18080  .LBB2_560:                              # =>This Inner Loop Header: Depth=1
 18081  	movups	xmm2, xmmword ptr [rcx + 4*rdi] # first 32-byte step: load, multiply by scalar, store
 18082  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 16]
 18083  	mulps	xmm2, xmm1
 18084  	mulps	xmm3, xmm1
 18085  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 18086  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 18087  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 32] # second 32-byte step (unroll factor 2)
 18088  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 48]
 18089  	mulps	xmm2, xmm1
 18090  	mulps	xmm3, xmm1
 18091  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
 18092  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm3
 18093  	add	rdi, 16                         # 16 floats = 64 bytes per iteration
 18094  	add	rsi, 2
 18095  	jne	.LBB2_560
 18096  	jmp	.LBB2_902                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 18097  .LBB2_561:                              # 64-bit lanes: out[i] = in[i] + scalar (rax); rcx = input, r8 = output
 18098  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 18099  	and	esi, -4                         # rsi = r10d rounded down to a multiple of 4 (4 qwords = 32 bytes per step)
 18100  	movq	xmm0, rax
 18101  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 18102  	lea	rdx, [rsi - 4]
 18103  	mov	r9, rdx
 18104  	shr	r9, 2
 18105  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 = number of 32-byte steps
 18106  	test	rdx, rdx
 18107  	je	.LBB2_909                       # rsi == 4 (single step): skip the 2x-unrolled loop
 18108  # %bb.562:
 18109  	mov	rdx, r9
 18110  	and	rdx, -2
 18111  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 18112  	xor	edi, edi                        # rdi = element index (scaled by 8 in the addresses)
 18113  .LBB2_563:                              # =>This Inner Loop Header: Depth=1
 18114  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi] # first 32-byte step: load, add scalar, store
 18115  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 18116  	paddq	xmm1, xmm0
 18117  	paddq	xmm2, xmm0
 18118  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 18119  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 18120  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi + 32] # second 32-byte step (unroll factor 2)
 18121  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 48]
 18122  	paddq	xmm1, xmm0
 18123  	paddq	xmm2, xmm0
 18124  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm1
 18125  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm2
 18126  	add	rdi, 8                          # 8 qwords = 64 bytes per iteration
 18127  	add	rdx, 2
 18128  	jne	.LBB2_563
 18129  	jmp	.LBB2_910                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 18130  .LBB2_564:                              # packed float32: out[i] = in[i] + scalar (lane 0 of xmm0); rcx = input, r8 = output
 18131  	mov	edx, eax                        # NOTE(review): eax is presumably the element count here (set outside this view)
 18132  	and	edx, -8                         # rdx = eax rounded down to a multiple of 8 (8 floats = 32 bytes per step)
 18133  	movaps	xmm1, xmm0
 18134  	shufps	xmm1, xmm0, 0                   # xmm1 = xmm1[0,0],xmm0[0,0]
 18135  	lea	rsi, [rdx - 8]
 18136  	mov	r9, rsi
 18137  	shr	r9, 3
 18138  	add	r9, 1                           # r9 = (rdx - 8) / 8 + 1 = number of 32-byte steps
 18139  	test	rsi, rsi
 18140  	je	.LBB2_917                       # rdx == 8 (single step): skip the 2x-unrolled loop
 18141  # %bb.565:
 18142  	mov	rsi, r9
 18143  	and	rsi, -2
 18144  	neg	rsi                             # rsi = -(r9 & ~1); counts up to zero by 2
 18145  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 18146  .LBB2_566:                              # =>This Inner Loop Header: Depth=1
 18147  	movups	xmm2, xmmword ptr [rcx + 4*rdi] # first 32-byte step: load, add scalar, store
 18148  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 16]
 18149  	addps	xmm2, xmm1
 18150  	addps	xmm3, xmm1
 18151  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 18152  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 18153  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 32] # second 32-byte step (unroll factor 2)
 18154  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 48]
 18155  	addps	xmm2, xmm1
 18156  	addps	xmm3, xmm1
 18157  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
 18158  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm3
 18159  	add	rdi, 16                         # 16 floats = 64 bytes per iteration
 18160  	add	rsi, 2
 18161  	jne	.LBB2_566
 18162  	jmp	.LBB2_918                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 18163  .LBB2_567:                              # 64-bit lanes: out[i] = scalar (r11) - in[i]; rcx = input, r8 = output
 18164  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 18165  	and	esi, -4                         # rsi = r10d rounded down to a multiple of 4 (4 qwords = 32 bytes per step)
 18166  	movq	xmm0, r11
 18167  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 18168  	lea	rdx, [rsi - 4]
 18169  	mov	r9, rdx
 18170  	shr	r9, 2
 18171  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 = number of 32-byte steps
 18172  	test	rdx, rdx
 18173  	je	.LBB2_925                       # rsi == 4 (single step): skip the 2x-unrolled loop
 18174  # %bb.568:
 18175  	mov	rdx, r9
 18176  	and	rdx, -2
 18177  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 18178  	xor	edi, edi                        # rdi = element index (scaled by 8 in the addresses)
 18179  .LBB2_569:                              # =>This Inner Loop Header: Depth=1
 18180  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi]
 18181  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 18182  	movdqa	xmm3, xmm0                      # copy the scalar vector: psubq overwrites its destination
 18183  	psubq	xmm3, xmm1                      # xmm3 = scalar - in
 18184  	movdqa	xmm1, xmm0
 18185  	psubq	xmm1, xmm2                      # xmm1 = scalar - in (next 16 bytes)
 18186  	movdqu	xmmword ptr [r8 + 8*rdi], xmm3
 18187  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm1
 18188  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi + 32] # second 32-byte step (unroll factor 2)
 18189  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 48]
 18190  	movdqa	xmm3, xmm0
 18191  	psubq	xmm3, xmm1
 18192  	movdqa	xmm1, xmm0
 18193  	psubq	xmm1, xmm2
 18194  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm3
 18195  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm1
 18196  	add	rdi, 8                          # 8 qwords = 64 bytes per iteration
 18197  	add	rdx, 2
 18198  	jne	.LBB2_569
 18199  	jmp	.LBB2_926                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 18200  .LBB2_570:                              # packed float32: out[i] = scalar (lane 0 of xmm0) - in[i]; rcx = input, r8 = output
 18201  	mov	edx, eax                        # NOTE(review): eax is presumably the element count here (set outside this view)
 18202  	and	edx, -8                         # rdx = eax rounded down to a multiple of 8 (8 floats = 32 bytes per step)
 18203  	movaps	xmm1, xmm0
 18204  	shufps	xmm1, xmm0, 0                   # xmm1 = xmm1[0,0],xmm0[0,0]
 18205  	lea	rsi, [rdx - 8]
 18206  	mov	r9, rsi
 18207  	shr	r9, 3
 18208  	add	r9, 1                           # r9 = (rdx - 8) / 8 + 1 = number of 32-byte steps
 18209  	test	rsi, rsi
 18210  	je	.LBB2_933                       # rdx == 8 (single step): skip the 2x-unrolled loop
 18211  # %bb.571:
 18212  	mov	rsi, r9
 18213  	and	rsi, -2
 18214  	neg	rsi                             # rsi = -(r9 & ~1); counts up to zero by 2
 18215  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 18216  .LBB2_572:                              # =>This Inner Loop Header: Depth=1
 18217  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
 18218  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 16]
 18219  	movaps	xmm4, xmm1                      # copy the scalar vector: subps overwrites its destination
 18220  	subps	xmm4, xmm2                      # xmm4 = scalar - in
 18221  	movaps	xmm2, xmm1
 18222  	subps	xmm2, xmm3                      # xmm2 = scalar - in (next 16 bytes)
 18223  	movups	xmmword ptr [r8 + 4*rdi], xmm4
 18224  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm2
 18225  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 32] # second 32-byte step (unroll factor 2)
 18226  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 48]
 18227  	movaps	xmm4, xmm1
 18228  	subps	xmm4, xmm2
 18229  	movaps	xmm2, xmm1
 18230  	subps	xmm2, xmm3
 18231  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm4
 18232  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm2
 18233  	add	rdi, 16                         # 16 floats = 64 bytes per iteration
 18234  	add	rsi, 2
 18235  	jne	.LBB2_572
 18236  	jmp	.LBB2_934                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 18237  .LBB2_573:                              # 64-bit lanes: out[i] = in[i] + scalar (rax); rcx = input, r8 = output
 18238  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 18239  	and	esi, -4                         # rsi = r10d rounded down to a multiple of 4 (4 qwords = 32 bytes per step)
 18240  	movq	xmm0, rax
 18241  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 18242  	lea	rdx, [rsi - 4]
 18243  	mov	r9, rdx
 18244  	shr	r9, 2
 18245  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 = number of 32-byte steps
 18246  	test	rdx, rdx
 18247  	je	.LBB2_941                       # rsi == 4 (single step): skip the 2x-unrolled loop
 18248  # %bb.574:
 18249  	mov	rdx, r9
 18250  	and	rdx, -2
 18251  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 18252  	xor	edi, edi                        # rdi = element index (scaled by 8 in the addresses)
 18253  .LBB2_575:                              # =>This Inner Loop Header: Depth=1
 18254  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi] # first 32-byte step: load, add scalar, store
 18255  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 18256  	paddq	xmm1, xmm0
 18257  	paddq	xmm2, xmm0
 18258  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 18259  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 18260  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi + 32] # second 32-byte step (unroll factor 2)
 18261  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 48]
 18262  	paddq	xmm1, xmm0
 18263  	paddq	xmm2, xmm0
 18264  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm1
 18265  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm2
 18266  	add	rdi, 8                          # 8 qwords = 64 bytes per iteration
 18267  	add	rdx, 2
 18268  	jne	.LBB2_575
 18269  	jmp	.LBB2_942                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 18270  .LBB2_576:                              # packed float32: out[i] = in[i] + scalar (lane 0 of xmm0); rcx = input, r8 = output
 18271  	mov	edx, eax                        # NOTE(review): eax is presumably the element count here (set outside this view)
 18272  	and	edx, -8                         # rdx = eax rounded down to a multiple of 8 (8 floats = 32 bytes per step)
 18273  	movaps	xmm1, xmm0
 18274  	shufps	xmm1, xmm0, 0                   # xmm1 = xmm1[0,0],xmm0[0,0]
 18275  	lea	rsi, [rdx - 8]
 18276  	mov	r9, rsi
 18277  	shr	r9, 3
 18278  	add	r9, 1                           # r9 = (rdx - 8) / 8 + 1 = number of 32-byte steps
 18279  	test	rsi, rsi
 18280  	je	.LBB2_949                       # rdx == 8 (single step): skip the 2x-unrolled loop
 18281  # %bb.577:
 18282  	mov	rsi, r9
 18283  	and	rsi, -2
 18284  	neg	rsi                             # rsi = -(r9 & ~1); counts up to zero by 2
 18285  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 18286  .LBB2_578:                              # =>This Inner Loop Header: Depth=1
 18287  	movups	xmm2, xmmword ptr [rcx + 4*rdi] # first 32-byte step: load, add scalar, store
 18288  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 16]
 18289  	addps	xmm2, xmm1
 18290  	addps	xmm3, xmm1
 18291  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 18292  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 18293  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 32] # second 32-byte step (unroll factor 2)
 18294  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 48]
 18295  	addps	xmm2, xmm1
 18296  	addps	xmm3, xmm1
 18297  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm2
 18298  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm3
 18299  	add	rdi, 16                         # 16 floats = 64 bytes per iteration
 18300  	add	rsi, 2
 18301  	jne	.LBB2_578
 18302  	jmp	.LBB2_950                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 18303  .LBB2_579:                              # 64-bit lanes: out[i] = scalar (r11) - in[i]; rcx = input, r8 = output
 18304  	mov	esi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 18305  	and	esi, -4                         # rsi = r10d rounded down to a multiple of 4 (4 qwords = 32 bytes per step)
 18306  	movq	xmm0, r11
 18307  	pshufd	xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
 18308  	lea	rdx, [rsi - 4]
 18309  	mov	r9, rdx
 18310  	shr	r9, 2
 18311  	add	r9, 1                           # r9 = (rsi - 4) / 4 + 1 = number of 32-byte steps
 18312  	test	rdx, rdx
 18313  	je	.LBB2_957                       # rsi == 4 (single step): skip the 2x-unrolled loop
 18314  # %bb.580:
 18315  	mov	rdx, r9
 18316  	and	rdx, -2
 18317  	neg	rdx                             # rdx = -(r9 & ~1); counts up to zero by 2
 18318  	xor	edi, edi                        # rdi = element index (scaled by 8 in the addresses)
 18319  .LBB2_581:                              # =>This Inner Loop Header: Depth=1
 18320  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi]
 18321  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 18322  	movdqa	xmm3, xmm0                      # copy the scalar vector: psubq overwrites its destination
 18323  	psubq	xmm3, xmm1                      # xmm3 = scalar - in
 18324  	movdqa	xmm1, xmm0
 18325  	psubq	xmm1, xmm2                      # xmm1 = scalar - in (next 16 bytes)
 18326  	movdqu	xmmword ptr [r8 + 8*rdi], xmm3
 18327  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm1
 18328  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi + 32] # second 32-byte step (unroll factor 2)
 18329  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 48]
 18330  	movdqa	xmm3, xmm0
 18331  	psubq	xmm3, xmm1
 18332  	movdqa	xmm1, xmm0
 18333  	psubq	xmm1, xmm2
 18334  	movdqu	xmmword ptr [r8 + 8*rdi + 32], xmm3
 18335  	movdqu	xmmword ptr [r8 + 8*rdi + 48], xmm1
 18336  	add	rdi, 8                          # 8 qwords = 64 bytes per iteration
 18337  	add	rdx, 2
 18338  	jne	.LBB2_581
 18339  	jmp	.LBB2_958                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 18340  .LBB2_582:                              # packed float32: out[i] = scalar (lane 0 of xmm0) - in[i]; rcx = input, r8 = output
 18341  	mov	edx, eax                        # NOTE(review): eax is presumably the element count here (set outside this view)
 18342  	and	edx, -8                         # rdx = eax rounded down to a multiple of 8 (8 floats = 32 bytes per step)
 18343  	movaps	xmm1, xmm0
 18344  	shufps	xmm1, xmm0, 0                   # xmm1 = xmm1[0,0],xmm0[0,0]
 18345  	lea	rsi, [rdx - 8]
 18346  	mov	r9, rsi
 18347  	shr	r9, 3
 18348  	add	r9, 1                           # r9 = (rdx - 8) / 8 + 1 = number of 32-byte steps
 18349  	test	rsi, rsi
 18350  	je	.LBB2_965                       # rdx == 8 (single step): skip the 2x-unrolled loop
 18351  # %bb.583:
 18352  	mov	rsi, r9
 18353  	and	rsi, -2
 18354  	neg	rsi                             # rsi = -(r9 & ~1); counts up to zero by 2
 18355  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 18356  .LBB2_584:                              # =>This Inner Loop Header: Depth=1
 18357  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
 18358  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 16]
 18359  	movaps	xmm4, xmm1                      # copy the scalar vector: subps overwrites its destination
 18360  	subps	xmm4, xmm2                      # xmm4 = scalar - in
 18361  	movaps	xmm2, xmm1
 18362  	subps	xmm2, xmm3                      # xmm2 = scalar - in (next 16 bytes)
 18363  	movups	xmmword ptr [r8 + 4*rdi], xmm4
 18364  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm2
 18365  	movups	xmm2, xmmword ptr [rcx + 4*rdi + 32] # second 32-byte step (unroll factor 2)
 18366  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 48]
 18367  	movaps	xmm4, xmm1
 18368  	subps	xmm4, xmm2
 18369  	movaps	xmm2, xmm1
 18370  	subps	xmm2, xmm3
 18371  	movups	xmmword ptr [r8 + 4*rdi + 32], xmm4
 18372  	movups	xmmword ptr [r8 + 4*rdi + 48], xmm2
 18373  	add	rdi, 16                         # 16 floats = 64 bytes per iteration
 18374  	add	rsi, 2
 18375  	jne	.LBB2_584
 18376  	jmp	.LBB2_966                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 18377  .LBB2_585:                              # 8-bit lanes: out[i] = low 8 bits of in[i] * scalar (byte in dl); bytes are widened to words, multiplied with pmullw, masked and repacked
 18378  	mov	edi, r10d                       # NOTE(review): r10d is presumably the element count (set outside this view)
 18379  	and	edi, -32                        # rdi = r10d rounded down to a multiple of 32 (32 bytes per step)
 18380  	movzx	eax, dl
 18381  	movd	xmm0, eax
 18382  	pxor	xmm1, xmm1                      # all-zero shuffle mask
 18383  	pshufb	xmm0, xmm1                      # broadcast byte 0 (the scalar) to all 16 bytes of xmm0
 18384  	lea	rax, [rdi - 32]
 18385  	mov	r9, rax
 18386  	shr	r9, 5
 18387  	add	r9, 1                           # r9 = (rdi - 32) / 32 + 1 = number of 32-byte steps
 18388  	pmovzxbw	xmm1, xmm0                      # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 18389  	test	rax, rax
 18390  	je	.LBB2_973                       # rdi == 32 (single step): skip the 2x-unrolled loop
 18391  # %bb.586:
 18392  	mov	rsi, r9
 18393  	and	rsi, -2
 18394  	neg	rsi                             # rsi = -(r9 & ~1); counts up to zero by 2
 18395  	xor	eax, eax                        # rax = byte offset, starts at 0
 18396  	movdqa	xmm2, xmm0
 18397  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 18398  	movdqa	xmm3, xmmword ptr [rip + .LCPI2_0] # xmm3 = [255,255,255,255,255,255,255,255]
 18399  	movdqa	xmm4, xmm0
 18400  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 18401  .LBB2_587:                              # =>This Inner Loop Header: Depth=1
 18402  	movdqu	xmm5, xmmword ptr [rcx + rax]   # first 32-byte step: two 16-byte input vectors
 18403  	movdqu	xmm6, xmmword ptr [rcx + rax + 16]
 18404  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 18405  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 18406  	pmullw	xmm5, xmm2                      # high half: both operands hold each byte duplicated in a word; the low byte of the product is still a*b mod 256
 18407  	pand	xmm5, xmm3                      # keep only the low byte of each word
 18408  	pmullw	xmm7, xmm1                      # low half: zero-extended bytes times zero-extended scalar
 18409  	pand	xmm7, xmm3
 18410  	packuswb	xmm7, xmm5                      # words are <= 255 after the mask (per xmm3's annotation), so the saturating pack is exact
 18411  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 18412  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 18413  	pmullw	xmm6, xmm4
 18414  	pand	xmm6, xmm3
 18415  	pmullw	xmm5, xmm1
 18416  	pand	xmm5, xmm3
 18417  	packuswb	xmm5, xmm6
 18418  	movdqu	xmmword ptr [r8 + rax], xmm7
 18419  	movdqu	xmmword ptr [r8 + rax + 16], xmm5
 18420  	movdqu	xmm5, xmmword ptr [rcx + rax + 32] # second 32-byte step (unroll factor 2), same sequence
 18421  	movdqu	xmm6, xmmword ptr [rcx + rax + 48]
 18422  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 18423  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 18424  	pmullw	xmm5, xmm2
 18425  	pand	xmm5, xmm3
 18426  	pmullw	xmm7, xmm1
 18427  	pand	xmm7, xmm3
 18428  	packuswb	xmm7, xmm5
 18429  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 18430  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 18431  	pmullw	xmm6, xmm4
 18432  	pand	xmm6, xmm3
 18433  	pmullw	xmm5, xmm1
 18434  	pand	xmm5, xmm3
 18435  	packuswb	xmm5, xmm6
 18436  	movdqu	xmmword ptr [r8 + rax + 32], xmm7
 18437  	movdqu	xmmword ptr [r8 + rax + 48], xmm5
 18438  	add	rax, 64                         # advance by 64 bytes (two steps)
 18439  	add	rsi, 2
 18440  	jne	.LBB2_587
 18441  	jmp	.LBB2_974                       # NOTE(review): target not visible here; presumably finishes an odd leftover step and the tail
 18442  .LBB2_588:                              # vector path: multiply each byte at [rcx] by the low byte of edx, store to [r8]
 18443  	mov	edi, r10d
 18444  	and	edi, -32                        # rdi = r10d rounded down to a multiple of 32 (bytes covered by full chunks)
 18445  	movzx	eax, dl
 18446  	movd	xmm0, eax
 18447  	pxor	xmm1, xmm1
 18448  	pshufb	xmm0, xmm1                      # all-zero shuffle mask broadcasts byte 0 into every lane of xmm0
 18449  	lea	rax, [rdi - 32]
 18450  	mov	r9, rax
 18451  	shr	r9, 5
 18452  	add	r9, 1                           # r9 = (rdi - 32) / 32 + 1 = number of 32-byte chunks
 18453  	pmovzxbw	xmm1, xmm0                      # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 18454  	test	rax, rax
 18455  	je	.LBB2_981                       # rdi == 32 (a single chunk): bypass the loop below
 18456  # %bb.589:
 18457  	mov	rsi, r9
 18458  	and	rsi, -2
 18459  	neg	rsi                             # rsi = -(r9 & ~1): negative counter stepped by 2 up to zero
 18460  	xor	eax, eax                        # rax = byte offset into both buffers
 18461  	movdqa	xmm2, xmm0
 18462  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 18463  	movdqa	xmm3, xmmword ptr [rip + .LCPI2_0] # xmm3 = [255,255,255,255,255,255,255,255]
 18464  	movdqa	xmm4, xmm0
 18465  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 18466  .LBB2_590:                              # =>This Inner Loop Header: Depth=1
 18467  	movdqu	xmm5, xmmword ptr [rcx + rax]           # first chunk: 32 source bytes in two vectors
 18468  	movdqu	xmm6, xmmword ptr [rcx + rax + 16]
 18469  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 18470  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 18471  	pmullw	xmm5, xmm2                      # 16-bit products for source bytes 8..15
 18472  	pand	xmm5, xmm3                      # keep only the low byte of each 16-bit product
 18473  	pmullw	xmm7, xmm1                      # 16-bit products for source bytes 0..7
 18474  	pand	xmm7, xmm3
 18475  	packuswb	xmm7, xmm5              # repack the 16 masked words into 16 result bytes
 18476  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 18477  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 18478  	pmullw	xmm6, xmm4
 18479  	pand	xmm6, xmm3
 18480  	pmullw	xmm5, xmm1
 18481  	pand	xmm5, xmm3
 18482  	packuswb	xmm5, xmm6
 18483  	movdqu	xmmword ptr [r8 + rax], xmm7
 18484  	movdqu	xmmword ptr [r8 + rax + 16], xmm5
 18485  	movdqu	xmm5, xmmword ptr [rcx + rax + 32]      # second chunk: same sequence at offset +32
 18486  	movdqu	xmm6, xmmword ptr [rcx + rax + 48]
 18487  	pmovzxbw	xmm7, xmm5                      # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 18488  	punpckhbw	xmm5, xmm5              # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 18489  	pmullw	xmm5, xmm2
 18490  	pand	xmm5, xmm3
 18491  	pmullw	xmm7, xmm1
 18492  	pand	xmm7, xmm3
 18493  	packuswb	xmm7, xmm5
 18494  	pmovzxbw	xmm5, xmm6                      # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 18495  	punpckhbw	xmm6, xmm6              # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 18496  	pmullw	xmm6, xmm4
 18497  	pand	xmm6, xmm3
 18498  	pmullw	xmm5, xmm1
 18499  	pand	xmm5, xmm3
 18500  	packuswb	xmm5, xmm6
 18501  	movdqu	xmmword ptr [r8 + rax + 32], xmm7
 18502  	movdqu	xmmword ptr [r8 + rax + 48], xmm5
 18503  	add	rax, 64                         # two 32-byte chunks consumed per iteration
 18504  	add	rsi, 2
 18505  	jne	.LBB2_590                       # loop until the negative counter reaches zero
 18506  	jmp	.LBB2_982                       # presumably handles a leftover odd chunk - TODO confirm
 18507  .LBB2_591:                              # vector path: add the low byte of eax to each byte at [rcx], store to [r8]
 18508  	mov	esi, r10d
 18509  	and	esi, -32                        # rsi = r10d rounded down to a multiple of 32 bytes
 18510  	movzx	edx, al
 18511  	movd	xmm0, edx
 18512  	pxor	xmm1, xmm1
 18513  	pshufb	xmm0, xmm1                      # all-zero shuffle mask broadcasts byte 0 into every lane of xmm0
 18514  	lea	rdx, [rsi - 32]
 18515  	mov	r9, rdx
 18516  	shr	r9, 5
 18517  	add	r9, 1                           # r9 = (rsi - 32) / 32 + 1 = number of 32-byte chunks
 18518  	test	rdx, rdx
 18519  	je	.LBB2_989                       # rsi == 32 (a single chunk): bypass the loop below
 18520  # %bb.592:
 18521  	mov	rdx, r9
 18522  	and	rdx, -2
 18523  	neg	rdx                             # rdx = -(r9 & ~1): negative counter stepped by 2 up to zero
 18524  	xor	edi, edi                        # rdi = byte offset into both buffers
 18525  .LBB2_593:                              # =>This Inner Loop Header: Depth=1
 18526  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 18527  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 18528  	paddb	xmm1, xmm0                      # byte-wise wrapping add of the broadcast value
 18529  	paddb	xmm2, xmm0
 18530  	movdqu	xmmword ptr [r8 + rdi], xmm1
 18531  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 18532  	movdqu	xmm1, xmmword ptr [rcx + rdi + 32]      # second chunk of this iteration
 18533  	movdqu	xmm2, xmmword ptr [rcx + rdi + 48]
 18534  	paddb	xmm1, xmm0
 18535  	paddb	xmm2, xmm0
 18536  	movdqu	xmmword ptr [r8 + rdi + 32], xmm1
 18537  	movdqu	xmmword ptr [r8 + rdi + 48], xmm2
 18538  	add	rdi, 64                         # two 32-byte chunks consumed per iteration
 18539  	add	rdx, 2
 18540  	jne	.LBB2_593
 18541  	jmp	.LBB2_990                       # presumably handles a leftover odd chunk - TODO confirm
 18542  .LBB2_594:                              # vector path: (low byte of r11) minus each byte at [rcx], store to [r8]
 18543  	mov	esi, r10d
 18544  	and	esi, -32                        # rsi = r10d rounded down to a multiple of 32 bytes
 18545  	movzx	edx, r11b
 18546  	movd	xmm0, edx
 18547  	pxor	xmm1, xmm1
 18548  	pshufb	xmm0, xmm1                      # all-zero shuffle mask broadcasts byte 0 into every lane of xmm0
 18549  	lea	rdx, [rsi - 32]
 18550  	mov	r9, rdx
 18551  	shr	r9, 5
 18552  	add	r9, 1                           # r9 = (rsi - 32) / 32 + 1 = number of 32-byte chunks
 18553  	test	rdx, rdx
 18554  	je	.LBB2_997                       # rsi == 32 (a single chunk): bypass the loop below
 18555  # %bb.595:
 18556  	mov	rdx, r9
 18557  	and	rdx, -2
 18558  	neg	rdx                             # rdx = -(r9 & ~1): negative counter stepped by 2 up to zero
 18559  	xor	edi, edi                        # rdi = byte offset into both buffers
 18560  .LBB2_596:                              # =>This Inner Loop Header: Depth=1
 18561  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 18562  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 18563  	movdqa	xmm3, xmm0                      # copy the broadcast so xmm0 survives the subtraction
 18564  	psubb	xmm3, xmm1                      # xmm3 = broadcast - loaded bytes (wrapping)
 18565  	movdqa	xmm1, xmm0
 18566  	psubb	xmm1, xmm2
 18567  	movdqu	xmmword ptr [r8 + rdi], xmm3
 18568  	movdqu	xmmword ptr [r8 + rdi + 16], xmm1
 18569  	movdqu	xmm1, xmmword ptr [rcx + rdi + 32]      # second chunk of this iteration
 18570  	movdqu	xmm2, xmmword ptr [rcx + rdi + 48]
 18571  	movdqa	xmm3, xmm0
 18572  	psubb	xmm3, xmm1
 18573  	movdqa	xmm1, xmm0
 18574  	psubb	xmm1, xmm2
 18575  	movdqu	xmmword ptr [r8 + rdi + 32], xmm3
 18576  	movdqu	xmmword ptr [r8 + rdi + 48], xmm1
 18577  	add	rdi, 64                         # two 32-byte chunks consumed per iteration
 18578  	add	rdx, 2
 18579  	jne	.LBB2_596
 18580  	jmp	.LBB2_998                       # presumably handles a leftover odd chunk - TODO confirm
 18581  .LBB2_597:                              # vector path: add the low byte of eax to each byte at [rcx], store to [r8]
 18582  	mov	esi, r10d
 18583  	and	esi, -32                        # rsi = r10d rounded down to a multiple of 32 bytes
 18584  	movzx	edx, al
 18585  	movd	xmm0, edx
 18586  	pxor	xmm1, xmm1
 18587  	pshufb	xmm0, xmm1                      # all-zero shuffle mask broadcasts byte 0 into every lane of xmm0
 18588  	lea	rdx, [rsi - 32]
 18589  	mov	r9, rdx
 18590  	shr	r9, 5
 18591  	add	r9, 1                           # r9 = (rsi - 32) / 32 + 1 = number of 32-byte chunks
 18592  	test	rdx, rdx
 18593  	je	.LBB2_1005                      # rsi == 32 (a single chunk): bypass the loop below
 18594  # %bb.598:
 18595  	mov	rdx, r9
 18596  	and	rdx, -2
 18597  	neg	rdx                             # rdx = -(r9 & ~1): negative counter stepped by 2 up to zero
 18598  	xor	edi, edi                        # rdi = byte offset into both buffers
 18599  .LBB2_599:                              # =>This Inner Loop Header: Depth=1
 18600  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 18601  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 18602  	paddb	xmm1, xmm0                      # byte-wise wrapping add of the broadcast value
 18603  	paddb	xmm2, xmm0
 18604  	movdqu	xmmword ptr [r8 + rdi], xmm1
 18605  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 18606  	movdqu	xmm1, xmmword ptr [rcx + rdi + 32]      # second chunk of this iteration
 18607  	movdqu	xmm2, xmmword ptr [rcx + rdi + 48]
 18608  	paddb	xmm1, xmm0
 18609  	paddb	xmm2, xmm0
 18610  	movdqu	xmmword ptr [r8 + rdi + 32], xmm1
 18611  	movdqu	xmmword ptr [r8 + rdi + 48], xmm2
 18612  	add	rdi, 64                         # two 32-byte chunks consumed per iteration
 18613  	add	rdx, 2
 18614  	jne	.LBB2_599
 18615  	jmp	.LBB2_1006                      # presumably handles a leftover odd chunk - TODO confirm
 18616  .LBB2_600:                              # vector path: (low byte of r11) minus each byte at [rcx], store to [r8]
 18617  	mov	esi, r10d
 18618  	and	esi, -32                        # rsi = r10d rounded down to a multiple of 32 bytes
 18619  	movzx	edx, r11b
 18620  	movd	xmm0, edx
 18621  	pxor	xmm1, xmm1
 18622  	pshufb	xmm0, xmm1                      # all-zero shuffle mask broadcasts byte 0 into every lane of xmm0
 18623  	lea	rdx, [rsi - 32]
 18624  	mov	r9, rdx
 18625  	shr	r9, 5
 18626  	add	r9, 1                           # r9 = (rsi - 32) / 32 + 1 = number of 32-byte chunks
 18627  	test	rdx, rdx
 18628  	je	.LBB2_1013                      # rsi == 32 (a single chunk): bypass the loop below
 18629  # %bb.601:
 18630  	mov	rdx, r9
 18631  	and	rdx, -2
 18632  	neg	rdx                             # rdx = -(r9 & ~1): negative counter stepped by 2 up to zero
 18633  	xor	edi, edi                        # rdi = byte offset into both buffers
 18634  .LBB2_602:                              # =>This Inner Loop Header: Depth=1
 18635  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 18636  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 18637  	movdqa	xmm3, xmm0                      # copy the broadcast so xmm0 survives the subtraction
 18638  	psubb	xmm3, xmm1                      # xmm3 = broadcast - loaded bytes (wrapping)
 18639  	movdqa	xmm1, xmm0
 18640  	psubb	xmm1, xmm2
 18641  	movdqu	xmmword ptr [r8 + rdi], xmm3
 18642  	movdqu	xmmword ptr [r8 + rdi + 16], xmm1
 18643  	movdqu	xmm1, xmmword ptr [rcx + rdi + 32]      # second chunk of this iteration
 18644  	movdqu	xmm2, xmmword ptr [rcx + rdi + 48]
 18645  	movdqa	xmm3, xmm0
 18646  	psubb	xmm3, xmm1
 18647  	movdqa	xmm1, xmm0
 18648  	psubb	xmm1, xmm2
 18649  	movdqu	xmmword ptr [r8 + rdi + 32], xmm3
 18650  	movdqu	xmmword ptr [r8 + rdi + 48], xmm1
 18651  	add	rdi, 64                         # two 32-byte chunks consumed per iteration
 18652  	add	rdx, 2
 18653  	jne	.LBB2_602
 18654  	jmp	.LBB2_1014                      # presumably handles a leftover odd chunk - TODO confirm
 18655  .LBB2_603:                              # vector path: multiply each 32-bit element at [rcx] by eax, store to [r8]
 18656  	mov	esi, r10d
 18657  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 elements
 18658  	movd	xmm0, eax
 18659  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 18660  	lea	rdx, [rsi - 8]
 18661  	mov	r9, rdx
 18662  	shr	r9, 3
 18663  	add	r9, 1                           # r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
 18664  	test	rdx, rdx
 18665  	je	.LBB2_1021                      # rsi == 8 (a single chunk): bypass the loop below
 18666  # %bb.604:
 18667  	mov	rdx, r9
 18668  	and	rdx, -2
 18669  	neg	rdx                             # rdx = -(r9 & ~1): negative counter stepped by 2 up to zero
 18670  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 18671  .LBB2_605:                              # =>This Inner Loop Header: Depth=1
 18672  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 18673  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 18674  	pmulld	xmm1, xmm0                      # low 32 bits of each 32x32 product
 18675  	pmulld	xmm2, xmm0
 18676  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 18677  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 18678  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi + 32]    # second chunk of this iteration
 18679  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
 18680  	pmulld	xmm1, xmm0
 18681  	pmulld	xmm2, xmm0
 18682  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 18683  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 18684  	add	rdi, 16                         # 16 elements consumed per iteration
 18685  	add	rdx, 2
 18686  	jne	.LBB2_605
 18687  	jmp	.LBB2_1022                      # presumably handles a leftover odd chunk - TODO confirm
 18688  .LBB2_606:                              # vector path: multiply each 32-bit element at [rcx] by eax, store to [r8]
 18689  	mov	esi, r10d
 18690  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 elements
 18691  	movd	xmm0, eax
 18692  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 18693  	lea	rdx, [rsi - 8]
 18694  	mov	r9, rdx
 18695  	shr	r9, 3
 18696  	add	r9, 1                           # r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
 18697  	test	rdx, rdx
 18698  	je	.LBB2_1029                      # rsi == 8 (a single chunk): bypass the loop below
 18699  # %bb.607:
 18700  	mov	rdx, r9
 18701  	and	rdx, -2
 18702  	neg	rdx                             # rdx = -(r9 & ~1): negative counter stepped by 2 up to zero
 18703  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 18704  .LBB2_608:                              # =>This Inner Loop Header: Depth=1
 18705  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 18706  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 18707  	pmulld	xmm1, xmm0                      # low 32 bits of each 32x32 product
 18708  	pmulld	xmm2, xmm0
 18709  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 18710  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 18711  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi + 32]    # second chunk of this iteration
 18712  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
 18713  	pmulld	xmm1, xmm0
 18714  	pmulld	xmm2, xmm0
 18715  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 18716  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 18717  	add	rdi, 16                         # 16 elements consumed per iteration
 18718  	add	rdx, 2
 18719  	jne	.LBB2_608
 18720  	jmp	.LBB2_1030                      # presumably handles a leftover odd chunk - TODO confirm
 18721  .LBB2_609:                              # vector path: add eax to each 32-bit element at [rcx], store to [r8]
 18722  	mov	esi, r10d
 18723  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 elements
 18724  	movd	xmm0, eax
 18725  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 18726  	lea	rdx, [rsi - 8]
 18727  	mov	r9, rdx
 18728  	shr	r9, 3
 18729  	add	r9, 1                           # r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
 18730  	test	rdx, rdx
 18731  	je	.LBB2_1037                      # rsi == 8 (a single chunk): bypass the loop below
 18732  # %bb.610:
 18733  	mov	rdx, r9
 18734  	and	rdx, -2
 18735  	neg	rdx                             # rdx = -(r9 & ~1): negative counter stepped by 2 up to zero
 18736  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 18737  .LBB2_611:                              # =>This Inner Loop Header: Depth=1
 18738  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 18739  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 18740  	paddd	xmm1, xmm0                      # 32-bit wrapping add of the broadcast value
 18741  	paddd	xmm2, xmm0
 18742  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 18743  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 18744  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi + 32]    # second chunk of this iteration
 18745  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
 18746  	paddd	xmm1, xmm0
 18747  	paddd	xmm2, xmm0
 18748  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 18749  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 18750  	add	rdi, 16                         # 16 elements consumed per iteration
 18751  	add	rdx, 2
 18752  	jne	.LBB2_611
 18753  	jmp	.LBB2_1038                      # presumably handles a leftover odd chunk - TODO confirm
 18754  .LBB2_612:                              # vector path: r11d minus each 32-bit element at [rcx], store to [r8]
 18755  	mov	esi, r10d
 18756  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 elements
 18757  	movd	xmm0, r11d
 18758  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 18759  	lea	rdx, [rsi - 8]
 18760  	mov	r9, rdx
 18761  	shr	r9, 3
 18762  	add	r9, 1                           # r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
 18763  	test	rdx, rdx
 18764  	je	.LBB2_1045                      # rsi == 8 (a single chunk): bypass the loop below
 18765  # %bb.613:
 18766  	mov	rdx, r9
 18767  	and	rdx, -2
 18768  	neg	rdx                             # rdx = -(r9 & ~1): negative counter stepped by 2 up to zero
 18769  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 18770  .LBB2_614:                              # =>This Inner Loop Header: Depth=1
 18771  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 18772  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 18773  	movdqa	xmm3, xmm0                      # copy the broadcast so xmm0 survives the subtraction
 18774  	psubd	xmm3, xmm1                      # xmm3 = broadcast - loaded elements (wrapping)
 18775  	movdqa	xmm1, xmm0
 18776  	psubd	xmm1, xmm2
 18777  	movdqu	xmmword ptr [r8 + 4*rdi], xmm3
 18778  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm1
 18779  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi + 32]    # second chunk of this iteration
 18780  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
 18781  	movdqa	xmm3, xmm0
 18782  	psubd	xmm3, xmm1
 18783  	movdqa	xmm1, xmm0
 18784  	psubd	xmm1, xmm2
 18785  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm3
 18786  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm1
 18787  	add	rdi, 16                         # 16 elements consumed per iteration
 18788  	add	rdx, 2
 18789  	jne	.LBB2_614
 18790  	jmp	.LBB2_1046                      # presumably handles a leftover odd chunk - TODO confirm
 18791  .LBB2_615:                              # vector path: add eax to each 32-bit element at [rcx], store to [r8]
 18792  	mov	esi, r10d
 18793  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 elements
 18794  	movd	xmm0, eax
 18795  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 18796  	lea	rdx, [rsi - 8]
 18797  	mov	r9, rdx
 18798  	shr	r9, 3
 18799  	add	r9, 1                           # r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
 18800  	test	rdx, rdx
 18801  	je	.LBB2_1053                      # rsi == 8 (a single chunk): bypass the loop below
 18802  # %bb.616:
 18803  	mov	rdx, r9
 18804  	and	rdx, -2
 18805  	neg	rdx                             # rdx = -(r9 & ~1): negative counter stepped by 2 up to zero
 18806  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 18807  .LBB2_617:                              # =>This Inner Loop Header: Depth=1
 18808  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 18809  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 18810  	paddd	xmm1, xmm0                      # 32-bit wrapping add of the broadcast value
 18811  	paddd	xmm2, xmm0
 18812  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 18813  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 18814  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi + 32]    # second chunk of this iteration
 18815  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
 18816  	paddd	xmm1, xmm0
 18817  	paddd	xmm2, xmm0
 18818  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm1
 18819  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm2
 18820  	add	rdi, 16                         # 16 elements consumed per iteration
 18821  	add	rdx, 2
 18822  	jne	.LBB2_617
 18823  	jmp	.LBB2_1054                      # presumably handles a leftover odd chunk - TODO confirm
 18824  .LBB2_618:                              # vector path: r11d minus each 32-bit element at [rcx], store to [r8]
 18825  	mov	esi, r10d
 18826  	and	esi, -8                         # rsi = r10d rounded down to a multiple of 8 elements
 18827  	movd	xmm0, r11d
 18828  	pshufd	xmm0, xmm0, 0                   # xmm0 = xmm0[0,0,0,0]
 18829  	lea	rdx, [rsi - 8]
 18830  	mov	r9, rdx
 18831  	shr	r9, 3
 18832  	add	r9, 1                           # r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
 18833  	test	rdx, rdx
 18834  	je	.LBB2_1061                      # rsi == 8 (a single chunk): bypass the loop below
 18835  # %bb.619:
 18836  	mov	rdx, r9
 18837  	and	rdx, -2
 18838  	neg	rdx                             # rdx = -(r9 & ~1): negative counter stepped by 2 up to zero
 18839  	xor	edi, edi                        # rdi = element index (scaled by 4 in the addresses)
 18840  .LBB2_620:                              # =>This Inner Loop Header: Depth=1
 18841  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 18842  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 18843  	movdqa	xmm3, xmm0                      # copy the broadcast so xmm0 survives the subtraction
 18844  	psubd	xmm3, xmm1                      # xmm3 = broadcast - loaded elements (wrapping)
 18845  	movdqa	xmm1, xmm0
 18846  	psubd	xmm1, xmm2
 18847  	movdqu	xmmword ptr [r8 + 4*rdi], xmm3
 18848  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm1
 18849  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi + 32]    # second chunk of this iteration
 18850  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 48]
 18851  	movdqa	xmm3, xmm0
 18852  	psubd	xmm3, xmm1
 18853  	movdqa	xmm1, xmm0
 18854  	psubd	xmm1, xmm2
 18855  	movdqu	xmmword ptr [r8 + 4*rdi + 32], xmm3
 18856  	movdqu	xmmword ptr [r8 + 4*rdi + 48], xmm1
 18857  	add	rdi, 16                         # 16 elements consumed per iteration
 18858  	add	rdx, 2
 18859  	jne	.LBB2_620
 18860  	jmp	.LBB2_1062                      # presumably handles a leftover odd chunk - TODO confirm
 18861  .LBB2_621:                              # tail step of a 32-bit multiply path: at most one more 8-element chunk
 18862  	xor	edi, edi                        # start at element index 0
 18863  .LBB2_622:
 18864  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 18865  	je	.LBB2_624
 18866  # %bb.623:
 18867  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 18868  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 18869  	pmulld	xmm1, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 18870  	pmulld	xmm2, xmm0
 18871  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 18872  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 18873  .LBB2_624:
 18874  	cmp	rsi, r10
 18875  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_625
 18876  	jmp	.LBB2_625                       # presumably the scalar loop for leftover elements - TODO confirm
 18877  .LBB2_629:                              # tail step of a 32-bit multiply path: at most one more 8-element chunk
 18878  	xor	edi, edi                        # start at element index 0
 18879  .LBB2_630:
 18880  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 18881  	je	.LBB2_632
 18882  # %bb.631:
 18883  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 18884  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 18885  	pmulld	xmm1, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 18886  	pmulld	xmm2, xmm0
 18887  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 18888  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 18889  .LBB2_632:
 18890  	cmp	rsi, r10
 18891  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_633
 18892  	jmp	.LBB2_633                       # presumably the scalar loop for leftover elements - TODO confirm
 18893  .LBB2_637:                              # tail step of a 32-bit add path: at most one more 8-element chunk
 18894  	xor	edi, edi                        # start at element index 0
 18895  .LBB2_638:
 18896  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 18897  	je	.LBB2_640
 18898  # %bb.639:
 18899  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 18900  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 18901  	paddd	xmm1, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 18902  	paddd	xmm2, xmm0
 18903  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 18904  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 18905  .LBB2_640:
 18906  	cmp	rsi, r10
 18907  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_641
 18908  	jmp	.LBB2_641                       # presumably the scalar loop for leftover elements - TODO confirm
 18909  .LBB2_645:                              # tail step of a 32-bit subtract path: at most one more 8-element chunk
 18910  	xor	edi, edi                        # start at element index 0
 18911  .LBB2_646:
 18912  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 18913  	je	.LBB2_648
 18914  # %bb.647:
 18915  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 18916  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 18917  	movdqa	xmm3, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 18918  	psubd	xmm3, xmm1                      # xmm3 = xmm0 - first four elements
 18919  	psubd	xmm0, xmm2                      # xmm0 is overwritten in place; not read again in this block
 18920  	movdqu	xmmword ptr [r8 + 4*rdi], xmm3
 18921  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
 18922  .LBB2_648:
 18923  	cmp	rsi, r10
 18924  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_649
 18925  	jmp	.LBB2_649                       # presumably the scalar loop for leftover elements - TODO confirm
 18926  .LBB2_653:                              # tail step of a 32-bit add path: at most one more 8-element chunk
 18927  	xor	edi, edi                        # start at element index 0
 18928  .LBB2_654:
 18929  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 18930  	je	.LBB2_656
 18931  # %bb.655:
 18932  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 18933  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 18934  	paddd	xmm1, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 18935  	paddd	xmm2, xmm0
 18936  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 18937  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 18938  .LBB2_656:
 18939  	cmp	rsi, r10
 18940  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_657
 18941  	jmp	.LBB2_657                       # presumably the scalar loop for leftover elements - TODO confirm
 18942  .LBB2_661:                              # tail step of a 32-bit subtract path: at most one more 8-element chunk
 18943  	xor	edi, edi                        # start at element index 0
 18944  .LBB2_662:
 18945  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 18946  	je	.LBB2_664
 18947  # %bb.663:
 18948  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 18949  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 18950  	movdqa	xmm3, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 18951  	psubd	xmm3, xmm1                      # xmm3 = xmm0 - first four elements
 18952  	psubd	xmm0, xmm2                      # xmm0 is overwritten in place; not read again in this block
 18953  	movdqu	xmmword ptr [r8 + 4*rdi], xmm3
 18954  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
 18955  .LBB2_664:
 18956  	cmp	rsi, r10
 18957  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_665
 18958  	jmp	.LBB2_665                       # presumably the scalar loop for leftover elements - TODO confirm
 18959  .LBB2_669:                              # tail step of a double multiply path: at most one more 4-element chunk
 18960  	xor	edi, edi                        # start at element index 0
 18961  .LBB2_670:
 18962  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 18963  	je	.LBB2_672
 18964  # %bb.671:
 18965  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 18966  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 18967  	mulpd	xmm2, xmm1                      # xmm1 presumably holds the broadcast scalar - TODO confirm
 18968  	mulpd	xmm3, xmm1
 18969  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 18970  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 18971  .LBB2_672:
 18972  	cmp	rdx, rax
 18973  	je	.LBB2_1069                      # rdx == rax: bypass .LBB2_673
 18974  	jmp	.LBB2_673                       # presumably the scalar loop for leftover elements - TODO confirm
 18975  .LBB2_677:                              # tail step of a double multiply path: at most one more 4-element chunk
 18976  	xor	edi, edi                        # start at element index 0
 18977  .LBB2_678:
 18978  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 18979  	je	.LBB2_680
 18980  # %bb.679:
 18981  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 18982  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 18983  	mulpd	xmm2, xmm1                      # xmm1 presumably holds the broadcast scalar - TODO confirm
 18984  	mulpd	xmm3, xmm1
 18985  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 18986  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 18987  .LBB2_680:
 18988  	cmp	rdx, rax
 18989  	je	.LBB2_1069                      # rdx == rax: bypass .LBB2_681
 18990  	jmp	.LBB2_681                       # presumably the scalar loop for leftover elements - TODO confirm
 18991  .LBB2_685:                              # tail step of a double add path: at most one more 4-element chunk
 18992  	xor	edi, edi                        # start at element index 0
 18993  .LBB2_686:
 18994  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 18995  	je	.LBB2_688
 18996  # %bb.687:
 18997  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 18998  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 18999  	addpd	xmm2, xmm1                      # xmm1 presumably holds the broadcast scalar - TODO confirm
 19000  	addpd	xmm3, xmm1
 19001  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 19002  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 19003  .LBB2_688:
 19004  	cmp	rdx, rax
 19005  	je	.LBB2_1069                      # rdx == rax: bypass .LBB2_689
 19006  	jmp	.LBB2_689                       # presumably the scalar loop for leftover elements - TODO confirm
 19007  .LBB2_693:                              # tail step of a double subtract path: at most one more 4-element chunk
 19008  	xor	edi, edi                        # start at element index 0
 19009  .LBB2_694:
 19010  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19011  	je	.LBB2_696
 19012  # %bb.695:
 19013  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 19014  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 19015  	movapd	xmm4, xmm1                      # xmm1 presumably holds the broadcast scalar - TODO confirm
 19016  	subpd	xmm4, xmm2                      # xmm4 = xmm1 - first two elements
 19017  	subpd	xmm1, xmm3                      # xmm1 is overwritten in place; not read again in this block
 19018  	movupd	xmmword ptr [r8 + 8*rdi], xmm4
 19019  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm1
 19020  .LBB2_696:
 19021  	cmp	rdx, rax
 19022  	je	.LBB2_1069                      # rdx == rax: bypass .LBB2_697
 19023  	jmp	.LBB2_697                       # presumably the scalar loop for leftover elements - TODO confirm
 19024  .LBB2_701:                              # tail step of a double add path: at most one more 4-element chunk
 19025  	xor	edi, edi                        # start at element index 0
 19026  .LBB2_702:
 19027  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19028  	je	.LBB2_704
 19029  # %bb.703:
 19030  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 19031  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 19032  	addpd	xmm2, xmm1                      # xmm1 presumably holds the broadcast scalar - TODO confirm
 19033  	addpd	xmm3, xmm1
 19034  	movupd	xmmword ptr [r8 + 8*rdi], xmm2
 19035  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm3
 19036  .LBB2_704:
 19037  	cmp	rdx, rax
 19038  	je	.LBB2_1069                      # rdx == rax: bypass .LBB2_705
 19039  	jmp	.LBB2_705                       # presumably the scalar loop for leftover elements - TODO confirm
 19040  .LBB2_709:                              # tail step of a double subtract path: at most one more 4-element chunk
 19041  	xor	edi, edi                        # start at element index 0
 19042  .LBB2_710:
 19043  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19044  	je	.LBB2_712
 19045  # %bb.711:
 19046  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 19047  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 19048  	movapd	xmm4, xmm1                      # xmm1 presumably holds the broadcast scalar - TODO confirm
 19049  	subpd	xmm4, xmm2                      # xmm4 = xmm1 - first two elements
 19050  	subpd	xmm1, xmm3                      # xmm1 is overwritten in place; not read again in this block
 19051  	movupd	xmmword ptr [r8 + 8*rdi], xmm4
 19052  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm1
 19053  .LBB2_712:
 19054  	cmp	rdx, rax
 19055  	je	.LBB2_1069                      # rdx == rax: bypass .LBB2_713
 19056  	jmp	.LBB2_713                       # presumably the scalar loop for leftover elements - TODO confirm
 19057  .LBB2_717:                              # tail step of a byte multiply path: at most one more 32-byte chunk
 19058  	xor	eax, eax                        # start at byte offset 0
 19059  .LBB2_718:
 19060  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19061  	je	.LBB2_720
 19062  # %bb.719:
 19063  	movdqu	xmm2, xmmword ptr [rcx + rax]
 19064  	movdqu	xmm3, xmmword ptr [rcx + rax + 16]
 19065  	movdqa	xmm4, xmm0                      # xmm0 presumably holds the byte-broadcast scalar - TODO confirm
 19066  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19067  	pmovzxbw	xmm5, xmm2                      # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 19068  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19069  	pmullw	xmm2, xmm4                      # 16-bit products for source bytes 8..15
 19070  	movdqa	xmm4, xmmword ptr [rip + .LCPI2_0] # xmm4 = [255,255,255,255,255,255,255,255]
 19071  	pand	xmm2, xmm4                      # keep only the low byte of each 16-bit product
 19072  	pmullw	xmm5, xmm1                      # xmm1 presumably holds the scalar widened to words - TODO confirm
 19073  	pand	xmm5, xmm4
 19074  	packuswb	xmm5, xmm2              # repack the 16 masked words into 16 result bytes
 19075  	punpckhbw	xmm0, xmm0              # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19076  	pmovzxbw	xmm2, xmm3                      # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 19077  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19078  	pmullw	xmm3, xmm0
 19079  	pand	xmm3, xmm4
 19080  	pmullw	xmm2, xmm1
 19081  	pand	xmm2, xmm4
 19082  	packuswb	xmm2, xmm3
 19083  	movdqu	xmmword ptr [r8 + rax], xmm5
 19084  	movdqu	xmmword ptr [r8 + rax + 16], xmm2
 19085  .LBB2_720:
 19086  	cmp	rdi, r10
 19087  	je	.LBB2_1069                      # rdi == r10: bypass .LBB2_721
 19088  	jmp	.LBB2_721                       # presumably the scalar loop for leftover elements - TODO confirm
 19089  .LBB2_725:                              # tail step of a byte multiply path: at most one more 32-byte chunk
 19090  	xor	eax, eax                        # start at byte offset 0
 19091  .LBB2_726:
 19092  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19093  	je	.LBB2_728
 19094  # %bb.727:
 19095  	movdqu	xmm2, xmmword ptr [rcx + rax]
 19096  	movdqu	xmm3, xmmword ptr [rcx + rax + 16]
 19097  	movdqa	xmm4, xmm0                      # xmm0 presumably holds the byte-broadcast scalar - TODO confirm
 19098  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19099  	pmovzxbw	xmm5, xmm2                      # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 19100  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19101  	pmullw	xmm2, xmm4                      # 16-bit products for source bytes 8..15
 19102  	movdqa	xmm4, xmmword ptr [rip + .LCPI2_0] # xmm4 = [255,255,255,255,255,255,255,255]
 19103  	pand	xmm2, xmm4                      # keep only the low byte of each 16-bit product
 19104  	pmullw	xmm5, xmm1                      # xmm1 presumably holds the scalar widened to words - TODO confirm
 19105  	pand	xmm5, xmm4
 19106  	packuswb	xmm5, xmm2              # repack the 16 masked words into 16 result bytes
 19107  	punpckhbw	xmm0, xmm0              # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19108  	pmovzxbw	xmm2, xmm3                      # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 19109  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19110  	pmullw	xmm3, xmm0
 19111  	pand	xmm3, xmm4
 19112  	pmullw	xmm2, xmm1
 19113  	pand	xmm2, xmm4
 19114  	packuswb	xmm2, xmm3
 19115  	movdqu	xmmword ptr [r8 + rax], xmm5
 19116  	movdqu	xmmword ptr [r8 + rax + 16], xmm2
 19117  .LBB2_728:
 19118  	cmp	rdi, r10
 19119  	je	.LBB2_1069                      # rdi == r10: bypass .LBB2_729
 19120  	jmp	.LBB2_729                       # presumably the scalar loop for leftover elements - TODO confirm
 19121  .LBB2_733:                              # tail step of a byte add path: at most one more 32-byte chunk
 19122  	xor	edi, edi                        # start at byte offset 0
 19123  .LBB2_734:
 19124  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19125  	je	.LBB2_736
 19126  # %bb.735:
 19127  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 19128  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 19129  	paddb	xmm1, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19130  	paddb	xmm2, xmm0
 19131  	movdqu	xmmword ptr [r8 + rdi], xmm1
 19132  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 19133  .LBB2_736:
 19134  	cmp	rsi, r10
 19135  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_737
 19136  	jmp	.LBB2_737                       # presumably the scalar loop for leftover elements - TODO confirm
 19137  .LBB2_741:                              # tail step of a byte subtract path: at most one more 32-byte chunk
 19138  	xor	edi, edi                        # start at byte offset 0
 19139  .LBB2_742:
 19140  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19141  	je	.LBB2_744
 19142  # %bb.743:
 19143  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 19144  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 19145  	movdqa	xmm3, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19146  	psubb	xmm3, xmm1                      # xmm3 = xmm0 - first 16 bytes
 19147  	psubb	xmm0, xmm2                      # xmm0 is overwritten in place; not read again in this block
 19148  	movdqu	xmmword ptr [r8 + rdi], xmm3
 19149  	movdqu	xmmword ptr [r8 + rdi + 16], xmm0
 19150  .LBB2_744:
 19151  	cmp	rsi, r10
 19152  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_745
 19153  	jmp	.LBB2_745                       # presumably the scalar loop for leftover elements - TODO confirm
 19154  .LBB2_749:                              # tail step of a byte add path: at most one more 32-byte chunk
 19155  	xor	edi, edi                        # start at byte offset 0
 19156  .LBB2_750:
 19157  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19158  	je	.LBB2_752
 19159  # %bb.751:
 19160  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 19161  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 19162  	paddb	xmm1, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19163  	paddb	xmm2, xmm0
 19164  	movdqu	xmmword ptr [r8 + rdi], xmm1
 19165  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 19166  .LBB2_752:
 19167  	cmp	rsi, r10
 19168  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_753
 19169  	jmp	.LBB2_753                       # presumably the scalar loop for leftover elements - TODO confirm
 19170  .LBB2_757:                              # tail step of a byte subtract path: at most one more 32-byte chunk
 19171  	xor	edi, edi                        # start at byte offset 0
 19172  .LBB2_758:
 19173  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19174  	je	.LBB2_760
 19175  # %bb.759:
 19176  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 19177  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 19178  	movdqa	xmm3, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19179  	psubb	xmm3, xmm1                      # xmm3 = xmm0 - first 16 bytes
 19180  	psubb	xmm0, xmm2                      # xmm0 is overwritten in place; not read again in this block
 19181  	movdqu	xmmword ptr [r8 + rdi], xmm3
 19182  	movdqu	xmmword ptr [r8 + rdi + 16], xmm0
 19183  .LBB2_760:
 19184  	cmp	rsi, r10
 19185  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_761
 19186  	jmp	.LBB2_761                       # presumably the scalar loop for leftover elements - TODO confirm
 19187  .LBB2_765:                              # tail step of a 64-bit add path: at most one more 4-element chunk
 19188  	xor	edi, edi                        # start at element index 0
 19189  .LBB2_766:
 19190  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19191  	je	.LBB2_768
 19192  # %bb.767:
 19193  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi]
 19194  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 19195  	paddq	xmm1, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19196  	paddq	xmm2, xmm0
 19197  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 19198  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 19199  .LBB2_768:
 19200  	cmp	rsi, r10
 19201  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_769
 19202  	jmp	.LBB2_769                       # presumably the scalar loop for leftover elements - TODO confirm
 19203  .LBB2_773:                              # tail step of a 64-bit subtract path: at most one more 4-element chunk
 19204  	xor	edi, edi                        # start at element index 0
 19205  .LBB2_774:
 19206  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19207  	je	.LBB2_776
 19208  # %bb.775:
 19209  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi]
 19210  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 19211  	movdqa	xmm3, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19212  	psubq	xmm3, xmm1                      # xmm3 = xmm0 - first two elements
 19213  	psubq	xmm0, xmm2                      # xmm0 is overwritten in place; not read again in this block
 19214  	movdqu	xmmword ptr [r8 + 8*rdi], xmm3
 19215  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm0
 19216  .LBB2_776:
 19217  	cmp	rsi, r10
 19218  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_777
 19219  	jmp	.LBB2_777                       # presumably the scalar loop for leftover elements - TODO confirm
 19220  .LBB2_781:                              # tail step of a 64-bit add path: at most one more 4-element chunk
 19221  	xor	edi, edi                        # start at element index 0
 19222  .LBB2_782:
 19223  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19224  	je	.LBB2_784
 19225  # %bb.783:
 19226  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi]
 19227  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 19228  	paddq	xmm1, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19229  	paddq	xmm2, xmm0
 19230  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 19231  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 19232  .LBB2_784:
 19233  	cmp	rsi, r10
 19234  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_785
 19235  	jmp	.LBB2_785                       # presumably the scalar loop for leftover elements - TODO confirm
 19236  .LBB2_789:                              # tail step of a 64-bit subtract path: at most one more 4-element chunk
 19237  	xor	edi, edi                        # start at element index 0
 19238  .LBB2_790:
 19239  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19240  	je	.LBB2_792
 19241  # %bb.791:
 19242  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi]
 19243  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 19244  	movdqa	xmm3, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19245  	psubq	xmm3, xmm1                      # xmm3 = xmm0 - first two elements
 19246  	psubq	xmm0, xmm2                      # xmm0 is overwritten in place; not read again in this block
 19247  	movdqu	xmmword ptr [r8 + 8*rdi], xmm3
 19248  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm0
 19249  .LBB2_792:
 19250  	cmp	rsi, r10
 19251  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_793
 19252  	jmp	.LBB2_793                       # presumably the scalar loop for leftover elements - TODO confirm
 19253  .LBB2_797:                              # tail step of a 16-bit multiply path: at most one more 16-element chunk
 19254  	xor	edi, edi                        # start at element index 0
 19255  .LBB2_798:
 19256  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19257  	je	.LBB2_800
 19258  # %bb.799:
 19259  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 19260  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 19261  	pmullw	xmm1, xmm0                      # low 16 bits of each product; xmm0 presumably the broadcast scalar - TODO confirm
 19262  	pmullw	xmm2, xmm0
 19263  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 19264  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 19265  .LBB2_800:
 19266  	cmp	rsi, r10
 19267  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_801
 19268  	jmp	.LBB2_801                       # presumably the scalar loop for leftover elements - TODO confirm
 19269  .LBB2_805:                              # tail step of a 16-bit multiply path: at most one more 16-element chunk
 19270  	xor	edi, edi                        # start at element index 0
 19271  .LBB2_806:
 19272  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19273  	je	.LBB2_808
 19274  # %bb.807:
 19275  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 19276  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 19277  	pmullw	xmm1, xmm0                      # low 16 bits of each product; xmm0 presumably the broadcast scalar - TODO confirm
 19278  	pmullw	xmm2, xmm0
 19279  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 19280  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 19281  .LBB2_808:
 19282  	cmp	rsi, r10
 19283  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_809
 19284  	jmp	.LBB2_809                       # presumably the scalar loop for leftover elements - TODO confirm
 19285  .LBB2_813:                              # tail step of a 16-bit multiply path: at most one more 16-element chunk
 19286  	xor	edi, edi                        # start at element index 0
 19287  .LBB2_814:
 19288  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19289  	je	.LBB2_816
 19290  # %bb.815:
 19291  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 19292  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 19293  	pmullw	xmm1, xmm0                      # low 16 bits of each product; xmm0 presumably the broadcast scalar - TODO confirm
 19294  	pmullw	xmm2, xmm0
 19295  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 19296  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 19297  .LBB2_816:
 19298  	cmp	rsi, r10
 19299  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_817
 19300  	jmp	.LBB2_817                       # presumably the scalar loop for leftover elements - TODO confirm
 19301  .LBB2_821:                              # tail step of a 16-bit multiply path: at most one more 16-element chunk
 19302  	xor	edi, edi                        # start at element index 0
 19303  .LBB2_822:
 19304  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19305  	je	.LBB2_824
 19306  # %bb.823:
 19307  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 19308  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 19309  	pmullw	xmm1, xmm0                      # low 16 bits of each product; xmm0 presumably the broadcast scalar - TODO confirm
 19310  	pmullw	xmm2, xmm0
 19311  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 19312  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 19313  .LBB2_824:
 19314  	cmp	rsi, r10
 19315  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_825
 19316  	jmp	.LBB2_825                       # presumably the scalar loop for leftover elements - TODO confirm
 19317  .LBB2_829:                              # tail step of a 16-bit add path: at most one more 16-element chunk
 19318  	xor	edi, edi                        # start at element index 0
 19319  .LBB2_830:
 19320  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19321  	je	.LBB2_832
 19322  # %bb.831:
 19323  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 19324  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 19325  	paddw	xmm1, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19326  	paddw	xmm2, xmm0
 19327  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 19328  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 19329  .LBB2_832:
 19330  	cmp	rsi, r10
 19331  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_833
 19332  	jmp	.LBB2_833                       # presumably the scalar loop for leftover elements - TODO confirm
 19333  .LBB2_837:                              # tail step of a 16-bit add path: at most one more 16-element chunk
 19334  	xor	edi, edi                        # start at element index 0
 19335  .LBB2_838:
 19336  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19337  	je	.LBB2_840
 19338  # %bb.839:
 19339  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 19340  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 19341  	paddw	xmm1, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19342  	paddw	xmm2, xmm0
 19343  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 19344  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 19345  .LBB2_840:
 19346  	cmp	rsi, r10
 19347  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_841
 19348  	jmp	.LBB2_841                       # presumably the scalar loop for leftover elements - TODO confirm
 19349  .LBB2_845:                              # tail step of a 16-bit subtract path: at most one more 16-element chunk
 19350  	xor	edi, edi                        # start at element index 0
 19351  .LBB2_846:
 19352  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19353  	je	.LBB2_848
 19354  # %bb.847:
 19355  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 19356  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 19357  	movdqa	xmm3, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19358  	psubw	xmm3, xmm1                      # xmm3 = xmm0 - first eight elements
 19359  	psubw	xmm0, xmm2                      # xmm0 is overwritten in place; not read again in this block
 19360  	movdqu	xmmword ptr [r8 + 2*rdi], xmm3
 19361  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
 19362  .LBB2_848:
 19363  	cmp	rsi, r10
 19364  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_849
 19365  	jmp	.LBB2_849                       # presumably the scalar loop for leftover elements - TODO confirm
 19366  .LBB2_853:                              # tail step of a 16-bit subtract path: at most one more 16-element chunk
 19367  	xor	edi, edi                        # start at element index 0
 19368  .LBB2_854:
 19369  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19370  	je	.LBB2_856
 19371  # %bb.855:
 19372  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 19373  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 19374  	movdqa	xmm3, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19375  	psubw	xmm3, xmm1                      # xmm3 = xmm0 - first eight elements
 19376  	psubw	xmm0, xmm2                      # xmm0 is overwritten in place; not read again in this block
 19377  	movdqu	xmmword ptr [r8 + 2*rdi], xmm3
 19378  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
 19379  .LBB2_856:
 19380  	cmp	rsi, r10
 19381  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_857
 19382  	jmp	.LBB2_857                       # presumably the scalar loop for leftover elements - TODO confirm
 19383  .LBB2_861:                              # tail step of a 16-bit add path: at most one more 16-element chunk
 19384  	xor	edi, edi                        # start at element index 0
 19385  .LBB2_862:
 19386  	test	r9b, 1                          # r9 odd (presumably the chunk count - TODO confirm): one chunk remains
 19387  	je	.LBB2_864
 19388  # %bb.863:
 19389  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 19390  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 19391  	paddw	xmm1, xmm0                      # xmm0 presumably holds the broadcast scalar - TODO confirm
 19392  	paddw	xmm2, xmm0
 19393  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 19394  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 19395  .LBB2_864:
 19396  	cmp	rsi, r10
 19397  	je	.LBB2_1069                      # rsi == r10: bypass .LBB2_865
 19398  	jmp	.LBB2_865                       # presumably the scalar loop for leftover elements - TODO confirm
 19399  .LBB2_869:
 19400  	xor	edi, edi
 19401  .LBB2_870:
 19402  	test	r9b, 1
 19403  	je	.LBB2_872
 19404  # %bb.871:
 19405  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 19406  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 19407  	paddw	xmm1, xmm0
 19408  	paddw	xmm2, xmm0
 19409  	movdqu	xmmword ptr [r8 + 2*rdi], xmm1
 19410  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm2
 19411  .LBB2_872:
 19412  	cmp	rsi, r10
 19413  	je	.LBB2_1069
 19414  	jmp	.LBB2_873
 19415  .LBB2_877:
 19416  	xor	edi, edi
 19417  .LBB2_878:
 19418  	test	r9b, 1
 19419  	je	.LBB2_880
 19420  # %bb.879:
 19421  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 19422  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 19423  	movdqa	xmm3, xmm0
 19424  	psubw	xmm3, xmm1
 19425  	psubw	xmm0, xmm2
 19426  	movdqu	xmmword ptr [r8 + 2*rdi], xmm3
 19427  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
 19428  .LBB2_880:
 19429  	cmp	rsi, r10
 19430  	je	.LBB2_1069
 19431  	jmp	.LBB2_881
 19432  .LBB2_885:
 19433  	xor	edi, edi
 19434  .LBB2_886:
 19435  	test	r9b, 1
 19436  	je	.LBB2_888
 19437  # %bb.887:
 19438  	movdqu	xmm1, xmmword ptr [rcx + 2*rdi]
 19439  	movdqu	xmm2, xmmword ptr [rcx + 2*rdi + 16]
 19440  	movdqa	xmm3, xmm0
 19441  	psubw	xmm3, xmm1
 19442  	psubw	xmm0, xmm2
 19443  	movdqu	xmmword ptr [r8 + 2*rdi], xmm3
 19444  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm0
 19445  .LBB2_888:
 19446  	cmp	rsi, r10
 19447  	je	.LBB2_1069
 19448  	jmp	.LBB2_889
 19449  .LBB2_893:
 19450  	xor	edi, edi
 19451  .LBB2_894:
 19452  	test	r9b, 1
 19453  	je	.LBB2_896
 19454  # %bb.895:
 19455  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
 19456  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 16]
 19457  	mulps	xmm2, xmm1
 19458  	mulps	xmm3, xmm1
 19459  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 19460  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 19461  .LBB2_896:
 19462  	cmp	rdx, rax
 19463  	je	.LBB2_1069
 19464  	jmp	.LBB2_897
 19465  .LBB2_901:
 19466  	xor	edi, edi
 19467  .LBB2_902:
 19468  	test	r9b, 1
 19469  	je	.LBB2_904
 19470  # %bb.903:
 19471  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
 19472  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 16]
 19473  	mulps	xmm2, xmm1
 19474  	mulps	xmm3, xmm1
 19475  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 19476  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 19477  .LBB2_904:
 19478  	cmp	rdx, rax
 19479  	je	.LBB2_1069
 19480  	jmp	.LBB2_905
 19481  .LBB2_909:
 19482  	xor	edi, edi
 19483  .LBB2_910:
 19484  	test	r9b, 1
 19485  	je	.LBB2_912
 19486  # %bb.911:
 19487  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi]
 19488  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 19489  	paddq	xmm1, xmm0
 19490  	paddq	xmm2, xmm0
 19491  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 19492  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 19493  .LBB2_912:
 19494  	cmp	rsi, r10
 19495  	je	.LBB2_1069
 19496  	jmp	.LBB2_913
 19497  .LBB2_917:
 19498  	xor	edi, edi
 19499  .LBB2_918:
 19500  	test	r9b, 1
 19501  	je	.LBB2_920
 19502  # %bb.919:
 19503  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
 19504  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 16]
 19505  	addps	xmm2, xmm1
 19506  	addps	xmm3, xmm1
 19507  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 19508  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 19509  .LBB2_920:
 19510  	cmp	rdx, rax
 19511  	je	.LBB2_1069
 19512  	jmp	.LBB2_921
 19513  .LBB2_925:
 19514  	xor	edi, edi
 19515  .LBB2_926:
 19516  	test	r9b, 1
 19517  	je	.LBB2_928
 19518  # %bb.927:
 19519  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi]
 19520  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 19521  	movdqa	xmm3, xmm0
 19522  	psubq	xmm3, xmm1
 19523  	psubq	xmm0, xmm2
 19524  	movdqu	xmmword ptr [r8 + 8*rdi], xmm3
 19525  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm0
 19526  .LBB2_928:
 19527  	cmp	rsi, r10
 19528  	je	.LBB2_1069
 19529  	jmp	.LBB2_929
 19530  .LBB2_933:
 19531  	xor	edi, edi
 19532  .LBB2_934:
 19533  	test	r9b, 1
 19534  	je	.LBB2_936
 19535  # %bb.935:
 19536  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
 19537  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 16]
 19538  	movaps	xmm4, xmm1
 19539  	subps	xmm4, xmm2
 19540  	subps	xmm1, xmm3
 19541  	movups	xmmword ptr [r8 + 4*rdi], xmm4
 19542  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm1
 19543  .LBB2_936:
 19544  	cmp	rdx, rax
 19545  	je	.LBB2_1069
 19546  	jmp	.LBB2_937
 19547  .LBB2_941:
 19548  	xor	edi, edi
 19549  .LBB2_942:
 19550  	test	r9b, 1
 19551  	je	.LBB2_944
 19552  # %bb.943:
 19553  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi]
 19554  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 19555  	paddq	xmm1, xmm0
 19556  	paddq	xmm2, xmm0
 19557  	movdqu	xmmword ptr [r8 + 8*rdi], xmm1
 19558  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm2
 19559  .LBB2_944:
 19560  	cmp	rsi, r10
 19561  	je	.LBB2_1069
 19562  	jmp	.LBB2_945
 19563  .LBB2_949:
 19564  	xor	edi, edi
 19565  .LBB2_950:
 19566  	test	r9b, 1
 19567  	je	.LBB2_952
 19568  # %bb.951:
 19569  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
 19570  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 16]
 19571  	addps	xmm2, xmm1
 19572  	addps	xmm3, xmm1
 19573  	movups	xmmword ptr [r8 + 4*rdi], xmm2
 19574  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 19575  .LBB2_952:
 19576  	cmp	rdx, rax
 19577  	je	.LBB2_1069
 19578  	jmp	.LBB2_953
 19579  .LBB2_957:
 19580  	xor	edi, edi
 19581  .LBB2_958:
 19582  	test	r9b, 1
 19583  	je	.LBB2_960
 19584  # %bb.959:
 19585  	movdqu	xmm1, xmmword ptr [rcx + 8*rdi]
 19586  	movdqu	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 19587  	movdqa	xmm3, xmm0
 19588  	psubq	xmm3, xmm1
 19589  	psubq	xmm0, xmm2
 19590  	movdqu	xmmword ptr [r8 + 8*rdi], xmm3
 19591  	movdqu	xmmword ptr [r8 + 8*rdi + 16], xmm0
 19592  .LBB2_960:
 19593  	cmp	rsi, r10
 19594  	je	.LBB2_1069
 19595  	jmp	.LBB2_961
 19596  .LBB2_965:
 19597  	xor	edi, edi
 19598  .LBB2_966:
 19599  	test	r9b, 1
 19600  	je	.LBB2_968
 19601  # %bb.967:
 19602  	movups	xmm2, xmmword ptr [rcx + 4*rdi]
 19603  	movups	xmm3, xmmword ptr [rcx + 4*rdi + 16]
 19604  	movaps	xmm4, xmm1
 19605  	subps	xmm4, xmm2
 19606  	subps	xmm1, xmm3
 19607  	movups	xmmword ptr [r8 + 4*rdi], xmm4
 19608  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm1
 19609  .LBB2_968:
 19610  	cmp	rdx, rax
 19611  	je	.LBB2_1069
 19612  	jmp	.LBB2_969
 19613  .LBB2_973:
 19614  	xor	eax, eax
 19615  .LBB2_974:
 19616  	test	r9b, 1
 19617  	je	.LBB2_976
 19618  # %bb.975:
 19619  	movdqu	xmm2, xmmword ptr [rcx + rax]
 19620  	movdqu	xmm3, xmmword ptr [rcx + rax + 16]
 19621  	movdqa	xmm4, xmm0
 19622  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19623  	pmovzxbw	xmm5, xmm2                      # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 19624  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19625  	pmullw	xmm2, xmm4
 19626  	movdqa	xmm4, xmmword ptr [rip + .LCPI2_0] # xmm4 = [255,255,255,255,255,255,255,255]
 19627  	pand	xmm2, xmm4
 19628  	pmullw	xmm5, xmm1
 19629  	pand	xmm5, xmm4
 19630  	packuswb	xmm5, xmm2
 19631  	punpckhbw	xmm0, xmm0              # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19632  	pmovzxbw	xmm2, xmm3                      # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 19633  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19634  	pmullw	xmm3, xmm0
 19635  	pand	xmm3, xmm4
 19636  	pmullw	xmm2, xmm1
 19637  	pand	xmm2, xmm4
 19638  	packuswb	xmm2, xmm3
 19639  	movdqu	xmmword ptr [r8 + rax], xmm5
 19640  	movdqu	xmmword ptr [r8 + rax + 16], xmm2
 19641  .LBB2_976:
 19642  	cmp	rdi, r10
 19643  	je	.LBB2_1069
 19644  	jmp	.LBB2_977
 19645  .LBB2_981:
 19646  	xor	eax, eax
 19647  .LBB2_982:
 19648  	test	r9b, 1
 19649  	je	.LBB2_984
 19650  # %bb.983:
 19651  	movdqu	xmm2, xmmword ptr [rcx + rax]
 19652  	movdqu	xmm3, xmmword ptr [rcx + rax + 16]
 19653  	movdqa	xmm4, xmm0
 19654  	punpckhbw	xmm4, xmm4              # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19655  	pmovzxbw	xmm5, xmm2                      # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 19656  	punpckhbw	xmm2, xmm2              # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19657  	pmullw	xmm2, xmm4
 19658  	movdqa	xmm4, xmmword ptr [rip + .LCPI2_0] # xmm4 = [255,255,255,255,255,255,255,255]
 19659  	pand	xmm2, xmm4
 19660  	pmullw	xmm5, xmm1
 19661  	pand	xmm5, xmm4
 19662  	packuswb	xmm5, xmm2
 19663  	punpckhbw	xmm0, xmm0              # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19664  	pmovzxbw	xmm2, xmm3                      # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 19665  	punpckhbw	xmm3, xmm3              # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 19666  	pmullw	xmm3, xmm0
 19667  	pand	xmm3, xmm4
 19668  	pmullw	xmm2, xmm1
 19669  	pand	xmm2, xmm4
 19670  	packuswb	xmm2, xmm3
 19671  	movdqu	xmmword ptr [r8 + rax], xmm5
 19672  	movdqu	xmmword ptr [r8 + rax + 16], xmm2
 19673  .LBB2_984:
 19674  	cmp	rdi, r10
 19675  	je	.LBB2_1069
 19676  	jmp	.LBB2_985
 19677  .LBB2_989:
 19678  	xor	edi, edi
 19679  .LBB2_990:
 19680  	test	r9b, 1
 19681  	je	.LBB2_992
 19682  # %bb.991:
 19683  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 19684  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 19685  	paddb	xmm1, xmm0
 19686  	paddb	xmm2, xmm0
 19687  	movdqu	xmmword ptr [r8 + rdi], xmm1
 19688  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 19689  .LBB2_992:
 19690  	cmp	rsi, r10
 19691  	je	.LBB2_1069
 19692  	jmp	.LBB2_993
 19693  .LBB2_997:
 19694  	xor	edi, edi
 19695  .LBB2_998:
 19696  	test	r9b, 1
 19697  	je	.LBB2_1000
 19698  # %bb.999:
 19699  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 19700  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 19701  	movdqa	xmm3, xmm0
 19702  	psubb	xmm3, xmm1
 19703  	psubb	xmm0, xmm2
 19704  	movdqu	xmmword ptr [r8 + rdi], xmm3
 19705  	movdqu	xmmword ptr [r8 + rdi + 16], xmm0
 19706  .LBB2_1000:
 19707  	cmp	rsi, r10
 19708  	je	.LBB2_1069
 19709  	jmp	.LBB2_1001
 19710  .LBB2_1005:
 19711  	xor	edi, edi
 19712  .LBB2_1006:
 19713  	test	r9b, 1
 19714  	je	.LBB2_1008
 19715  # %bb.1007:
 19716  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 19717  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 19718  	paddb	xmm1, xmm0
 19719  	paddb	xmm2, xmm0
 19720  	movdqu	xmmword ptr [r8 + rdi], xmm1
 19721  	movdqu	xmmword ptr [r8 + rdi + 16], xmm2
 19722  .LBB2_1008:
 19723  	cmp	rsi, r10
 19724  	je	.LBB2_1069
 19725  	jmp	.LBB2_1009
 19726  .LBB2_1013:
 19727  	xor	edi, edi
 19728  .LBB2_1014:
 19729  	test	r9b, 1
 19730  	je	.LBB2_1016
 19731  # %bb.1015:
 19732  	movdqu	xmm1, xmmword ptr [rcx + rdi]
 19733  	movdqu	xmm2, xmmword ptr [rcx + rdi + 16]
 19734  	movdqa	xmm3, xmm0
 19735  	psubb	xmm3, xmm1
 19736  	psubb	xmm0, xmm2
 19737  	movdqu	xmmword ptr [r8 + rdi], xmm3
 19738  	movdqu	xmmword ptr [r8 + rdi + 16], xmm0
 19739  .LBB2_1016:
 19740  	cmp	rsi, r10
 19741  	je	.LBB2_1069
 19742  	jmp	.LBB2_1017
 19743  .LBB2_1021:
 19744  	xor	edi, edi
 19745  .LBB2_1022:
 19746  	test	r9b, 1
 19747  	je	.LBB2_1024
 19748  # %bb.1023:
 19749  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 19750  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 19751  	pmulld	xmm1, xmm0
 19752  	pmulld	xmm2, xmm0
 19753  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 19754  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 19755  .LBB2_1024:
 19756  	cmp	rsi, r10
 19757  	je	.LBB2_1069
 19758  	jmp	.LBB2_1025
 19759  .LBB2_1029:
 19760  	xor	edi, edi
 19761  .LBB2_1030:
 19762  	test	r9b, 1
 19763  	je	.LBB2_1032
 19764  # %bb.1031:
 19765  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 19766  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 19767  	pmulld	xmm1, xmm0
 19768  	pmulld	xmm2, xmm0
 19769  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 19770  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 19771  .LBB2_1032:
 19772  	cmp	rsi, r10
 19773  	je	.LBB2_1069
 19774  	jmp	.LBB2_1033
 19775  .LBB2_1037:
 19776  	xor	edi, edi
 19777  .LBB2_1038:
 19778  	test	r9b, 1
 19779  	je	.LBB2_1040
 19780  # %bb.1039:
 19781  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 19782  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 19783  	paddd	xmm1, xmm0
 19784  	paddd	xmm2, xmm0
 19785  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 19786  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 19787  .LBB2_1040:
 19788  	cmp	rsi, r10
 19789  	je	.LBB2_1069
 19790  	jmp	.LBB2_1041
 19791  .LBB2_1045:
 19792  	xor	edi, edi
 19793  .LBB2_1046:
 19794  	test	r9b, 1
 19795  	je	.LBB2_1048
 19796  # %bb.1047:
 19797  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 19798  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 19799  	movdqa	xmm3, xmm0
 19800  	psubd	xmm3, xmm1
 19801  	psubd	xmm0, xmm2
 19802  	movdqu	xmmword ptr [r8 + 4*rdi], xmm3
 19803  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
 19804  .LBB2_1048:
 19805  	cmp	rsi, r10
 19806  	je	.LBB2_1069
 19807  	jmp	.LBB2_1049
 19808  .LBB2_1053:
 19809  	xor	edi, edi
 19810  .LBB2_1054:
 19811  	test	r9b, 1
 19812  	je	.LBB2_1056
 19813  # %bb.1055:
 19814  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 19815  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 19816  	paddd	xmm1, xmm0
 19817  	paddd	xmm2, xmm0
 19818  	movdqu	xmmword ptr [r8 + 4*rdi], xmm1
 19819  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm2
 19820  .LBB2_1056:
 19821  	cmp	rsi, r10
 19822  	je	.LBB2_1069
 19823  	jmp	.LBB2_1057
 19824  .LBB2_1061:
 19825  	xor	edi, edi
 19826  .LBB2_1062:
 19827  	test	r9b, 1
 19828  	je	.LBB2_1064
 19829  # %bb.1063:
 19830  	movdqu	xmm1, xmmword ptr [rcx + 4*rdi]
 19831  	movdqu	xmm2, xmmword ptr [rcx + 4*rdi + 16]
 19832  	movdqa	xmm3, xmm0
 19833  	psubd	xmm3, xmm1
 19834  	psubd	xmm0, xmm2
 19835  	movdqu	xmmword ptr [r8 + 4*rdi], xmm3
 19836  	movdqu	xmmword ptr [r8 + 4*rdi + 16], xmm0
 19837  .LBB2_1064:
 19838  	cmp	rsi, r10
 19839  	je	.LBB2_1069
 19840  	jmp	.LBB2_1065
 19841  .Lfunc_end2:
 19842  	.size	arithmetic_scalar_arr_sse4, .Lfunc_end2-arithmetic_scalar_arr_sse4
 19843                                          # -- End function
 19844  	.section	.rodata.cst16,"aM",@progbits,16
 19845  	.p2align	4                               # -- Begin function arithmetic_unary_same_types_sse4
 19846  .LCPI3_0:                               # f64 sign-bit mask (x2): XORed into a double to negate it, ANDed to isolate its sign
 19847  	.quad	0x8000000000000000              # double -0
 19848  	.quad	0x8000000000000000              # double -0
 19849  .LCPI3_1:                               # packed f64 1.0 (x2); NOTE(review): use site not visible here, presumably the vector twin of .LCPI3_2 - confirm
 19850  	.quad	0x3ff0000000000000              # double 1
 19851  	.quad	0x3ff0000000000000              # double 1
 19852  .LCPI3_3:                               # 32-bit integer 1 in each of 4 lanes; NOTE(review): use site not visible here
 19853  	.long	1                               # 0x1
 19854  	.long	1                               # 0x1
 19855  	.long	1                               # 0x1
 19856  	.long	1                               # 0x1
 19857  .LCPI3_4:                               # 64-bit integer 1 in each of 2 lanes; NOTE(review): use site not visible here
 19858  	.quad	1                               # 0x1
 19859  	.quad	1                               # 0x1
 19860  .LCPI3_5:                               # 16-bit integer 1 in each of 8 lanes; NOTE(review): use site not visible here
 19861  	.short	1                               # 0x1
 19862  	.short	1                               # 0x1
 19863  	.short	1                               # 0x1
 19864  	.short	1                               # 0x1
 19865  	.short	1                               # 0x1
 19866  	.short	1                               # 0x1
 19867  	.short	1                               # 0x1
 19868  	.short	1                               # 0x1
 19869  .LCPI3_6:                               # byte-lane counterpart of the all-ones-value constants above
 19870  	.zero	16,1                            # 16 bytes, each filled with the value 1 (second operand is the fill byte)
 19871  .LCPI3_7:                               # f32 sign-bit mask (x4), the 32-bit analogue of .LCPI3_0; NOTE(review): use site not visible here
 19872  	.long	0x80000000                      # float -0
 19873  	.long	0x80000000                      # float -0
 19874  	.long	0x80000000                      # float -0
 19875  	.long	0x80000000                      # float -0
 19876  .LCPI3_8:                               # every bit except bit 63 (x2); same value the scalar loops load with movabs and AND into each 64-bit element
 19877  	.quad	9223372036854775807             # 0x7fffffffffffffff
 19878  	.quad	9223372036854775807             # 0x7fffffffffffffff
 19879  .LCPI3_9:                               # every bit except bit 31 in each of 4 dword lanes; NOTE(review): use site not visible here
 19880  	.long	2147483647                      # 0x7fffffff
 19881  	.long	2147483647                      # 0x7fffffff
 19882  	.long	2147483647                      # 0x7fffffff
 19883  	.long	2147483647                      # 0x7fffffff
 19884  .LCPI3_10:                              # bytes FF,00,00,00 repeated = 0x000000ff per little-endian dword (low-byte mask per 32-bit lane); NOTE(review): use site not visible here
 19885  	.byte	255                             # 0xff
 19886  	.byte	0                               # 0x0
 19887  	.byte	0                               # 0x0
 19888  	.byte	0                               # 0x0
 19889  	.byte	255                             # 0xff
 19890  	.byte	0                               # 0x0
 19891  	.byte	0                               # 0x0
 19892  	.byte	0                               # 0x0
 19893  	.byte	255                             # 0xff
 19894  	.byte	0                               # 0x0
 19895  	.byte	0                               # 0x0
 19896  	.byte	0                               # 0x0
 19897  	.byte	255                             # 0xff
 19898  	.byte	0                               # 0x0
 19899  	.byte	0                               # 0x0
 19900  	.byte	0                               # 0x0
 19901  	.section	.rodata.cst8,"aM",@progbits,8
 19902  	.p2align	3                               # 8-byte alignment for the single 8-byte constant that follows
 19903  .LCPI3_2:                               # scalar f64 1.0: ORed with the input's isolated sign bit to build +/-1.0 in the scalar sign loops
 19904  	.quad	0x3ff0000000000000              # double 1
 19905  	.text
 19906  	.globl	arithmetic_unary_same_types_sse4
 19907  	.p2align	4, 0x90
 19908  	.type	arithmetic_unary_same_types_sse4,@function
 19909  arithmetic_unary_same_types_sse4:       # @arithmetic_unary_same_types_sse4
 19910  # %bb.0:
 19911  	push	rbp
 19912  	mov	rbp, rsp
 19913  	and	rsp, -8
 19914  	cmp	sil, 19
 19915  	jle	.LBB3_12
 19916  # %bb.1:
 19917  	cmp	sil, 20
 19918  	je	.LBB3_22
 19919  # %bb.2:
 19920  	cmp	sil, 25
 19921  	je	.LBB3_30
 19922  # %bb.3:
 19923  	cmp	sil, 26
 19924  	jne	.LBB3_923
 19925  # %bb.4:
 19926  	cmp	edi, 6
 19927  	jg	.LBB3_46
 19928  # %bb.5:
 19929  	cmp	edi, 3
 19930  	jle	.LBB3_81
 19931  # %bb.6:
 19932  	cmp	edi, 4
 19933  	je	.LBB3_131
 19934  # %bb.7:
 19935  	cmp	edi, 5
 19936  	je	.LBB3_134
 19937  # %bb.8:
 19938  	cmp	edi, 6
 19939  	jne	.LBB3_923
 19940  # %bb.9:
 19941  	test	r8d, r8d
 19942  	jle	.LBB3_923
 19943  # %bb.10:
 19944  	mov	r9d, r8d
 19945  	cmp	r8d, 8
 19946  	jae	.LBB3_221
 19947  # %bb.11:
 19948  	xor	edx, edx
 19949  	jmp	.LBB3_373
 19950  .LBB3_12:
 19951  	cmp	sil, 4
 19952  	je	.LBB3_38
 19953  # %bb.13:
 19954  	cmp	sil, 5
 19955  	jne	.LBB3_923
 19956  # %bb.14:
 19957  	cmp	edi, 6
 19958  	jg	.LBB3_53
 19959  # %bb.15:
 19960  	cmp	edi, 3
 19961  	jle	.LBB3_86
 19962  # %bb.16:
 19963  	cmp	edi, 4
 19964  	je	.LBB3_137
 19965  # %bb.17:
 19966  	cmp	edi, 5
 19967  	je	.LBB3_140
 19968  # %bb.18:
 19969  	cmp	edi, 6
 19970  	jne	.LBB3_923
 19971  # %bb.19:
 19972  	test	r8d, r8d
 19973  	jle	.LBB3_923
 19974  # %bb.20:
 19975  	mov	r9d, r8d
 19976  	cmp	r8d, 8
 19977  	jb	.LBB3_21
 19978  # %bb.223:
 19979  	lea	rax, [rdx + 4*r9]
 19980  	cmp	rax, rcx
 19981  	jbe	.LBB3_374
 19982  # %bb.224:
 19983  	lea	rax, [rcx + 4*r9]
 19984  	cmp	rax, rdx
 19985  	jbe	.LBB3_374
 19986  .LBB3_21:
 19987  	xor	esi, esi
 19988  .LBB3_614:
 19989  	mov	r8, rsi
 19990  	not	r8
 19991  	add	r8, r9
 19992  	mov	rdi, r9
 19993  	and	rdi, 3
 19994  	je	.LBB3_616
 19995  .LBB3_615:                              # =>This Inner Loop Header: Depth=1
 19996  	xor	eax, eax
 19997  	sub	eax, dword ptr [rdx + 4*rsi]
 19998  	mov	dword ptr [rcx + 4*rsi], eax
 19999  	add	rsi, 1
 20000  	add	rdi, -1
 20001  	jne	.LBB3_615
 20002  .LBB3_616:
 20003  	cmp	r8, 3
 20004  	jb	.LBB3_923
 20005  .LBB3_617:                              # =>This Inner Loop Header: Depth=1
 20006  	xor	eax, eax
 20007  	sub	eax, dword ptr [rdx + 4*rsi]
 20008  	mov	dword ptr [rcx + 4*rsi], eax
 20009  	xor	eax, eax
 20010  	sub	eax, dword ptr [rdx + 4*rsi + 4]
 20011  	mov	dword ptr [rcx + 4*rsi + 4], eax
 20012  	xor	eax, eax
 20013  	sub	eax, dword ptr [rdx + 4*rsi + 8]
 20014  	mov	dword ptr [rcx + 4*rsi + 8], eax
 20015  	xor	eax, eax
 20016  	sub	eax, dword ptr [rdx + 4*rsi + 12]
 20017  	mov	dword ptr [rcx + 4*rsi + 12], eax
 20018  	add	rsi, 4
 20019  	cmp	r9, rsi
 20020  	jne	.LBB3_617
 20021  	jmp	.LBB3_923
 20022  .LBB3_22:
 20023  	cmp	edi, 6
 20024  	jg	.LBB3_60
 20025  # %bb.23:
 20026  	cmp	edi, 3
 20027  	jle	.LBB3_91
 20028  # %bb.24:
 20029  	cmp	edi, 4
 20030  	je	.LBB3_143
 20031  # %bb.25:
 20032  	cmp	edi, 5
 20033  	je	.LBB3_146
 20034  # %bb.26:
 20035  	cmp	edi, 6
 20036  	jne	.LBB3_923
 20037  # %bb.27:
 20038  	test	r8d, r8d
 20039  	jle	.LBB3_923
 20040  # %bb.28:
 20041  	mov	r9d, r8d
 20042  	cmp	r8d, 8
 20043  	jb	.LBB3_29
 20044  # %bb.226:
 20045  	lea	rax, [rdx + 4*r9]
 20046  	cmp	rax, rcx
 20047  	jbe	.LBB3_377
 20048  # %bb.227:
 20049  	lea	rax, [rcx + 4*r9]
 20050  	cmp	rax, rdx
 20051  	jbe	.LBB3_377
 20052  .LBB3_29:
 20053  	xor	esi, esi
 20054  .LBB3_622:
 20055  	mov	r8, rsi
 20056  	not	r8
 20057  	add	r8, r9
 20058  	mov	rdi, r9
 20059  	and	rdi, 3
 20060  	je	.LBB3_624
 20061  .LBB3_623:                              # =>This Inner Loop Header: Depth=1
 20062  	xor	eax, eax
 20063  	cmp	dword ptr [rdx + 4*rsi], 0
 20064  	setne	al
 20065  	mov	dword ptr [rcx + 4*rsi], eax
 20066  	add	rsi, 1
 20067  	add	rdi, -1
 20068  	jne	.LBB3_623
 20069  .LBB3_624:
 20070  	cmp	r8, 3
 20071  	jb	.LBB3_923
 20072  .LBB3_625:                              # =>This Inner Loop Header: Depth=1
 20073  	xor	eax, eax
 20074  	cmp	dword ptr [rdx + 4*rsi], 0
 20075  	setne	al
 20076  	mov	dword ptr [rcx + 4*rsi], eax
 20077  	xor	eax, eax
 20078  	cmp	dword ptr [rdx + 4*rsi + 4], 0
 20079  	setne	al
 20080  	mov	dword ptr [rcx + 4*rsi + 4], eax
 20081  	xor	eax, eax
 20082  	cmp	dword ptr [rdx + 4*rsi + 8], 0
 20083  	setne	al
 20084  	mov	dword ptr [rcx + 4*rsi + 8], eax
 20085  	xor	eax, eax
 20086  	cmp	dword ptr [rdx + 4*rsi + 12], 0
 20087  	setne	al
 20088  	mov	dword ptr [rcx + 4*rsi + 12], eax
 20089  	add	rsi, 4
 20090  	cmp	r9, rsi
 20091  	jne	.LBB3_625
 20092  	jmp	.LBB3_923
 20093  .LBB3_30:
 20094  	cmp	edi, 6
 20095  	jg	.LBB3_67
 20096  # %bb.31:
 20097  	cmp	edi, 3
 20098  	jle	.LBB3_96
 20099  # %bb.32:
 20100  	cmp	edi, 4
 20101  	je	.LBB3_149
 20102  # %bb.33:
 20103  	cmp	edi, 5
 20104  	je	.LBB3_152
 20105  # %bb.34:
 20106  	cmp	edi, 6
 20107  	jne	.LBB3_923
 20108  # %bb.35:
 20109  	test	r8d, r8d
 20110  	jle	.LBB3_923
 20111  # %bb.36:
 20112  	mov	r9d, r8d
 20113  	cmp	r8d, 8
 20114  	jb	.LBB3_37
 20115  # %bb.229:
 20116  	lea	rax, [rdx + 4*r9]
 20117  	cmp	rax, rcx
 20118  	jbe	.LBB3_380
 20119  # %bb.230:
 20120  	lea	rax, [rcx + 4*r9]
 20121  	cmp	rax, rdx
 20122  	jbe	.LBB3_380
 20123  .LBB3_37:
 20124  	xor	esi, esi
 20125  .LBB3_536:
 20126  	mov	r8, rsi
 20127  	not	r8
 20128  	add	r8, r9
 20129  	mov	rdi, r9
 20130  	and	rdi, 3
 20131  	je	.LBB3_538
 20132  .LBB3_537:                              # =>This Inner Loop Header: Depth=1
 20133  	mov	eax, dword ptr [rdx + 4*rsi]
 20134  	mov	dword ptr [rcx + 4*rsi], eax
 20135  	add	rsi, 1
 20136  	add	rdi, -1
 20137  	jne	.LBB3_537
 20138  .LBB3_538:
 20139  	cmp	r8, 3
 20140  	jb	.LBB3_923
 20141  .LBB3_539:                              # =>This Inner Loop Header: Depth=1
 20142  	mov	eax, dword ptr [rdx + 4*rsi]
 20143  	mov	dword ptr [rcx + 4*rsi], eax
 20144  	mov	eax, dword ptr [rdx + 4*rsi + 4]
 20145  	mov	dword ptr [rcx + 4*rsi + 4], eax
 20146  	mov	eax, dword ptr [rdx + 4*rsi + 8]
 20147  	mov	dword ptr [rcx + 4*rsi + 8], eax
 20148  	mov	eax, dword ptr [rdx + 4*rsi + 12]
 20149  	mov	dword ptr [rcx + 4*rsi + 12], eax
 20150  	add	rsi, 4
 20151  	cmp	r9, rsi
 20152  	jne	.LBB3_539
 20153  	jmp	.LBB3_923
 20154  .LBB3_38:
 20155  	cmp	edi, 6
 20156  	jg	.LBB3_74
 20157  # %bb.39:
 20158  	cmp	edi, 3
 20159  	jle	.LBB3_101
 20160  # %bb.40:
 20161  	cmp	edi, 4
 20162  	je	.LBB3_155
 20163  # %bb.41:
 20164  	cmp	edi, 5
 20165  	je	.LBB3_158
 20166  # %bb.42:
 20167  	cmp	edi, 6
 20168  	jne	.LBB3_923
 20169  # %bb.43:
 20170  	test	r8d, r8d
 20171  	jle	.LBB3_923
 20172  # %bb.44:
 20173  	mov	r9d, r8d
 20174  	cmp	r8d, 8
 20175  	jb	.LBB3_45
 20176  # %bb.232:
 20177  	lea	rax, [rdx + 4*r9]
 20178  	cmp	rax, rcx
 20179  	jbe	.LBB3_382
 20180  # %bb.233:
 20181  	lea	rax, [rcx + 4*r9]
 20182  	cmp	rax, rdx
 20183  	jbe	.LBB3_382
 20184  .LBB3_45:
 20185  	xor	esi, esi
 20186  .LBB3_546:
 20187  	mov	r8, rsi
 20188  	not	r8
 20189  	add	r8, r9
 20190  	mov	rdi, r9
 20191  	and	rdi, 3
 20192  	je	.LBB3_548
 20193  .LBB3_547:                              # =>This Inner Loop Header: Depth=1
 20194  	mov	eax, dword ptr [rdx + 4*rsi]
 20195  	mov	dword ptr [rcx + 4*rsi], eax
 20196  	add	rsi, 1
 20197  	add	rdi, -1
 20198  	jne	.LBB3_547
 20199  .LBB3_548:
 20200  	cmp	r8, 3
 20201  	jb	.LBB3_923
 20202  .LBB3_549:                              # =>This Inner Loop Header: Depth=1
 20203  	mov	eax, dword ptr [rdx + 4*rsi]
 20204  	mov	dword ptr [rcx + 4*rsi], eax
 20205  	mov	eax, dword ptr [rdx + 4*rsi + 4]
 20206  	mov	dword ptr [rcx + 4*rsi + 4], eax
 20207  	mov	eax, dword ptr [rdx + 4*rsi + 8]
 20208  	mov	dword ptr [rcx + 4*rsi + 8], eax
 20209  	mov	eax, dword ptr [rdx + 4*rsi + 12]
 20210  	mov	dword ptr [rcx + 4*rsi + 12], eax
 20211  	add	rsi, 4
 20212  	cmp	r9, rsi
 20213  	jne	.LBB3_549
 20214  	jmp	.LBB3_923
 20215  .LBB3_46:
 20216  	cmp	edi, 8
 20217  	jle	.LBB3_106
 20218  # %bb.47:
 20219  	cmp	edi, 9
 20220  	je	.LBB3_161
 20221  # %bb.48:
 20222  	cmp	edi, 11
 20223  	je	.LBB3_164
 20224  # %bb.49:
 20225  	cmp	edi, 12
 20226  	jne	.LBB3_923
 20227  # %bb.50:
 20228  	test	r8d, r8d
 20229  	jle	.LBB3_923
 20230  # %bb.51:
 20231  	mov	r9d, r8d
 20232  	cmp	r8d, 4
 20233  	jb	.LBB3_52
 20234  # %bb.235:
 20235  	lea	rax, [rdx + 8*r9]
 20236  	cmp	rax, rcx
 20237  	jbe	.LBB3_384
 20238  # %bb.236:
 20239  	lea	rax, [rcx + 8*r9]
 20240  	cmp	rax, rdx
 20241  	jbe	.LBB3_384
 20242  .LBB3_52:
 20243  	xor	esi, esi
 20244  .LBB3_630:
 20245  	mov	rax, rsi
 20246  	not	rax
 20247  	add	rax, r9
 20248  	mov	rdi, r9
 20249  	and	rdi, 3
 20250  	je	.LBB3_633
 20251  # %bb.631:
 20252  	movapd	xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0]
 20253  .LBB3_632:                              # =>This Inner Loop Header: Depth=1
 20254  	movsd	xmm1, qword ptr [rdx + 8*rsi]   # xmm1 = mem[0],zero
 20255  	xorpd	xmm1, xmm0
 20256  	movlpd	qword ptr [rcx + 8*rsi], xmm1
 20257  	add	rsi, 1
 20258  	add	rdi, -1
 20259  	jne	.LBB3_632
 20260  .LBB3_633:
 20261  	cmp	rax, 3
 20262  	jb	.LBB3_923
 20263  # %bb.634:
 20264  	movapd	xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0]
 20265  .LBB3_635:                              # =>This Inner Loop Header: Depth=1
 20266  	movsd	xmm1, qword ptr [rdx + 8*rsi]   # xmm1 = mem[0],zero
 20267  	xorpd	xmm1, xmm0
 20268  	movlpd	qword ptr [rcx + 8*rsi], xmm1
 20269  	movsd	xmm1, qword ptr [rdx + 8*rsi + 8] # xmm1 = mem[0],zero
 20270  	xorpd	xmm1, xmm0
 20271  	movlpd	qword ptr [rcx + 8*rsi + 8], xmm1
 20272  	movsd	xmm1, qword ptr [rdx + 8*rsi + 16] # xmm1 = mem[0],zero
 20273  	xorpd	xmm1, xmm0
 20274  	movlpd	qword ptr [rcx + 8*rsi + 16], xmm1
 20275  	movsd	xmm1, qword ptr [rdx + 8*rsi + 24] # xmm1 = mem[0],zero
 20276  	xorpd	xmm1, xmm0
 20277  	movlpd	qword ptr [rcx + 8*rsi + 24], xmm1
 20278  	add	rsi, 4
 20279  	cmp	r9, rsi
 20280  	jne	.LBB3_635
 20281  	jmp	.LBB3_923
 20282  .LBB3_53:
 20283  	cmp	edi, 8
 20284  	jle	.LBB3_111
 20285  # %bb.54:
 20286  	cmp	edi, 9
 20287  	je	.LBB3_167
 20288  # %bb.55:
 20289  	cmp	edi, 11
 20290  	je	.LBB3_170
 20291  # %bb.56:
 20292  	cmp	edi, 12
 20293  	jne	.LBB3_923
 20294  # %bb.57:
 20295  	test	r8d, r8d
 20296  	jle	.LBB3_923
 20297  # %bb.58:
 20298  	mov	r9d, r8d
 20299  	cmp	r8d, 4
 20300  	jb	.LBB3_59
 20301  # %bb.238:
 20302  	lea	rax, [rdx + 8*r9]
 20303  	cmp	rax, rcx
 20304  	jbe	.LBB3_387
 20305  # %bb.239:
 20306  	lea	rax, [rcx + 8*r9]
 20307  	cmp	rax, rdx
 20308  	jbe	.LBB3_387
 20309  .LBB3_59:
 20310  	xor	esi, esi
 20311  .LBB3_640:
 20312  	mov	rax, rsi
 20313  	not	rax
 20314  	add	rax, r9
 20315  	mov	rdi, r9
 20316  	and	rdi, 3
 20317  	je	.LBB3_643
 20318  # %bb.641:
 20319  	movapd	xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0]
 20320  .LBB3_642:                              # =>This Inner Loop Header: Depth=1
 20321  	movsd	xmm1, qword ptr [rdx + 8*rsi]   # xmm1 = mem[0],zero
 20322  	xorpd	xmm1, xmm0
 20323  	movlpd	qword ptr [rcx + 8*rsi], xmm1
 20324  	add	rsi, 1
 20325  	add	rdi, -1
 20326  	jne	.LBB3_642
 20327  .LBB3_643:
 20328  	cmp	rax, 3
 20329  	jb	.LBB3_923
 20330  # %bb.644:
 20331  	movapd	xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0]
 20332  .LBB3_645:                              # =>This Inner Loop Header: Depth=1
 20333  	movsd	xmm1, qword ptr [rdx + 8*rsi]   # xmm1 = mem[0],zero
 20334  	xorpd	xmm1, xmm0
 20335  	movlpd	qword ptr [rcx + 8*rsi], xmm1
 20336  	movsd	xmm1, qword ptr [rdx + 8*rsi + 8] # xmm1 = mem[0],zero
 20337  	xorpd	xmm1, xmm0
 20338  	movlpd	qword ptr [rcx + 8*rsi + 8], xmm1
 20339  	movsd	xmm1, qword ptr [rdx + 8*rsi + 16] # xmm1 = mem[0],zero
 20340  	xorpd	xmm1, xmm0
 20341  	movlpd	qword ptr [rcx + 8*rsi + 16], xmm1
 20342  	movsd	xmm1, qword ptr [rdx + 8*rsi + 24] # xmm1 = mem[0],zero
 20343  	xorpd	xmm1, xmm0
 20344  	movlpd	qword ptr [rcx + 8*rsi + 24], xmm1
 20345  	add	rsi, 4
 20346  	cmp	r9, rsi
 20347  	jne	.LBB3_645
 20348  	jmp	.LBB3_923
 20349  .LBB3_60:
 20350  	cmp	edi, 8
 20351  	jle	.LBB3_116
 20352  # %bb.61:
 20353  	cmp	edi, 9
 20354  	je	.LBB3_173
 20355  # %bb.62:
 20356  	cmp	edi, 11
 20357  	je	.LBB3_176
 20358  # %bb.63:
 20359  	cmp	edi, 12
 20360  	jne	.LBB3_923
 20361  # %bb.64:
 20362  	test	r8d, r8d
 20363  	jle	.LBB3_923
 20364  # %bb.65:
 20365  	mov	r9d, r8d
 20366  	cmp	r8d, 4
 20367  	jb	.LBB3_66
 20368  # %bb.241:
 20369  	lea	rax, [rdx + 8*r9]
 20370  	cmp	rax, rcx
 20371  	jbe	.LBB3_390
 20372  # %bb.242:
 20373  	lea	rax, [rcx + 8*r9]
 20374  	cmp	rax, rdx
 20375  	jbe	.LBB3_390
 20376  .LBB3_66:
 20377  	xor	esi, esi
 20378  .LBB3_650:
 20379  	mov	rax, rsi
 20380  	not	rax
 20381  	test	r9b, 1
 20382  	je	.LBB3_652
 20383  # %bb.651:
 20384  	movsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
 20385  	movapd	xmm1, xmmword ptr [rip + .LCPI3_0] # xmm1 = [-0.0E+0,-0.0E+0]
 20386  	andpd	xmm1, xmm0
 20387  	movsd	xmm2, qword ptr [rip + .LCPI3_2] # xmm2 = mem[0],zero
 20388  	orpd	xmm2, xmm1
 20389  	xorpd	xmm1, xmm1
 20390  	cmpeqsd	xmm1, xmm0
 20391  	andnpd	xmm1, xmm2
 20392  	movlpd	qword ptr [rcx + 8*rsi], xmm1
 20393  	or	rsi, 1
 20394  .LBB3_652:
 20395  	add	rax, r9
 20396  	je	.LBB3_923
 20397  # %bb.653:
 20398  	movapd	xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0]
 20399  	movsd	xmm1, qword ptr [rip + .LCPI3_2] # xmm1 = mem[0],zero
 20400  	xorpd	xmm2, xmm2
 20401  .LBB3_654:                              # =>This Inner Loop Header: Depth=1
 20402  	movsd	xmm3, qword ptr [rdx + 8*rsi]   # xmm3 = mem[0],zero
 20403  	movapd	xmm4, xmm3
 20404  	andpd	xmm4, xmm0
 20405  	orpd	xmm4, xmm1
 20406  	cmpeqsd	xmm3, xmm2
 20407  	andnpd	xmm3, xmm4
 20408  	movlpd	qword ptr [rcx + 8*rsi], xmm3
 20409  	movsd	xmm3, qword ptr [rdx + 8*rsi + 8] # xmm3 = mem[0],zero
 20410  	movapd	xmm4, xmm3
 20411  	andpd	xmm4, xmm0
 20412  	orpd	xmm4, xmm1
 20413  	cmpeqsd	xmm3, xmm2
 20414  	andnpd	xmm3, xmm4
 20415  	movlpd	qword ptr [rcx + 8*rsi + 8], xmm3
 20416  	add	rsi, 2
 20417  	cmp	r9, rsi
 20418  	jne	.LBB3_654
 20419  	jmp	.LBB3_923
 20420  .LBB3_67:  # arm: edi <= 8 -> .LBB3_121, 9 -> .LBB3_179, 11 -> .LBB3_182, 12 -> handled below, anything else -> .LBB3_923
 20421  	cmp	edi, 8
 20422  	jle	.LBB3_121
 20423  # %bb.68:
 20424  	cmp	edi, 9
 20425  	je	.LBB3_179
 20426  # %bb.69:
 20427  	cmp	edi, 11
 20428  	je	.LBB3_182
 20429  # %bb.70:
 20430  	cmp	edi, 12
 20431  	jne	.LBB3_923
 20432  # %bb.71:
 20433  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 20434  	jle	.LBB3_923
 20435  # %bb.72:
 20436  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 20437  	cmp	r8d, 4
 20438  	jb	.LBB3_73  # fewer than 4 elements: scalar code only
 20439  # %bb.244:
 20440  	lea	rax, [rdx + 8*r9]  # rax = end of the source range (rdx + 8*count)
 20441  	cmp	rax, rcx
 20442  	jbe	.LBB3_393  # source ends at or before destination: no overlap. NOTE(review): .LBB3_393 is outside this excerpt, presumably the vectorized path - confirm
 20443  # %bb.245:
 20444  	lea	rax, [rcx + 8*r9]  # rax = end of the destination range (rcx + 8*count)
 20445  	cmp	rax, rdx
 20446  	jbe	.LBB3_393  # destination ends at or before source: no overlap
 20447  .LBB3_73:  # ranges overlap, or count < 4: start the scalar code at index 0
 20448  	xor	esi, esi  # rsi = element index
 20449  .LBB3_659:
 20450  	movabs	r10, 9223372036854775807  # r10 = 0x7FFFFFFFFFFFFFFF (all bits except bit 63)
 20451  	mov	r8, rsi
 20452  	not	r8
 20453  	add	r8, r9  # r8 = r9 - rsi - 1
 20454  	mov	rax, r9
 20455  	and	rax, 3  # rax = count & 3: iterations of the one-element loop below
 20456  	je	.LBB3_661
 20457  .LBB3_660:                              # =>This Inner Loop Header: Depth=1
 20458  	mov	rdi, qword ptr [rdx + 8*rsi]
 20459  	and	rdi, r10  # clear bit 63 of the 64-bit element
 20460  	mov	qword ptr [rcx + 8*rsi], rdi
 20461  	add	rsi, 1
 20462  	add	rax, -1
 20463  	jne	.LBB3_660
 20464  .LBB3_661:
 20465  	cmp	r8, 3
 20466  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 20467  .LBB3_662:                              # =>This Inner Loop Header: Depth=1
 20468  	mov	rax, qword ptr [rdx + 8*rsi]
 20469  	and	rax, r10
 20470  	mov	qword ptr [rcx + 8*rsi], rax
 20471  	mov	rax, qword ptr [rdx + 8*rsi + 8]
 20472  	and	rax, r10
 20473  	mov	qword ptr [rcx + 8*rsi + 8], rax
 20474  	mov	rax, qword ptr [rdx + 8*rsi + 16]
 20475  	and	rax, r10
 20476  	mov	qword ptr [rcx + 8*rsi + 16], rax
 20477  	mov	rax, qword ptr [rdx + 8*rsi + 24]
 20478  	and	rax, r10
 20479  	mov	qword ptr [rcx + 8*rsi + 24], rax
 20480  	add	rsi, 4
 20481  	cmp	r9, rsi
 20482  	jne	.LBB3_662  # repeat until index == count
 20483  	jmp	.LBB3_923
 20484  .LBB3_74:  # arm: edi <= 8 -> .LBB3_126, 9 -> .LBB3_185, 11 -> .LBB3_188, 12 -> handled below, anything else -> .LBB3_923
 20485  	cmp	edi, 8
 20486  	jle	.LBB3_126
 20487  # %bb.75:
 20488  	cmp	edi, 9
 20489  	je	.LBB3_185
 20490  # %bb.76:
 20491  	cmp	edi, 11
 20492  	je	.LBB3_188
 20493  # %bb.77:
 20494  	cmp	edi, 12
 20495  	jne	.LBB3_923
 20496  # %bb.78:
 20497  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 20498  	jle	.LBB3_923
 20499  # %bb.79:
 20500  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 20501  	cmp	r8d, 4
 20502  	jb	.LBB3_80  # fewer than 4 elements: scalar code only
 20503  # %bb.247:
 20504  	lea	rax, [rdx + 8*r9]  # rax = end of the source range
 20505  	cmp	rax, rcx
 20506  	jbe	.LBB3_396  # no overlap (source ends first). NOTE(review): .LBB3_396 is outside this excerpt, presumably the vectorized path - confirm
 20507  # %bb.248:
 20508  	lea	rax, [rcx + 8*r9]  # rax = end of the destination range
 20509  	cmp	rax, rdx
 20510  	jbe	.LBB3_396  # no overlap (destination ends first)
 20511  .LBB3_80:  # ranges overlap, or count < 4: start the scalar code at index 0
 20512  	xor	esi, esi  # rsi = element index
 20513  .LBB3_667:
 20514  	movabs	r10, 9223372036854775807  # r10 = 0x7FFFFFFFFFFFFFFF (all bits except bit 63)
 20515  	mov	r8, rsi
 20516  	not	r8
 20517  	add	r8, r9  # r8 = r9 - rsi - 1
 20518  	mov	rax, r9
 20519  	and	rax, 3  # rax = count & 3: iterations of the one-element loop below
 20520  	je	.LBB3_669
 20521  .LBB3_668:                              # =>This Inner Loop Header: Depth=1
 20522  	mov	rdi, qword ptr [rdx + 8*rsi]
 20523  	and	rdi, r10  # clear bit 63 of the 64-bit element
 20524  	mov	qword ptr [rcx + 8*rsi], rdi
 20525  	add	rsi, 1
 20526  	add	rax, -1
 20527  	jne	.LBB3_668
 20528  .LBB3_669:
 20529  	cmp	r8, 3
 20530  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 20531  .LBB3_670:                              # =>This Inner Loop Header: Depth=1
 20532  	mov	rax, qword ptr [rdx + 8*rsi]
 20533  	and	rax, r10
 20534  	mov	qword ptr [rcx + 8*rsi], rax
 20535  	mov	rax, qword ptr [rdx + 8*rsi + 8]
 20536  	and	rax, r10
 20537  	mov	qword ptr [rcx + 8*rsi + 8], rax
 20538  	mov	rax, qword ptr [rdx + 8*rsi + 16]
 20539  	and	rax, r10
 20540  	mov	qword ptr [rcx + 8*rsi + 16], rax
 20541  	mov	rax, qword ptr [rdx + 8*rsi + 24]
 20542  	and	rax, r10
 20543  	mov	qword ptr [rcx + 8*rsi + 24], rax
 20544  	add	rsi, 4
 20545  	cmp	r9, rsi
 20546  	jne	.LBB3_670  # repeat until index == count
 20547  	jmp	.LBB3_923
 20548  .LBB3_81:  # arm: edi == 2 -> .LBB3_191, edi == 3 -> handled below, anything else -> .LBB3_923
 20549  	cmp	edi, 2
 20550  	je	.LBB3_191
 20551  # %bb.82:
 20552  	cmp	edi, 3
 20553  	jne	.LBB3_923
 20554  # %bb.83:
 20555  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 20556  	jle	.LBB3_923
 20557  # %bb.84:
 20558  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 20559  	cmp	r8d, 32
 20560  	jb	.LBB3_85  # fewer than 32 bytes: scalar code only
 20561  # %bb.250:
 20562  	lea	rax, [rdx + r9]  # rax = end of the source byte range
 20563  	cmp	rax, rcx
 20564  	jbe	.LBB3_399  # no overlap (source ends first). NOTE(review): .LBB3_399 is outside this excerpt, presumably the vectorized path - confirm
 20565  # %bb.251:
 20566  	lea	rax, [rcx + r9]  # rax = end of the destination byte range
 20567  	cmp	rax, rdx
 20568  	jbe	.LBB3_399  # no overlap (destination ends first)
 20569  .LBB3_85:  # ranges overlap, or count < 32: start the scalar code at index 0
 20570  	xor	esi, esi  # rsi = element index
 20571  .LBB3_675:
 20572  	mov	r8, rsi
 20573  	not	r8
 20574  	add	r8, r9  # r8 = r9 - rsi - 1
 20575  	mov	rdi, r9
 20576  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 20577  	je	.LBB3_677
 20578  .LBB3_676:                              # =>This Inner Loop Header: Depth=1
 20579  	movzx	r10d, byte ptr [rdx + rsi]
 20580  	xor	eax, eax
 20581  	sub	al, r10b  # al = 0 - byte (8-bit wrap-around negate)
 20582  	mov	byte ptr [rcx + rsi], al
 20583  	add	rsi, 1
 20584  	add	rdi, -1
 20585  	jne	.LBB3_676
 20586  .LBB3_677:
 20587  	cmp	r8, 3
 20588  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 20589  .LBB3_678:                              # =>This Inner Loop Header: Depth=1
 20590  	xor	eax, eax
 20591  	sub	al, byte ptr [rdx + rsi]  # same 0 - byte negate, four bytes per iteration
 20592  	mov	byte ptr [rcx + rsi], al
 20593  	xor	eax, eax
 20594  	sub	al, byte ptr [rdx + rsi + 1]
 20595  	mov	byte ptr [rcx + rsi + 1], al
 20596  	xor	eax, eax
 20597  	sub	al, byte ptr [rdx + rsi + 2]
 20598  	mov	byte ptr [rcx + rsi + 2], al
 20599  	movzx	eax, byte ptr [rdx + rsi + 3]
 20600  	xor	edi, edi
 20601  	sub	dil, al  # fourth byte negated through dil instead of al
 20602  	mov	byte ptr [rcx + rsi + 3], dil
 20603  	add	rsi, 4
 20604  	cmp	r9, rsi
 20605  	jne	.LBB3_678  # repeat until index == count
 20606  	jmp	.LBB3_923
 20607  .LBB3_86:  # arm: edi == 2 -> .LBB3_194, edi == 3 -> handled below, anything else -> .LBB3_923
 20608  	cmp	edi, 2
 20609  	je	.LBB3_194
 20610  # %bb.87:
 20611  	cmp	edi, 3
 20612  	jne	.LBB3_923
 20613  # %bb.88:
 20614  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 20615  	jle	.LBB3_923
 20616  # %bb.89:
 20617  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 20618  	cmp	r8d, 32
 20619  	jb	.LBB3_90  # fewer than 32 bytes: scalar code only
 20620  # %bb.253:
 20621  	lea	rax, [rdx + r9]  # rax = end of the source byte range
 20622  	cmp	rax, rcx
 20623  	jbe	.LBB3_402  # no overlap (source ends first). NOTE(review): .LBB3_402 is outside this excerpt, presumably the vectorized path - confirm
 20624  # %bb.254:
 20625  	lea	rax, [rcx + r9]  # rax = end of the destination byte range
 20626  	cmp	rax, rdx
 20627  	jbe	.LBB3_402  # no overlap (destination ends first)
 20628  .LBB3_90:  # ranges overlap, or count < 32: start the scalar code at index 0
 20629  	xor	esi, esi  # rsi = element index
 20630  .LBB3_683:
 20631  	mov	r8, rsi
 20632  	not	r8
 20633  	add	r8, r9  # r8 = r9 - rsi - 1
 20634  	mov	rdi, r9
 20635  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 20636  	je	.LBB3_685
 20637  .LBB3_684:                              # =>This Inner Loop Header: Depth=1
 20638  	movzx	r10d, byte ptr [rdx + rsi]
 20639  	xor	eax, eax
 20640  	sub	al, r10b  # al = 0 - byte (8-bit wrap-around negate)
 20641  	mov	byte ptr [rcx + rsi], al
 20642  	add	rsi, 1
 20643  	add	rdi, -1
 20644  	jne	.LBB3_684
 20645  .LBB3_685:
 20646  	cmp	r8, 3
 20647  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 20648  .LBB3_686:                              # =>This Inner Loop Header: Depth=1
 20649  	xor	eax, eax
 20650  	sub	al, byte ptr [rdx + rsi]  # same 0 - byte negate, four bytes per iteration
 20651  	mov	byte ptr [rcx + rsi], al
 20652  	xor	eax, eax
 20653  	sub	al, byte ptr [rdx + rsi + 1]
 20654  	mov	byte ptr [rcx + rsi + 1], al
 20655  	xor	eax, eax
 20656  	sub	al, byte ptr [rdx + rsi + 2]
 20657  	mov	byte ptr [rcx + rsi + 2], al
 20658  	movzx	eax, byte ptr [rdx + rsi + 3]
 20659  	xor	edi, edi
 20660  	sub	dil, al  # fourth byte negated through dil instead of al
 20661  	mov	byte ptr [rcx + rsi + 3], dil
 20662  	add	rsi, 4
 20663  	cmp	r9, rsi
 20664  	jne	.LBB3_686  # repeat until index == count
 20665  	jmp	.LBB3_923
 20666  .LBB3_91:  # arm: edi == 2 -> .LBB3_197, edi == 3 -> handled below, anything else -> .LBB3_923
 20667  	cmp	edi, 2
 20668  	je	.LBB3_197
 20669  # %bb.92:
 20670  	cmp	edi, 3
 20671  	jne	.LBB3_923
 20672  # %bb.93:
 20673  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 20674  	jle	.LBB3_923
 20675  # %bb.94:
 20676  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 20677  	cmp	r8d, 32
 20678  	jb	.LBB3_95  # fewer than 32 bytes: scalar code only
 20679  # %bb.256:
 20680  	lea	rax, [rdx + r9]  # rax = end of the source byte range
 20681  	cmp	rax, rcx
 20682  	jbe	.LBB3_405  # no overlap (source ends first). NOTE(review): .LBB3_405 is outside this excerpt, presumably the vectorized path - confirm
 20683  # %bb.257:
 20684  	lea	rax, [rcx + r9]  # rax = end of the destination byte range
 20685  	cmp	rax, rdx
 20686  	jbe	.LBB3_405  # no overlap (destination ends first)
 20687  .LBB3_95:  # ranges overlap, or count < 32: start the scalar code at index 0
 20688  	xor	esi, esi  # rsi = element index
 20689  .LBB3_691:
 20690  	mov	rax, rsi
 20691  	not	rax  # rax = -(rsi + 1)
 20692  	test	r9b, 1
 20693  	je	.LBB3_693  # even count: no single leading element to handle
 20694  # %bb.692:
 20695  	mov	dil, byte ptr [rdx + rsi]
 20696  	test	dil, dil
 20697  	setne	r8b
 20698  	neg	r8b  # r8b = 0xFF when the byte is non-zero, else 0
 20699  	test	dil, dil
 20700  	movzx	r8d, r8b
 20701  	mov	edi, 1
 20702  	cmovle	edi, r8d  # signed byte > 0 keeps 1; otherwise take r8d (0 for zero, 0xFF i.e. -1 for negative)
 20703  	mov	byte ptr [rcx + rsi], dil
 20704  	or	rsi, 1
 20705  .LBB3_693:
 20706  	add	rax, r9  # rax = r9 - 1 - (rsi as it was at .LBB3_691)
 20707  	je	.LBB3_923  # zero: leave without entering the 2x loop
 20708  # %bb.694:
 20709  	mov	edi, 1  # constant 1 selected by cmovg for positive inputs
 20710  .LBB3_695:                              # =>This Inner Loop Header: Depth=1
 20711  	movzx	r8d, byte ptr [rdx + rsi]
 20712  	test	r8b, r8b
 20713  	setne	al
 20714  	neg	al  # al = 0xFF when the byte is non-zero, else 0
 20715  	test	r8b, r8b
 20716  	movzx	eax, al
 20717  	cmovg	eax, edi  # signed byte > 0: result becomes 1
 20718  	mov	byte ptr [rcx + rsi], al
 20719  	movzx	r8d, byte ptr [rdx + rsi + 1]
 20720  	test	r8b, r8b
 20721  	setne	al
 20722  	neg	al
 20723  	test	r8b, r8b
 20724  	movzx	eax, al
 20725  	cmovg	eax, edi
 20726  	mov	byte ptr [rcx + rsi + 1], al
 20727  	add	rsi, 2
 20728  	cmp	r9, rsi
 20729  	jne	.LBB3_695  # repeat until index == count
 20730  	jmp	.LBB3_923
 20731  .LBB3_96:  # arm: edi == 2 -> .LBB3_200, edi == 3 -> handled below, anything else -> .LBB3_923
 20732  	cmp	edi, 2
 20733  	je	.LBB3_200
 20734  # %bb.97:
 20735  	cmp	edi, 3
 20736  	jne	.LBB3_923
 20737  # %bb.98:
 20738  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 20739  	jle	.LBB3_923
 20740  # %bb.99:
 20741  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 20742  	cmp	r8d, 16
 20743  	jb	.LBB3_100  # fewer than 16 bytes: scalar code only
 20744  # %bb.259:
 20745  	lea	rax, [rdx + r9]  # rax = end of the source byte range
 20746  	cmp	rax, rcx
 20747  	jbe	.LBB3_408  # no overlap (source ends first). NOTE(review): .LBB3_408 is outside this excerpt, presumably the vectorized path - confirm
 20748  # %bb.260:
 20749  	lea	rax, [rcx + r9]  # rax = end of the destination byte range
 20750  	cmp	rax, rdx
 20751  	jbe	.LBB3_408  # no overlap (destination ends first)
 20752  .LBB3_100:  # ranges overlap, or count < 16: start the scalar code at index 0
 20753  	xor	esi, esi  # rsi = element index
 20754  .LBB3_700:
 20755  	mov	rax, rsi
 20756  	not	rax  # rax = -(rsi + 1)
 20757  	test	r9b, 1
 20758  	je	.LBB3_702  # even count: no single leading element to handle
 20759  # %bb.701:
 20760  	movsx	edi, byte ptr [rdx + rsi]
 20761  	mov	r8d, edi
 20762  	sar	r8d, 7  # r8d = -1 for a negative byte, 0 otherwise
 20763  	add	edi, r8d
 20764  	xor	edi, r8d  # (x + m) ^ m = |x|; only the low byte is stored, so -128 is written back as 0x80
 20765  	mov	byte ptr [rcx + rsi], dil
 20766  	or	rsi, 1
 20767  .LBB3_702:
 20768  	add	rax, r9  # rax = r9 - 1 - (rsi as it was at .LBB3_700)
 20769  	je	.LBB3_923  # zero: leave without entering the 2x loop
 20770  .LBB3_703:                              # =>This Inner Loop Header: Depth=1
 20771  	movsx	eax, byte ptr [rdx + rsi]
 20772  	mov	edi, eax
 20773  	sar	edi, 7  # edi = sign mask (-1 or 0)
 20774  	add	eax, edi
 20775  	xor	eax, edi  # absolute value via (x + m) ^ m
 20776  	mov	byte ptr [rcx + rsi], al
 20777  	movsx	eax, byte ptr [rdx + rsi + 1]
 20778  	mov	edi, eax
 20779  	sar	edi, 7
 20780  	add	eax, edi
 20781  	xor	eax, edi
 20782  	mov	byte ptr [rcx + rsi + 1], al
 20783  	add	rsi, 2
 20784  	cmp	r9, rsi
 20785  	jne	.LBB3_703  # repeat until index == count
 20786  	jmp	.LBB3_923
 20787  .LBB3_101:  # arm: edi == 2 -> .LBB3_203, edi == 3 -> handled below, anything else -> .LBB3_923
 20788  	cmp	edi, 2
 20789  	je	.LBB3_203
 20790  # %bb.102:
 20791  	cmp	edi, 3
 20792  	jne	.LBB3_923
 20793  # %bb.103:
 20794  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 20795  	jle	.LBB3_923
 20796  # %bb.104:
 20797  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 20798  	cmp	r8d, 16
 20799  	jb	.LBB3_105  # fewer than 16 bytes: scalar code only
 20800  # %bb.262:
 20801  	lea	rax, [rdx + r9]  # rax = end of the source byte range
 20802  	cmp	rax, rcx
 20803  	jbe	.LBB3_411  # no overlap (source ends first). NOTE(review): .LBB3_411 is outside this excerpt, presumably the vectorized path - confirm
 20804  # %bb.263:
 20805  	lea	rax, [rcx + r9]  # rax = end of the destination byte range
 20806  	cmp	rax, rdx
 20807  	jbe	.LBB3_411  # no overlap (destination ends first)
 20808  .LBB3_105:  # ranges overlap, or count < 16: start the scalar code at index 0
 20809  	xor	esi, esi  # rsi = element index
 20810  .LBB3_708:
 20811  	mov	rax, rsi
 20812  	not	rax  # rax = -(rsi + 1)
 20813  	test	r9b, 1
 20814  	je	.LBB3_710  # even count: no single leading element to handle
 20815  # %bb.709:
 20816  	movsx	edi, byte ptr [rdx + rsi]
 20817  	mov	r8d, edi
 20818  	sar	r8d, 7  # r8d = -1 for a negative byte, 0 otherwise
 20819  	add	edi, r8d
 20820  	xor	edi, r8d  # (x + m) ^ m = |x|; only the low byte is stored, so -128 is written back as 0x80
 20821  	mov	byte ptr [rcx + rsi], dil
 20822  	or	rsi, 1
 20823  .LBB3_710:
 20824  	add	rax, r9  # rax = r9 - 1 - (rsi as it was at .LBB3_708)
 20825  	je	.LBB3_923  # zero: leave without entering the 2x loop
 20826  .LBB3_711:                              # =>This Inner Loop Header: Depth=1
 20827  	movsx	eax, byte ptr [rdx + rsi]
 20828  	mov	edi, eax
 20829  	sar	edi, 7  # edi = sign mask (-1 or 0)
 20830  	add	eax, edi
 20831  	xor	eax, edi  # absolute value via (x + m) ^ m
 20832  	mov	byte ptr [rcx + rsi], al
 20833  	movsx	eax, byte ptr [rdx + rsi + 1]
 20834  	mov	edi, eax
 20835  	sar	edi, 7
 20836  	add	eax, edi
 20837  	xor	eax, edi
 20838  	mov	byte ptr [rcx + rsi + 1], al
 20839  	add	rsi, 2
 20840  	cmp	r9, rsi
 20841  	jne	.LBB3_711  # repeat until index == count
 20842  	jmp	.LBB3_923
 20843  .LBB3_106:  # arm: edi == 7 -> .LBB3_206, edi == 8 -> handled below, anything else -> .LBB3_923
 20844  	cmp	edi, 7
 20845  	je	.LBB3_206
 20846  # %bb.107:
 20847  	cmp	edi, 8
 20848  	jne	.LBB3_923
 20849  # %bb.108:
 20850  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 20851  	jle	.LBB3_923
 20852  # %bb.109:
 20853  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 20854  	cmp	r8d, 4
 20855  	jae	.LBB3_265  # 4 or more elements: continue at .LBB3_265 (outside this excerpt)
 20856  # %bb.110:
 20857  	xor	edx, edx  # rdx is zeroed here, so this arm does not read through the rdx pointer; NOTE(review): presumably rdx becomes the index used at .LBB3_420 - confirm
 20858  	jmp	.LBB3_420
 20859  .LBB3_111:  # arm: edi == 7 -> .LBB3_209, edi == 8 -> handled below, anything else -> .LBB3_923
 20860  	cmp	edi, 7
 20861  	je	.LBB3_209
 20862  # %bb.112:
 20863  	cmp	edi, 8
 20864  	jne	.LBB3_923
 20865  # %bb.113:
 20866  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 20867  	jle	.LBB3_923
 20868  # %bb.114:
 20869  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 20870  	cmp	r8d, 4
 20871  	jb	.LBB3_115  # fewer than 4 elements: scalar code only
 20872  # %bb.267:
 20873  	lea	rax, [rdx + 8*r9]  # rax = end of the source range (rdx + 8*count)
 20874  	cmp	rax, rcx
 20875  	jbe	.LBB3_421  # no overlap (source ends first). NOTE(review): .LBB3_421 is outside this excerpt, presumably the vectorized path - confirm
 20876  # %bb.268:
 20877  	lea	rax, [rcx + 8*r9]  # rax = end of the destination range (rcx + 8*count)
 20878  	cmp	rax, rdx
 20879  	jbe	.LBB3_421  # no overlap (destination ends first)
 20880  .LBB3_115:  # ranges overlap, or count < 4: start the scalar code at index 0
 20881  	xor	esi, esi  # rsi = element index
 20882  .LBB3_716:
 20883  	mov	r8, rsi
 20884  	not	r8
 20885  	add	r8, r9  # r8 = r9 - rsi - 1
 20886  	mov	rdi, r9
 20887  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 20888  	je	.LBB3_718
 20889  .LBB3_717:                              # =>This Inner Loop Header: Depth=1
 20890  	xor	eax, eax
 20891  	sub	rax, qword ptr [rdx + 8*rsi]  # rax = 0 - element (64-bit wrap-around negate)
 20892  	mov	qword ptr [rcx + 8*rsi], rax
 20893  	add	rsi, 1
 20894  	add	rdi, -1
 20895  	jne	.LBB3_717
 20896  .LBB3_718:
 20897  	cmp	r8, 3
 20898  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 20899  .LBB3_719:                              # =>This Inner Loop Header: Depth=1
 20900  	xor	eax, eax
 20901  	sub	rax, qword ptr [rdx + 8*rsi]  # same negate, four elements per iteration
 20902  	mov	qword ptr [rcx + 8*rsi], rax
 20903  	xor	eax, eax
 20904  	sub	rax, qword ptr [rdx + 8*rsi + 8]
 20905  	mov	qword ptr [rcx + 8*rsi + 8], rax
 20906  	xor	eax, eax
 20907  	sub	rax, qword ptr [rdx + 8*rsi + 16]
 20908  	mov	qword ptr [rcx + 8*rsi + 16], rax
 20909  	xor	eax, eax
 20910  	sub	rax, qword ptr [rdx + 8*rsi + 24]
 20911  	mov	qword ptr [rcx + 8*rsi + 24], rax
 20912  	add	rsi, 4
 20913  	cmp	r9, rsi
 20914  	jne	.LBB3_719  # repeat until index == count
 20915  	jmp	.LBB3_923
 20916  .LBB3_116:  # arm: edi == 7 -> .LBB3_212, edi == 8 -> handled below, anything else -> .LBB3_923
 20917  	cmp	edi, 7
 20918  	je	.LBB3_212
 20919  # %bb.117:
 20920  	cmp	edi, 8
 20921  	jne	.LBB3_923
 20922  # %bb.118:
 20923  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 20924  	jle	.LBB3_923
 20925  # %bb.119:
 20926  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 20927  	cmp	r8d, 4
 20928  	jb	.LBB3_120  # fewer than 4 elements: scalar code only
 20929  # %bb.270:
 20930  	lea	rax, [rdx + 8*r9]  # rax = end of the source range (rdx + 8*count)
 20931  	cmp	rax, rcx
 20932  	jbe	.LBB3_424  # no overlap (source ends first). NOTE(review): .LBB3_424 is outside this excerpt, presumably the vectorized path - confirm
 20933  # %bb.271:
 20934  	lea	rax, [rcx + 8*r9]  # rax = end of the destination range (rcx + 8*count)
 20935  	cmp	rax, rdx
 20936  	jbe	.LBB3_424  # no overlap (destination ends first)
 20937  .LBB3_120:  # ranges overlap, or count < 4: start the scalar code at index 0
 20938  	xor	esi, esi  # rsi = element index
 20939  .LBB3_724:
 20940  	mov	r8, rsi
 20941  	not	r8
 20942  	add	r8, r9  # r8 = r9 - rsi - 1
 20943  	mov	rdi, r9
 20944  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 20945  	je	.LBB3_726
 20946  .LBB3_725:                              # =>This Inner Loop Header: Depth=1
 20947  	xor	eax, eax
 20948  	cmp	qword ptr [rdx + 8*rsi], 0
 20949  	setne	al  # rax = 1 when the 64-bit element is non-zero, else 0
 20950  	mov	qword ptr [rcx + 8*rsi], rax
 20951  	add	rsi, 1
 20952  	add	rdi, -1
 20953  	jne	.LBB3_725
 20954  .LBB3_726:
 20955  	cmp	r8, 3
 20956  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 20957  .LBB3_727:                              # =>This Inner Loop Header: Depth=1
 20958  	xor	eax, eax
 20959  	cmp	qword ptr [rdx + 8*rsi], 0
 20960  	setne	al  # same non-zero flag, four elements per iteration
 20961  	mov	qword ptr [rcx + 8*rsi], rax
 20962  	xor	eax, eax
 20963  	cmp	qword ptr [rdx + 8*rsi + 8], 0
 20964  	setne	al
 20965  	mov	qword ptr [rcx + 8*rsi + 8], rax
 20966  	xor	eax, eax
 20967  	cmp	qword ptr [rdx + 8*rsi + 16], 0
 20968  	setne	al
 20969  	mov	qword ptr [rcx + 8*rsi + 16], rax
 20970  	xor	eax, eax
 20971  	cmp	qword ptr [rdx + 8*rsi + 24], 0
 20972  	setne	al
 20973  	mov	qword ptr [rcx + 8*rsi + 24], rax
 20974  	add	rsi, 4
 20975  	cmp	r9, rsi
 20976  	jne	.LBB3_727  # repeat until index == count
 20977  	jmp	.LBB3_923
 20978  .LBB3_121:  # arm: edi == 7 -> .LBB3_215, edi == 8 -> handled below, anything else -> .LBB3_923
 20979  	cmp	edi, 7
 20980  	je	.LBB3_215
 20981  # %bb.122:
 20982  	cmp	edi, 8
 20983  	jne	.LBB3_923
 20984  # %bb.123:
 20985  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 20986  	jle	.LBB3_923
 20987  # %bb.124:
 20988  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 20989  	cmp	r8d, 4
 20990  	jb	.LBB3_125  # fewer than 4 elements: scalar code only
 20991  # %bb.273:
 20992  	lea	rax, [rdx + 8*r9]  # rax = end of the source range (rdx + 8*count)
 20993  	cmp	rax, rcx
 20994  	jbe	.LBB3_427  # no overlap (source ends first). NOTE(review): .LBB3_427 is outside this excerpt, presumably the vectorized path - confirm
 20995  # %bb.274:
 20996  	lea	rax, [rcx + 8*r9]  # rax = end of the destination range (rcx + 8*count)
 20997  	cmp	rax, rdx
 20998  	jbe	.LBB3_427  # no overlap (destination ends first)
 20999  .LBB3_125:  # ranges overlap, or count < 4: start the scalar code at index 0
 21000  	xor	esi, esi  # rsi = element index
 21001  .LBB3_556:
 21002  	mov	r8, rsi
 21003  	not	r8
 21004  	add	r8, r9  # r8 = r9 - rsi - 1
 21005  	mov	rdi, r9
 21006  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 21007  	je	.LBB3_558
 21008  .LBB3_557:                              # =>This Inner Loop Header: Depth=1
 21009  	mov	rax, qword ptr [rdx + 8*rsi]  # plain copy of one 64-bit element, value unchanged
 21010  	mov	qword ptr [rcx + 8*rsi], rax
 21011  	add	rsi, 1
 21012  	add	rdi, -1
 21013  	jne	.LBB3_557
 21014  .LBB3_558:
 21015  	cmp	r8, 3
 21016  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 21017  .LBB3_559:                              # =>This Inner Loop Header: Depth=1
 21018  	mov	rax, qword ptr [rdx + 8*rsi]  # same copy, four elements per iteration
 21019  	mov	qword ptr [rcx + 8*rsi], rax
 21020  	mov	rax, qword ptr [rdx + 8*rsi + 8]
 21021  	mov	qword ptr [rcx + 8*rsi + 8], rax
 21022  	mov	rax, qword ptr [rdx + 8*rsi + 16]
 21023  	mov	qword ptr [rcx + 8*rsi + 16], rax
 21024  	mov	rax, qword ptr [rdx + 8*rsi + 24]
 21025  	mov	qword ptr [rcx + 8*rsi + 24], rax
 21026  	add	rsi, 4
 21027  	cmp	r9, rsi
 21028  	jne	.LBB3_559  # repeat until index == count
 21029  	jmp	.LBB3_923
 21030  .LBB3_126:  # arm: edi == 7 -> .LBB3_218, edi == 8 -> handled below, anything else -> .LBB3_923
 21031  	cmp	edi, 7
 21032  	je	.LBB3_218
 21033  # %bb.127:
 21034  	cmp	edi, 8
 21035  	jne	.LBB3_923
 21036  # %bb.128:
 21037  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21038  	jle	.LBB3_923
 21039  # %bb.129:
 21040  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21041  	cmp	r8d, 4
 21042  	jb	.LBB3_130  # fewer than 4 elements: scalar code only
 21043  # %bb.276:
 21044  	lea	rax, [rdx + 8*r9]  # rax = end of the source range (rdx + 8*count)
 21045  	cmp	rax, rcx
 21046  	jbe	.LBB3_429  # no overlap (source ends first). NOTE(review): .LBB3_429 is outside this excerpt, presumably the vectorized path - confirm
 21047  # %bb.277:
 21048  	lea	rax, [rcx + 8*r9]  # rax = end of the destination range (rcx + 8*count)
 21049  	cmp	rax, rdx
 21050  	jbe	.LBB3_429  # no overlap (destination ends first)
 21051  .LBB3_130:  # ranges overlap, or count < 4: start the scalar code at index 0
 21052  	xor	esi, esi  # rsi = element index
 21053  .LBB3_566:
 21054  	mov	r8, rsi
 21055  	not	r8
 21056  	add	r8, r9  # r8 = r9 - rsi - 1
 21057  	mov	rdi, r9
 21058  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 21059  	je	.LBB3_568
 21060  .LBB3_567:                              # =>This Inner Loop Header: Depth=1
 21061  	mov	rax, qword ptr [rdx + 8*rsi]  # plain copy of one 64-bit element, value unchanged
 21062  	mov	qword ptr [rcx + 8*rsi], rax
 21063  	add	rsi, 1
 21064  	add	rdi, -1
 21065  	jne	.LBB3_567
 21066  .LBB3_568:
 21067  	cmp	r8, 3
 21068  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 21069  .LBB3_569:                              # =>This Inner Loop Header: Depth=1
 21070  	mov	rax, qword ptr [rdx + 8*rsi]  # same copy, four elements per iteration
 21071  	mov	qword ptr [rcx + 8*rsi], rax
 21072  	mov	rax, qword ptr [rdx + 8*rsi + 8]
 21073  	mov	qword ptr [rcx + 8*rsi + 8], rax
 21074  	mov	rax, qword ptr [rdx + 8*rsi + 16]
 21075  	mov	qword ptr [rcx + 8*rsi + 16], rax
 21076  	mov	rax, qword ptr [rdx + 8*rsi + 24]
 21077  	mov	qword ptr [rcx + 8*rsi + 24], rax
 21078  	add	rsi, 4
 21079  	cmp	r9, rsi
 21080  	jne	.LBB3_569  # repeat until index == count
 21081  	jmp	.LBB3_923
 21082  .LBB3_131:  # arm: only the count check and size split are here; the work is done at labels outside this excerpt
 21083  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21084  	jle	.LBB3_923
 21085  # %bb.132:
 21086  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21087  	cmp	r8d, 16
 21088  	jae	.LBB3_279  # 16 or more elements: continue at .LBB3_279 (outside this excerpt)
 21089  # %bb.133:
 21090  	xor	edx, edx  # rdx is zeroed here, so this arm does not read through the rdx pointer; NOTE(review): presumably rdx becomes the index used at .LBB3_437 - confirm
 21091  	jmp	.LBB3_437
 21092  .LBB3_134:  # arm: writes 0 - x for each 16-bit element read through rdx, storing through rcx
 21093  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21094  	jle	.LBB3_923
 21095  # %bb.135:
 21096  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21097  	cmp	r8d, 16
 21098  	jb	.LBB3_136  # fewer than 16 elements: scalar code only
 21099  # %bb.281:
 21100  	lea	rax, [rdx + 2*r9]  # rax = end of the source range (rdx + 2*count)
 21101  	cmp	rax, rcx
 21102  	jbe	.LBB3_438  # no overlap (source ends first). NOTE(review): .LBB3_438 is outside this excerpt, presumably the vectorized path - confirm
 21103  # %bb.282:
 21104  	lea	rax, [rcx + 2*r9]  # rax = end of the destination range (rcx + 2*count)
 21105  	cmp	rax, rdx
 21106  	jbe	.LBB3_438  # no overlap (destination ends first)
 21107  .LBB3_136:  # ranges overlap, or count < 16: start the scalar code at index 0
 21108  	xor	esi, esi  # rsi = element index
 21109  .LBB3_732:
 21110  	mov	r8, rsi
 21111  	not	r8
 21112  	add	r8, r9  # r8 = r9 - rsi - 1
 21113  	mov	rdi, r9
 21114  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 21115  	je	.LBB3_734
 21116  .LBB3_733:                              # =>This Inner Loop Header: Depth=1
 21117  	xor	eax, eax
 21118  	sub	ax, word ptr [rdx + 2*rsi]  # ax = 0 - element (16-bit wrap-around negate)
 21119  	mov	word ptr [rcx + 2*rsi], ax
 21120  	add	rsi, 1
 21121  	add	rdi, -1
 21122  	jne	.LBB3_733
 21123  .LBB3_734:
 21124  	cmp	r8, 3
 21125  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 21126  .LBB3_735:                              # =>This Inner Loop Header: Depth=1
 21127  	xor	eax, eax
 21128  	sub	ax, word ptr [rdx + 2*rsi]  # same negate, four elements per iteration
 21129  	mov	word ptr [rcx + 2*rsi], ax
 21130  	xor	eax, eax
 21131  	sub	ax, word ptr [rdx + 2*rsi + 2]
 21132  	mov	word ptr [rcx + 2*rsi + 2], ax
 21133  	xor	eax, eax
 21134  	sub	ax, word ptr [rdx + 2*rsi + 4]
 21135  	mov	word ptr [rcx + 2*rsi + 4], ax
 21136  	xor	eax, eax
 21137  	sub	ax, word ptr [rdx + 2*rsi + 6]
 21138  	mov	word ptr [rcx + 2*rsi + 6], ax
 21139  	add	rsi, 4
 21140  	cmp	r9, rsi
 21141  	jne	.LBB3_735  # repeat until index == count
 21142  	jmp	.LBB3_923
 21143  .LBB3_137:  # arm: writes 0 - x for each 16-bit element read through rdx, storing through rcx
 21144  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21145  	jle	.LBB3_923
 21146  # %bb.138:
 21147  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21148  	cmp	r8d, 16
 21149  	jb	.LBB3_139  # fewer than 16 elements: scalar code only
 21150  # %bb.284:
 21151  	lea	rax, [rdx + 2*r9]  # rax = end of the source range (rdx + 2*count)
 21152  	cmp	rax, rcx
 21153  	jbe	.LBB3_441  # no overlap (source ends first). NOTE(review): .LBB3_441 is outside this excerpt, presumably the vectorized path - confirm
 21154  # %bb.285:
 21155  	lea	rax, [rcx + 2*r9]  # rax = end of the destination range (rcx + 2*count)
 21156  	cmp	rax, rdx
 21157  	jbe	.LBB3_441  # no overlap (destination ends first)
 21158  .LBB3_139:  # ranges overlap, or count < 16: start the scalar code at index 0
 21159  	xor	esi, esi  # rsi = element index
 21160  .LBB3_740:
 21161  	mov	r8, rsi
 21162  	not	r8
 21163  	add	r8, r9  # r8 = r9 - rsi - 1
 21164  	mov	rdi, r9
 21165  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 21166  	je	.LBB3_742
 21167  .LBB3_741:                              # =>This Inner Loop Header: Depth=1
 21168  	xor	eax, eax
 21169  	sub	ax, word ptr [rdx + 2*rsi]  # ax = 0 - element (16-bit wrap-around negate)
 21170  	mov	word ptr [rcx + 2*rsi], ax
 21171  	add	rsi, 1
 21172  	add	rdi, -1
 21173  	jne	.LBB3_741
 21174  .LBB3_742:
 21175  	cmp	r8, 3
 21176  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 21177  .LBB3_743:                              # =>This Inner Loop Header: Depth=1
 21178  	xor	eax, eax
 21179  	sub	ax, word ptr [rdx + 2*rsi]  # same negate, four elements per iteration
 21180  	mov	word ptr [rcx + 2*rsi], ax
 21181  	xor	eax, eax
 21182  	sub	ax, word ptr [rdx + 2*rsi + 2]
 21183  	mov	word ptr [rcx + 2*rsi + 2], ax
 21184  	xor	eax, eax
 21185  	sub	ax, word ptr [rdx + 2*rsi + 4]
 21186  	mov	word ptr [rcx + 2*rsi + 4], ax
 21187  	xor	eax, eax
 21188  	sub	ax, word ptr [rdx + 2*rsi + 6]
 21189  	mov	word ptr [rcx + 2*rsi + 6], ax
 21190  	add	rsi, 4
 21191  	cmp	r9, rsi
 21192  	jne	.LBB3_743  # repeat until index == count
 21193  	jmp	.LBB3_923
 21194  .LBB3_140:  # arm: writes 0 - x for each 16-bit element read through rdx, storing through rcx
 21195  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21196  	jle	.LBB3_923
 21197  # %bb.141:
 21198  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21199  	cmp	r8d, 16
 21200  	jb	.LBB3_142  # fewer than 16 elements: scalar code only
 21201  # %bb.287:
 21202  	lea	rax, [rdx + 2*r9]  # rax = end of the source range (rdx + 2*count)
 21203  	cmp	rax, rcx
 21204  	jbe	.LBB3_444  # no overlap (source ends first). NOTE(review): .LBB3_444 is outside this excerpt, presumably the vectorized path - confirm
 21205  # %bb.288:
 21206  	lea	rax, [rcx + 2*r9]  # rax = end of the destination range (rcx + 2*count)
 21207  	cmp	rax, rdx
 21208  	jbe	.LBB3_444  # no overlap (destination ends first)
 21209  .LBB3_142:  # ranges overlap, or count < 16: start the scalar code at index 0
 21210  	xor	esi, esi  # rsi = element index
 21211  .LBB3_748:
 21212  	mov	r8, rsi
 21213  	not	r8
 21214  	add	r8, r9  # r8 = r9 - rsi - 1
 21215  	mov	rdi, r9
 21216  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 21217  	je	.LBB3_750
 21218  .LBB3_749:                              # =>This Inner Loop Header: Depth=1
 21219  	xor	eax, eax
 21220  	sub	ax, word ptr [rdx + 2*rsi]  # ax = 0 - element (16-bit wrap-around negate)
 21221  	mov	word ptr [rcx + 2*rsi], ax
 21222  	add	rsi, 1
 21223  	add	rdi, -1
 21224  	jne	.LBB3_749
 21225  .LBB3_750:
 21226  	cmp	r8, 3
 21227  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 21228  .LBB3_751:                              # =>This Inner Loop Header: Depth=1
 21229  	xor	eax, eax
 21230  	sub	ax, word ptr [rdx + 2*rsi]  # same negate, four elements per iteration
 21231  	mov	word ptr [rcx + 2*rsi], ax
 21232  	xor	eax, eax
 21233  	sub	ax, word ptr [rdx + 2*rsi + 2]
 21234  	mov	word ptr [rcx + 2*rsi + 2], ax
 21235  	xor	eax, eax
 21236  	sub	ax, word ptr [rdx + 2*rsi + 4]
 21237  	mov	word ptr [rcx + 2*rsi + 4], ax
 21238  	xor	eax, eax
 21239  	sub	ax, word ptr [rdx + 2*rsi + 6]
 21240  	mov	word ptr [rcx + 2*rsi + 6], ax
 21241  	add	rsi, 4
 21242  	cmp	r9, rsi
 21243  	jne	.LBB3_751  # repeat until index == count
 21244  	jmp	.LBB3_923
 21245  .LBB3_143:  # arm: writes 1 for every non-zero 16-bit element and 0 for every zero element
 21246  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21247  	jle	.LBB3_923
 21248  # %bb.144:
 21249  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21250  	cmp	r8d, 16
 21251  	jb	.LBB3_145  # fewer than 16 elements: scalar code only
 21252  # %bb.290:
 21253  	lea	rax, [rdx + 2*r9]  # rax = end of the source range (rdx + 2*count)
 21254  	cmp	rax, rcx
 21255  	jbe	.LBB3_447  # no overlap (source ends first). NOTE(review): .LBB3_447 is outside this excerpt, presumably the vectorized path - confirm
 21256  # %bb.291:
 21257  	lea	rax, [rcx + 2*r9]  # rax = end of the destination range (rcx + 2*count)
 21258  	cmp	rax, rdx
 21259  	jbe	.LBB3_447  # no overlap (destination ends first)
 21260  .LBB3_145:  # ranges overlap, or count < 16: start the scalar code at index 0
 21261  	xor	esi, esi  # rsi = element index
 21262  .LBB3_756:
 21263  	mov	r8, rsi
 21264  	not	r8
 21265  	add	r8, r9  # r8 = r9 - rsi - 1
 21266  	mov	rdi, r9
 21267  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 21268  	je	.LBB3_758
 21269  .LBB3_757:                              # =>This Inner Loop Header: Depth=1
 21270  	xor	eax, eax
 21271  	cmp	word ptr [rdx + 2*rsi], 0
 21272  	setne	al  # ax = 1 when the 16-bit element is non-zero, else 0
 21273  	mov	word ptr [rcx + 2*rsi], ax
 21274  	add	rsi, 1
 21275  	add	rdi, -1
 21276  	jne	.LBB3_757
 21277  .LBB3_758:
 21278  	cmp	r8, 3
 21279  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 21280  .LBB3_759:                              # =>This Inner Loop Header: Depth=1
 21281  	xor	eax, eax
 21282  	cmp	word ptr [rdx + 2*rsi], 0
 21283  	setne	al  # same non-zero flag, four elements per iteration
 21284  	mov	word ptr [rcx + 2*rsi], ax
 21285  	xor	eax, eax
 21286  	cmp	word ptr [rdx + 2*rsi + 2], 0
 21287  	setne	al
 21288  	mov	word ptr [rcx + 2*rsi + 2], ax
 21289  	xor	eax, eax
 21290  	cmp	word ptr [rdx + 2*rsi + 4], 0
 21291  	setne	al
 21292  	mov	word ptr [rcx + 2*rsi + 4], ax
 21293  	xor	eax, eax
 21294  	cmp	word ptr [rdx + 2*rsi + 6], 0
 21295  	setne	al
 21296  	mov	word ptr [rcx + 2*rsi + 6], ax
 21297  	add	rsi, 4
 21298  	cmp	r9, rsi
 21299  	jne	.LBB3_759  # repeat until index == count
 21300  	jmp	.LBB3_923
 21301  .LBB3_146:  # arm: writes 1, 0 or -1 (0xFFFF) according to the sign of each 16-bit element taken as signed
 21302  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21303  	jle	.LBB3_923
 21304  # %bb.147:
 21305  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21306  	cmp	r8d, 16
 21307  	jb	.LBB3_148  # fewer than 16 elements: scalar code only
 21308  # %bb.293:
 21309  	lea	rax, [rdx + 2*r9]  # rax = end of the source range (rdx + 2*count)
 21310  	cmp	rax, rcx
 21311  	jbe	.LBB3_450  # no overlap (source ends first). NOTE(review): .LBB3_450 is outside this excerpt, presumably the vectorized path - confirm
 21312  # %bb.294:
 21313  	lea	rax, [rcx + 2*r9]  # rax = end of the destination range (rcx + 2*count)
 21314  	cmp	rax, rdx
 21315  	jbe	.LBB3_450  # no overlap (destination ends first)
 21316  .LBB3_148:  # ranges overlap, or count < 16: start the scalar code at index 0
 21317  	xor	esi, esi  # rsi = element index
 21318  .LBB3_764:
 21319  	mov	rax, rsi
 21320  	not	rax  # rax = -(rsi + 1)
 21321  	test	r9b, 1
 21322  	je	.LBB3_766  # even count: no single leading element to handle
 21323  # %bb.765:
 21324  	movzx	r8d, word ptr [rdx + 2*rsi]
 21325  	xor	r10d, r10d
 21326  	test	r8w, r8w
 21327  	setne	r10b
 21328  	neg	r10d  # r10d = -1 when the element is non-zero, else 0
 21329  	test	r8w, r8w
 21330  	mov	edi, 1
 21331  	cmovle	edi, r10d  # signed element > 0 keeps 1; otherwise take r10d (0 or -1)
 21332  	mov	word ptr [rcx + 2*rsi], di
 21333  	or	rsi, 1
 21334  .LBB3_766:
 21335  	add	rax, r9  # rax = r9 - 1 - (rsi as it was at .LBB3_764)
 21336  	je	.LBB3_923  # zero: leave without entering the 2x loop
 21337  # %bb.767:
 21338  	mov	r8d, 1  # constant 1 selected by cmovg for positive inputs
 21339  .LBB3_768:                              # =>This Inner Loop Header: Depth=1
 21340  	movzx	edi, word ptr [rdx + 2*rsi]
 21341  	xor	eax, eax
 21342  	test	di, di
 21343  	setne	al
 21344  	neg	eax  # eax = -1 when the element is non-zero, else 0
 21345  	test	di, di
 21346  	cmovg	eax, r8d  # signed element > 0: result becomes 1
 21347  	mov	word ptr [rcx + 2*rsi], ax
 21348  	movzx	eax, word ptr [rdx + 2*rsi + 2]
 21349  	xor	edi, edi
 21350  	test	ax, ax
 21351  	setne	dil
 21352  	neg	edi
 21353  	test	ax, ax
 21354  	cmovg	edi, r8d
 21355  	mov	word ptr [rcx + 2*rsi + 2], di
 21356  	add	rsi, 2
 21357  	cmp	r9, rsi
 21358  	jne	.LBB3_768  # repeat until index == count
 21359  	jmp	.LBB3_923
 21360  .LBB3_149:  # arm: copies 16-bit elements from the rdx buffer to the rcx buffer unchanged
 21361  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21362  	jle	.LBB3_923
 21363  # %bb.150:
 21364  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21365  	cmp	r8d, 16
 21366  	jb	.LBB3_151  # fewer than 16 elements: scalar code only
 21367  # %bb.296:
 21368  	lea	rax, [rdx + 2*r9]  # rax = end of the source range (rdx + 2*count)
 21369  	cmp	rax, rcx
 21370  	jbe	.LBB3_453  # no overlap (source ends first). NOTE(review): .LBB3_453 is outside this excerpt, presumably the vectorized path - confirm
 21371  # %bb.297:
 21372  	lea	rax, [rcx + 2*r9]  # rax = end of the destination range (rcx + 2*count)
 21373  	cmp	rax, rdx
 21374  	jbe	.LBB3_453  # no overlap (destination ends first)
 21375  .LBB3_151:  # ranges overlap, or count < 16: start the scalar code at index 0
 21376  	xor	esi, esi  # rsi = element index
 21377  .LBB3_576:
 21378  	mov	r8, rsi
 21379  	not	r8
 21380  	add	r8, r9  # r8 = r9 - rsi - 1
 21381  	mov	rdi, r9
 21382  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 21383  	je	.LBB3_578
 21384  .LBB3_577:                              # =>This Inner Loop Header: Depth=1
 21385  	movzx	eax, word ptr [rdx + 2*rsi]  # plain copy of one 16-bit element
 21386  	mov	word ptr [rcx + 2*rsi], ax
 21387  	add	rsi, 1
 21388  	add	rdi, -1
 21389  	jne	.LBB3_577
 21390  .LBB3_578:
 21391  	cmp	r8, 3
 21392  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 21393  .LBB3_579:                              # =>This Inner Loop Header: Depth=1
 21394  	movzx	eax, word ptr [rdx + 2*rsi]  # same copy, four elements per iteration
 21395  	mov	word ptr [rcx + 2*rsi], ax
 21396  	movzx	eax, word ptr [rdx + 2*rsi + 2]
 21397  	mov	word ptr [rcx + 2*rsi + 2], ax
 21398  	movzx	eax, word ptr [rdx + 2*rsi + 4]
 21399  	mov	word ptr [rcx + 2*rsi + 4], ax
 21400  	movzx	eax, word ptr [rdx + 2*rsi + 6]
 21401  	mov	word ptr [rcx + 2*rsi + 6], ax
 21402  	add	rsi, 4
 21403  	cmp	r9, rsi
 21404  	jne	.LBB3_579  # repeat until index == count
 21405  	jmp	.LBB3_923
 21406  .LBB3_152:  # arm: writes the absolute value of each 16-bit element taken as signed
 21407  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21408  	jle	.LBB3_923
 21409  # %bb.153:
 21410  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21411  	cmp	r8d, 8
 21412  	jb	.LBB3_154  # fewer than 8 elements: scalar code only
 21413  # %bb.299:
 21414  	lea	rax, [rdx + 2*r9]  # rax = end of the source range (rdx + 2*count)
 21415  	cmp	rax, rcx
 21416  	jbe	.LBB3_455  # no overlap (source ends first). NOTE(review): .LBB3_455 is outside this excerpt, presumably the vectorized path - confirm
 21417  # %bb.300:
 21418  	lea	rax, [rcx + 2*r9]  # rax = end of the destination range (rcx + 2*count)
 21419  	cmp	rax, rdx
 21420  	jbe	.LBB3_455  # no overlap (destination ends first)
 21421  .LBB3_154:  # ranges overlap, or count < 8: start the scalar code at index 0
 21422  	xor	esi, esi  # rsi = element index
 21423  .LBB3_773:
 21424  	mov	rax, rsi
 21425  	not	rax  # rax = -(rsi + 1)
 21426  	test	r9b, 1
 21427  	je	.LBB3_775  # even count: no single leading element to handle
 21428  # %bb.774:
 21429  	movsx	edi, word ptr [rdx + 2*rsi]
 21430  	mov	r8d, edi
 21431  	sar	r8d, 15  # r8d = -1 for a negative element, 0 otherwise
 21432  	add	edi, r8d
 21433  	xor	edi, r8d  # (x + m) ^ m = |x|; only the low 16 bits are stored, so -32768 is written back as 0x8000
 21434  	mov	word ptr [rcx + 2*rsi], di
 21435  	or	rsi, 1
 21436  .LBB3_775:
 21437  	add	rax, r9  # rax = r9 - 1 - (rsi as it was at .LBB3_773)
 21438  	je	.LBB3_923  # zero: leave without entering the 2x loop
 21439  .LBB3_776:                              # =>This Inner Loop Header: Depth=1
 21440  	movsx	eax, word ptr [rdx + 2*rsi]
 21441  	mov	edi, eax
 21442  	sar	edi, 15  # edi = sign mask (-1 or 0)
 21443  	add	eax, edi
 21444  	xor	eax, edi  # absolute value via (x + m) ^ m
 21445  	mov	word ptr [rcx + 2*rsi], ax
 21446  	movsx	eax, word ptr [rdx + 2*rsi + 2]
 21447  	mov	edi, eax
 21448  	sar	edi, 15
 21449  	add	eax, edi
 21450  	xor	eax, edi
 21451  	mov	word ptr [rcx + 2*rsi + 2], ax
 21452  	add	rsi, 2
 21453  	cmp	r9, rsi
 21454  	jne	.LBB3_776  # repeat until index == count
 21455  	jmp	.LBB3_923
 21456  .LBB3_155:  # arm: copies 16-bit elements from the rdx buffer to the rcx buffer unchanged
 21457  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21458  	jle	.LBB3_923
 21459  # %bb.156:
 21460  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21461  	cmp	r8d, 16
 21462  	jb	.LBB3_157  # fewer than 16 elements: scalar code only
 21463  # %bb.302:
 21464  	lea	rax, [rdx + 2*r9]  # rax = end of the source range (rdx + 2*count)
 21465  	cmp	rax, rcx
 21466  	jbe	.LBB3_458  # no overlap (source ends first). NOTE(review): .LBB3_458 is outside this excerpt, presumably the vectorized path - confirm
 21467  # %bb.303:
 21468  	lea	rax, [rcx + 2*r9]  # rax = end of the destination range (rcx + 2*count)
 21469  	cmp	rax, rdx
 21470  	jbe	.LBB3_458  # no overlap (destination ends first)
 21471  .LBB3_157:  # ranges overlap, or count < 16: start the scalar code at index 0
 21472  	xor	esi, esi  # rsi = element index
 21473  .LBB3_586:
 21474  	mov	r8, rsi
 21475  	not	r8
 21476  	add	r8, r9  # r8 = r9 - rsi - 1
 21477  	mov	rdi, r9
 21478  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 21479  	je	.LBB3_588
 21480  .LBB3_587:                              # =>This Inner Loop Header: Depth=1
 21481  	movzx	eax, word ptr [rdx + 2*rsi]  # plain copy of one 16-bit element
 21482  	mov	word ptr [rcx + 2*rsi], ax
 21483  	add	rsi, 1
 21484  	add	rdi, -1
 21485  	jne	.LBB3_587
 21486  .LBB3_588:
 21487  	cmp	r8, 3
 21488  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 21489  .LBB3_589:                              # =>This Inner Loop Header: Depth=1
 21490  	movzx	eax, word ptr [rdx + 2*rsi]  # same copy, four elements per iteration
 21491  	mov	word ptr [rcx + 2*rsi], ax
 21492  	movzx	eax, word ptr [rdx + 2*rsi + 2]
 21493  	mov	word ptr [rcx + 2*rsi + 2], ax
 21494  	movzx	eax, word ptr [rdx + 2*rsi + 4]
 21495  	mov	word ptr [rcx + 2*rsi + 4], ax
 21496  	movzx	eax, word ptr [rdx + 2*rsi + 6]
 21497  	mov	word ptr [rcx + 2*rsi + 6], ax
 21498  	add	rsi, 4
 21499  	cmp	r9, rsi
 21500  	jne	.LBB3_589  # repeat until index == count
 21501  	jmp	.LBB3_923
 21502  .LBB3_158:  # arm: writes the absolute value of each 16-bit element taken as signed
 21503  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21504  	jle	.LBB3_923
 21505  # %bb.159:
 21506  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21507  	cmp	r8d, 8
 21508  	jb	.LBB3_160  # fewer than 8 elements: scalar code only
 21509  # %bb.305:
 21510  	lea	rax, [rdx + 2*r9]  # rax = end of the source range (rdx + 2*count)
 21511  	cmp	rax, rcx
 21512  	jbe	.LBB3_460  # no overlap (source ends first). NOTE(review): .LBB3_460 is outside this excerpt, presumably the vectorized path - confirm
 21513  # %bb.306:
 21514  	lea	rax, [rcx + 2*r9]  # rax = end of the destination range (rcx + 2*count)
 21515  	cmp	rax, rdx
 21516  	jbe	.LBB3_460  # no overlap (destination ends first)
 21517  .LBB3_160:  # ranges overlap, or count < 8: start the scalar code at index 0
 21518  	xor	esi, esi  # rsi = element index
 21519  .LBB3_781:
 21520  	mov	rax, rsi
 21521  	not	rax  # rax = -(rsi + 1)
 21522  	test	r9b, 1
 21523  	je	.LBB3_783  # even count: no single leading element to handle
 21524  # %bb.782:
 21525  	movsx	edi, word ptr [rdx + 2*rsi]
 21526  	mov	r8d, edi
 21527  	sar	r8d, 15  # r8d = -1 for a negative element, 0 otherwise
 21528  	add	edi, r8d
 21529  	xor	edi, r8d  # (x + m) ^ m = |x|; only the low 16 bits are stored, so -32768 is written back as 0x8000
 21530  	mov	word ptr [rcx + 2*rsi], di
 21531  	or	rsi, 1
 21532  .LBB3_783:
 21533  	add	rax, r9  # rax = r9 - 1 - (rsi as it was at .LBB3_781)
 21534  	je	.LBB3_923  # zero: leave without entering the 2x loop
 21535  .LBB3_784:                              # =>This Inner Loop Header: Depth=1
 21536  	movsx	eax, word ptr [rdx + 2*rsi]
 21537  	mov	edi, eax
 21538  	sar	edi, 15  # edi = sign mask (-1 or 0)
 21539  	add	eax, edi
 21540  	xor	eax, edi  # absolute value via (x + m) ^ m
 21541  	mov	word ptr [rcx + 2*rsi], ax
 21542  	movsx	eax, word ptr [rdx + 2*rsi + 2]
 21543  	mov	edi, eax
 21544  	sar	edi, 15
 21545  	add	eax, edi
 21546  	xor	eax, edi
 21547  	mov	word ptr [rcx + 2*rsi + 2], ax
 21548  	add	rsi, 2
 21549  	cmp	r9, rsi
 21550  	jne	.LBB3_784  # repeat until index == count
 21551  	jmp	.LBB3_923
 21552  .LBB3_161:  # arm: writes 0 - x for each 64-bit element read through rdx, storing through rcx
 21553  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21554  	jle	.LBB3_923
 21555  # %bb.162:
 21556  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21557  	cmp	r8d, 4
 21558  	jb	.LBB3_163  # fewer than 4 elements: scalar code only
 21559  # %bb.308:
 21560  	lea	rax, [rdx + 8*r9]  # rax = end of the source range (rdx + 8*count)
 21561  	cmp	rax, rcx
 21562  	jbe	.LBB3_463  # no overlap (source ends first). NOTE(review): .LBB3_463 is outside this excerpt, presumably the vectorized path - confirm
 21563  # %bb.309:
 21564  	lea	rax, [rcx + 8*r9]  # rax = end of the destination range (rcx + 8*count)
 21565  	cmp	rax, rdx
 21566  	jbe	.LBB3_463  # no overlap (destination ends first)
 21567  .LBB3_163:  # ranges overlap, or count < 4: start the scalar code at index 0
 21568  	xor	esi, esi  # rsi = element index
 21569  .LBB3_789:
 21570  	mov	r8, rsi
 21571  	not	r8
 21572  	add	r8, r9  # r8 = r9 - rsi - 1
 21573  	mov	rdi, r9
 21574  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 21575  	je	.LBB3_791
 21576  .LBB3_790:                              # =>This Inner Loop Header: Depth=1
 21577  	xor	eax, eax
 21578  	sub	rax, qword ptr [rdx + 8*rsi]  # rax = 0 - element (64-bit wrap-around negate)
 21579  	mov	qword ptr [rcx + 8*rsi], rax
 21580  	add	rsi, 1
 21581  	add	rdi, -1
 21582  	jne	.LBB3_790
 21583  .LBB3_791:
 21584  	cmp	r8, 3
 21585  	jb	.LBB3_923  # r8 < 3: skip the 4x unrolled loop
 21586  .LBB3_792:                              # =>This Inner Loop Header: Depth=1
 21587  	xor	eax, eax
 21588  	sub	rax, qword ptr [rdx + 8*rsi]  # same negate, four elements per iteration
 21589  	mov	qword ptr [rcx + 8*rsi], rax
 21590  	xor	eax, eax
 21591  	sub	rax, qword ptr [rdx + 8*rsi + 8]
 21592  	mov	qword ptr [rcx + 8*rsi + 8], rax
 21593  	xor	eax, eax
 21594  	sub	rax, qword ptr [rdx + 8*rsi + 16]
 21595  	mov	qword ptr [rcx + 8*rsi + 16], rax
 21596  	xor	eax, eax
 21597  	sub	rax, qword ptr [rdx + 8*rsi + 24]
 21598  	mov	qword ptr [rcx + 8*rsi + 24], rax
 21599  	add	rsi, 4
 21600  	cmp	r9, rsi
 21601  	jne	.LBB3_792  # repeat until index == count
 21602  	jmp	.LBB3_923
 21603  .LBB3_164:  # arm: XORs each 32-bit scalar loaded with movss against the .LCPI3_7 constant and stores the result
 21604  	test	r8d, r8d  # r8d = element count (signed); leave when it is <= 0
 21605  	jle	.LBB3_923
 21606  # %bb.165:
 21607  	mov	r9d, r8d  # r9 = count, zero-extended to 64 bits
 21608  	cmp	r8d, 8
 21609  	jb	.LBB3_166  # fewer than 8 elements: scalar code only
 21610  # %bb.311:
 21611  	lea	rax, [rdx + 4*r9]  # rax = end of the source range (rdx + 4*count)
 21612  	cmp	rax, rcx
 21613  	jbe	.LBB3_466  # no overlap (source ends first). NOTE(review): .LBB3_466 is outside this excerpt, presumably the vectorized path - confirm
 21614  # %bb.312:
 21615  	lea	rax, [rcx + 4*r9]  # rax = end of the destination range (rcx + 4*count)
 21616  	cmp	rax, rdx
 21617  	jbe	.LBB3_466  # no overlap (destination ends first)
 21618  .LBB3_166:  # ranges overlap, or count < 8: start the scalar code at index 0
 21619  	xor	esi, esi  # rsi = element index
 21620  .LBB3_797:
 21621  	mov	rax, rsi
 21622  	not	rax
 21623  	add	rax, r9  # rax = r9 - rsi - 1
 21624  	mov	rdi, r9
 21625  	and	rdi, 3  # rdi = count & 3: iterations of the one-element loop below
 21626  	je	.LBB3_800
 21627  # %bb.798:
 21628  	movapd	xmm0, xmmword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 21629  .LBB3_799:                              # =>This Inner Loop Header: Depth=1
 21630  	movss	xmm1, dword ptr [rdx + 4*rsi]   # xmm1 = mem[0],zero,zero,zero
 21631  	xorpd	xmm1, xmm0  # XOR with the -0.0 pattern flips the sign bit; only the low 32 bits are stored below
 21632  	movss	dword ptr [rcx + 4*rsi], xmm1
 21633  	add	rsi, 1
 21634  	add	rdi, -1
 21635  	jne	.LBB3_799
 21636  .LBB3_800:
 21637  	cmp	rax, 3
 21638  	jb	.LBB3_923  # rax < 3: skip the 4x unrolled loop
 21639  # %bb.801:
 21640  	movapd	xmm0, xmmword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 21641  .LBB3_802:                              # =>This Inner Loop Header: Depth=1
 21642  	movss	xmm1, dword ptr [rdx + 4*rsi]   # xmm1 = mem[0],zero,zero,zero
 21643  	xorpd	xmm1, xmm0  # same sign-bit flip, four elements per iteration
 21644  	movss	dword ptr [rcx + 4*rsi], xmm1
 21645  	movss	xmm1, dword ptr [rdx + 4*rsi + 4] # xmm1 = mem[0],zero,zero,zero
 21646  	xorpd	xmm1, xmm0
 21647  	movss	dword ptr [rcx + 4*rsi + 4], xmm1
 21648  	movss	xmm1, dword ptr [rdx + 4*rsi + 8] # xmm1 = mem[0],zero,zero,zero
 21649  	xorpd	xmm1, xmm0
 21650  	movss	dword ptr [rcx + 4*rsi + 8], xmm1
 21651  	movss	xmm1, dword ptr [rdx + 4*rsi + 12] # xmm1 = mem[0],zero,zero,zero
 21652  	xorpd	xmm1, xmm0
 21653  	movss	dword ptr [rcx + 4*rsi + 12], xmm1
 21654  	add	rsi, 4
 21655  	cmp	r9, rsi
 21656  	jne	.LBB3_802  # repeat until index == count
 21657  	jmp	.LBB3_923
 21658  .LBB3_167:   # Case: out[i] = 0 - in[i] on 64-bit integers; reads [rdx], writes [rcx], element count in r8d
 21659  	test	r8d, r8d
 21660  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 21661  # %bb.168:
 21662  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 21663  	cmp	r8d, 4
 21664  	jb	.LBB3_169   # fewer than 4 elements: take the scalar path
 21665  # %bb.314:
 21666  	lea	rax, [rdx + 8*r9]   # rax = one past the last input element
 21667  	cmp	rax, rcx
 21668  	jbe	.LBB3_469   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 21669  # %bb.315:
 21670  	lea	rax, [rcx + 8*r9]   # rax = one past the last output element
 21671  	cmp	rax, rdx
 21672  	jbe	.LBB3_469   # output ends at or before the input start: no overlap
 21673  .LBB3_169:   # count < 4 or the buffers overlap: scalar processing from index 0
 21674  	xor	esi, esi
 21675  .LBB3_807:   # rsi = index of the next element to process
 21676  	mov	r8, rsi
 21677  	not	r8
 21678  	add	r8, r9   # r8 = r9 - rsi - 1 (the count in r8d is no longer needed)
 21679  	mov	rdi, r9
 21680  	and	rdi, 3   # rdi = count & 3 = iterations of the one-element loop
 21681  	je	.LBB3_809
 21682  .LBB3_808:                              # =>This Inner Loop Header: Depth=1
 21683  	xor	eax, eax
 21684  	sub	rax, qword ptr [rdx + 8*rsi]   # rax = 0 - in[rsi]
 21685  	mov	qword ptr [rcx + 8*rsi], rax
 21686  	add	rsi, 1
 21687  	add	rdi, -1
 21688  	jne	.LBB3_808
 21689  .LBB3_809:
 21690  	cmp	r8, 3   # r8 still holds r9 - (entry rsi) - 1
 21691  	jb	.LBB3_923   # fewer than 4 elements were pending at entry: skip the unrolled loop
 21692  .LBB3_810:                              # =>This Inner Loop Header: Depth=1
 21693  	xor	eax, eax
 21694  	sub	rax, qword ptr [rdx + 8*rsi]
 21695  	mov	qword ptr [rcx + 8*rsi], rax
 21696  	xor	eax, eax
 21697  	sub	rax, qword ptr [rdx + 8*rsi + 8]
 21698  	mov	qword ptr [rcx + 8*rsi + 8], rax
 21699  	xor	eax, eax
 21700  	sub	rax, qword ptr [rdx + 8*rsi + 16]
 21701  	mov	qword ptr [rcx + 8*rsi + 16], rax
 21702  	xor	eax, eax
 21703  	sub	rax, qword ptr [rdx + 8*rsi + 24]
 21704  	mov	qword ptr [rcx + 8*rsi + 24], rax
 21705  	add	rsi, 4   # four elements handled per iteration
 21706  	cmp	r9, rsi
 21707  	jne	.LBB3_810   # loop until the index reaches the count
 21708  	jmp	.LBB3_923
 21709  .LBB3_170:   # Case: flip the sign bit of each 32-bit float; reads [rdx], writes [rcx], element count in r8d
 21710  	test	r8d, r8d
 21711  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 21712  # %bb.171:
 21713  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 21714  	cmp	r8d, 8
 21715  	jb	.LBB3_172   # fewer than 8 elements: take the scalar path
 21716  # %bb.317:
 21717  	lea	rax, [rdx + 4*r9]   # rax = one past the last input element
 21718  	cmp	rax, rcx
 21719  	jbe	.LBB3_472   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 21720  # %bb.318:
 21721  	lea	rax, [rcx + 4*r9]   # rax = one past the last output element
 21722  	cmp	rax, rdx
 21723  	jbe	.LBB3_472   # output ends at or before the input start: no overlap
 21724  .LBB3_172:   # count < 8 or the buffers overlap: scalar processing from index 0
 21725  	xor	esi, esi
 21726  .LBB3_815:   # rsi = index of the next element to process
 21727  	mov	rax, rsi
 21728  	not	rax
 21729  	add	rax, r9   # rax = r9 - rsi - 1
 21730  	mov	rdi, r9
 21731  	and	rdi, 3   # rdi = count & 3 = iterations of the one-element loop
 21732  	je	.LBB3_818   # count is a multiple of 4: skip the one-element loop
 21733  # %bb.816:
 21734  	movapd	xmm0, xmmword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 21735  .LBB3_817:                              # =>This Inner Loop Header: Depth=1
 21736  	movss	xmm1, dword ptr [rdx + 4*rsi]   # xmm1 = mem[0],zero,zero,zero
 21737  	xorpd	xmm1, xmm0   # XOR with the -0.0 mask toggles only the sign bit
 21738  	movss	dword ptr [rcx + 4*rsi], xmm1
 21739  	add	rsi, 1
 21740  	add	rdi, -1
 21741  	jne	.LBB3_817
 21742  .LBB3_818:
 21743  	cmp	rax, 3   # rax still holds r9 - (entry rsi) - 1
 21744  	jb	.LBB3_923   # fewer than 4 elements were pending at entry: skip the unrolled loop
 21745  # %bb.819:
 21746  	movapd	xmm0, xmmword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 21747  .LBB3_820:                              # =>This Inner Loop Header: Depth=1
 21748  	movss	xmm1, dword ptr [rdx + 4*rsi]   # xmm1 = mem[0],zero,zero,zero
 21749  	xorpd	xmm1, xmm0
 21750  	movss	dword ptr [rcx + 4*rsi], xmm1
 21751  	movss	xmm1, dword ptr [rdx + 4*rsi + 4] # xmm1 = mem[0],zero,zero,zero
 21752  	xorpd	xmm1, xmm0
 21753  	movss	dword ptr [rcx + 4*rsi + 4], xmm1
 21754  	movss	xmm1, dword ptr [rdx + 4*rsi + 8] # xmm1 = mem[0],zero,zero,zero
 21755  	xorpd	xmm1, xmm0
 21756  	movss	dword ptr [rcx + 4*rsi + 8], xmm1
 21757  	movss	xmm1, dword ptr [rdx + 4*rsi + 12] # xmm1 = mem[0],zero,zero,zero
 21758  	xorpd	xmm1, xmm0
 21759  	movss	dword ptr [rcx + 4*rsi + 12], xmm1
 21760  	add	rsi, 4   # four elements handled per iteration
 21761  	cmp	r9, rsi
 21762  	jne	.LBB3_820   # loop until the index reaches the count
 21763  	jmp	.LBB3_923
 21764  .LBB3_173:   # Case: sign of signed 64-bit integers (1 if x > 0, -1 if x < 0, 0 if x == 0); reads [rdx], writes [rcx], count in r8d
 21765  	test	r8d, r8d
 21766  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 21767  # %bb.174:
 21768  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 21769  	cmp	r8d, 4
 21770  	jb	.LBB3_175   # fewer than 4 elements: take the scalar path
 21771  # %bb.320:
 21772  	lea	rax, [rdx + 8*r9]   # rax = one past the last input element
 21773  	cmp	rax, rcx
 21774  	jbe	.LBB3_475   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 21775  # %bb.321:
 21776  	lea	rax, [rcx + 8*r9]   # rax = one past the last output element
 21777  	cmp	rax, rdx
 21778  	jbe	.LBB3_475   # output ends at or before the input start: no overlap
 21779  .LBB3_175:   # count < 4 or the buffers overlap: scalar processing from index 0
 21780  	xor	esi, esi
 21781  .LBB3_825:   # rsi = index of the next element to process
 21782  	mov	rax, rsi
 21783  	not	rax   # rax = -rsi - 1; r9 is added at .LBB3_827
 21784  	test	r9b, 1
 21785  	je	.LBB3_827   # even count: no single leading element to handle
 21786  # %bb.826:
 21787  	mov	r8, qword ptr [rdx + 8*rsi]   # r8 = x
 21788  	xor	r10d, r10d
 21789  	test	r8, r8
 21790  	setne	r10b
 21791  	neg	r10   # r10 = -1 if x != 0, else 0
 21792  	test	r8, r8
 21793  	mov	edi, 1
 21794  	cmovle	rdi, r10   # keep 1 only when x > 0 (signed); otherwise use r10
 21795  	mov	qword ptr [rcx + 8*rsi], rdi
 21796  	or	rsi, 1   # sets bit 0 of the index (advances by one when rsi was even)
 21797  .LBB3_827:
 21798  	add	rax, r9   # rax = r9 - (entry rsi) - 1
 21799  	je	.LBB3_923   # zero: only one element was pending at entry, so we are done
 21800  # %bb.828:
 21801  	mov	r8d, 1   # r8 = constant 1 used by the cmovg instructions below
 21802  .LBB3_829:                              # =>This Inner Loop Header: Depth=1
 21803  	mov	rdi, qword ptr [rdx + 8*rsi]
 21804  	xor	eax, eax
 21805  	test	rdi, rdi
 21806  	setne	al
 21807  	neg	rax   # rax = -1 if x != 0, else 0
 21808  	test	rdi, rdi
 21809  	cmovg	rax, r8   # x > 0 (signed): result becomes 1
 21810  	mov	qword ptr [rcx + 8*rsi], rax
 21811  	mov	rax, qword ptr [rdx + 8*rsi + 8]   # second element of the pair, same computation
 21812  	xor	edi, edi
 21813  	test	rax, rax
 21814  	setne	dil
 21815  	neg	rdi
 21816  	test	rax, rax
 21817  	cmovg	rdi, r8
 21818  	mov	qword ptr [rcx + 8*rsi + 8], rdi
 21819  	add	rsi, 2   # two elements handled per iteration
 21820  	cmp	r9, rsi
 21821  	jne	.LBB3_829   # loop until the index reaches the count
 21822  	jmp	.LBB3_923
 21823  .LBB3_176:   # Case: sign of 32-bit floats (0.0 when x == 0, otherwise +1.0 or -1.0 from the sign bit); reads [rdx], writes [rcx], count in r8d
 21824  	test	r8d, r8d
 21825  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 21826  # %bb.177:
 21827  	mov	eax, r8d   # rax = count, zero-extended to 64 bits
 21828  	cmp	r8d, 8
 21829  	jb	.LBB3_178   # fewer than 8 elements: take the scalar path
 21830  # %bb.323:
 21831  	lea	rsi, [rdx + 4*rax]   # rsi = one past the last input element
 21832  	cmp	rsi, rcx
 21833  	jbe	.LBB3_478   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 21834  # %bb.324:
 21835  	lea	rsi, [rcx + 4*rax]   # rsi = one past the last output element
 21836  	cmp	rsi, rdx
 21837  	jbe	.LBB3_478   # output ends at or before the input start: no overlap
 21838  .LBB3_178:   # count < 8 or the buffers overlap: scalar processing from index 0
 21839  	xor	esi, esi
 21840  .LBB3_481:   # rsi = index of the next element to process
 21841  	mov	r8, rsi
 21842  	not	r8   # r8 = -rsi - 1; the count is added at .LBB3_483
 21843  	test	al, 1
 21844  	je	.LBB3_483   # even count: no single leading element to handle
 21845  # %bb.482:
 21846  	movss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
 21847  	movmskps	edi, xmm0   # bit 0 of edi = sign bit of x
 21848  	and	edi, 1
 21849  	neg	edi   # 0 or -1
 21850  	or	edi, 1   # +1 for a clear sign bit, -1 for a set sign bit
 21851  	xorps	xmm1, xmm1
 21852  	cvtsi2ss	xmm1, edi   # xmm1 = +1.0 or -1.0
 21853  	xorps	xmm2, xmm2
 21854  	cmpeqss	xmm2, xmm0   # all-ones mask when x == 0.0 (false for NaN)
 21855  	andnps	xmm2, xmm1   # (~mask) & (+/-1.0): zero when x == 0, otherwise +/-1.0
 21856  	movss	dword ptr [rcx + 4*rsi], xmm2
 21857  	or	rsi, 1   # sets bit 0 of the index (advances by one when rsi was even)
 21858  .LBB3_483:
 21859  	add	r8, rax   # r8 = count - (entry rsi) - 1
 21860  	je	.LBB3_923   # zero: only one element was pending at entry, so we are done
 21861  # %bb.484:
 21862  	xorps	xmm0, xmm0   # xmm0 = 0.0, the comparison operand for the loop
 21863  .LBB3_485:                              # =>This Inner Loop Header: Depth=1
 21864  	movss	xmm1, dword ptr [rdx + 4*rsi]   # xmm1 = mem[0],zero,zero,zero
 21865  	movmskps	edi, xmm1
 21866  	and	edi, 1
 21867  	neg	edi
 21868  	or	edi, 1   # +1 or -1 according to the sign bit of x
 21869  	xorps	xmm2, xmm2
 21870  	cvtsi2ss	xmm2, edi
 21871  	cmpeqss	xmm1, xmm0   # xmm1 = mask of (x == 0.0)
 21872  	andnps	xmm1, xmm2   # zero when x == 0, otherwise +/-1.0
 21873  	movss	dword ptr [rcx + 4*rsi], xmm1
 21874  	movss	xmm1, dword ptr [rdx + 4*rsi + 4] # xmm1 = mem[0],zero,zero,zero
 21875  	movmskps	edi, xmm1
 21876  	and	edi, 1
 21877  	neg	edi
 21878  	or	edi, 1
 21879  	xorps	xmm2, xmm2
 21880  	cvtsi2ss	xmm2, edi
 21881  	cmpeqss	xmm1, xmm0
 21882  	andnps	xmm1, xmm2
 21883  	movss	dword ptr [rcx + 4*rsi + 4], xmm1
 21884  	add	rsi, 2   # two elements handled per iteration
 21885  	cmp	rax, rsi
 21886  	jne	.LBB3_485   # loop until the index reaches the count
 21887  	jmp	.LBB3_923
 21888  .LBB3_179:   # Case: absolute value of signed 64-bit integers; reads [rdx], writes [rcx], element count in r8d
 21889  	test	r8d, r8d
 21890  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 21891  # %bb.180:
 21892  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 21893  	cmp	r8d, 4
 21894  	jb	.LBB3_181   # fewer than 4 elements: take the scalar path
 21895  # %bb.326:
 21896  	lea	rax, [rdx + 8*r9]   # rax = one past the last input element
 21897  	cmp	rax, rcx
 21898  	jbe	.LBB3_486   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 21899  # %bb.327:
 21900  	lea	rax, [rcx + 8*r9]   # rax = one past the last output element
 21901  	cmp	rax, rdx
 21902  	jbe	.LBB3_486   # output ends at or before the input start: no overlap
 21903  .LBB3_181:   # count < 4 or the buffers overlap: scalar processing from index 0
 21904  	xor	esi, esi
 21905  .LBB3_834:   # rsi = index of the next element to process
 21906  	mov	rax, rsi
 21907  	not	rax   # rax = -rsi - 1; r9 is added at .LBB3_836
 21908  	test	r9b, 1
 21909  	je	.LBB3_836   # even count: no single leading element to handle
 21910  # %bb.835:
 21911  	mov	r8, qword ptr [rdx + 8*rsi]   # r8 = x
 21912  	mov	rdi, r8
 21913  	neg	rdi   # rdi = -x, flags set from the negation
 21914  	cmovl	rdi, r8   # -x is negative (x > 0): keep x; the minimum value wraps to itself
 21915  	mov	qword ptr [rcx + 8*rsi], rdi
 21916  	or	rsi, 1   # sets bit 0 of the index (advances by one when rsi was even)
 21917  .LBB3_836:
 21918  	add	rax, r9   # rax = r9 - (entry rsi) - 1
 21919  	je	.LBB3_923   # zero: only one element was pending at entry, so we are done
 21920  .LBB3_837:                              # =>This Inner Loop Header: Depth=1
 21921  	mov	rax, qword ptr [rdx + 8*rsi]
 21922  	mov	rdi, rax
 21923  	neg	rdi
 21924  	cmovl	rdi, rax   # pick x when -x is negative, otherwise -x
 21925  	mov	qword ptr [rcx + 8*rsi], rdi
 21926  	mov	rax, qword ptr [rdx + 8*rsi + 8]
 21927  	mov	rdi, rax
 21928  	neg	rdi
 21929  	cmovl	rdi, rax
 21930  	mov	qword ptr [rcx + 8*rsi + 8], rdi
 21931  	add	rsi, 2   # two elements handled per iteration
 21932  	cmp	r9, rsi
 21933  	jne	.LBB3_837   # loop until the index reaches the count
 21934  	jmp	.LBB3_923
 21935  .LBB3_182:   # Case: clear bit 31 of each 32-bit element (presumably float32 absolute value); reads [rdx], writes [rcx], count in r8d
 21936  	test	r8d, r8d
 21937  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 21938  # %bb.183:
 21939  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 21940  	cmp	r8d, 8
 21941  	jb	.LBB3_184   # fewer than 8 elements: take the scalar path
 21942  # %bb.329:
 21943  	lea	rax, [rdx + 4*r9]   # rax = one past the last input element
 21944  	cmp	rax, rcx
 21945  	jbe	.LBB3_489   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 21946  # %bb.330:
 21947  	lea	rax, [rcx + 4*r9]   # rax = one past the last output element
 21948  	cmp	rax, rdx
 21949  	jbe	.LBB3_489   # output ends at or before the input start: no overlap
 21950  .LBB3_184:   # count < 8 or the buffers overlap: scalar processing from index 0
 21951  	xor	esi, esi
 21952  .LBB3_842:   # rsi = index of the next element to process
 21953  	mov	r8, rsi
 21954  	not	r8
 21955  	add	r8, r9   # r8 = r9 - rsi - 1
 21956  	mov	rdi, r9
 21957  	and	rdi, 3   # rdi = count & 3 = iterations of the one-element loop
 21958  	je	.LBB3_845
 21959  # %bb.843:
 21960  	mov	r10d, 2147483647   # 0x7FFFFFFF: every bit set except bit 31
 21961  .LBB3_844:                              # =>This Inner Loop Header: Depth=1
 21962  	mov	eax, dword ptr [rdx + 4*rsi]
 21963  	and	eax, r10d   # drop bit 31, keep the remaining 31 bits
 21964  	mov	dword ptr [rcx + 4*rsi], eax
 21965  	add	rsi, 1
 21966  	add	rdi, -1
 21967  	jne	.LBB3_844
 21968  .LBB3_845:
 21969  	cmp	r8, 3   # r8 still holds r9 - (entry rsi) - 1
 21970  	jb	.LBB3_923   # fewer than 4 elements were pending at entry: skip the unrolled loop
 21971  # %bb.846:
 21972  	mov	eax, 2147483647   # same 0x7FFFFFFF mask, now kept in eax
 21973  .LBB3_847:                              # =>This Inner Loop Header: Depth=1
 21974  	mov	edi, dword ptr [rdx + 4*rsi]
 21975  	and	edi, eax
 21976  	mov	dword ptr [rcx + 4*rsi], edi
 21977  	mov	edi, dword ptr [rdx + 4*rsi + 4]
 21978  	and	edi, eax
 21979  	mov	dword ptr [rcx + 4*rsi + 4], edi
 21980  	mov	edi, dword ptr [rdx + 4*rsi + 8]
 21981  	and	edi, eax
 21982  	mov	dword ptr [rcx + 4*rsi + 8], edi
 21983  	mov	edi, dword ptr [rdx + 4*rsi + 12]
 21984  	and	edi, eax
 21985  	mov	dword ptr [rcx + 4*rsi + 12], edi
 21986  	add	rsi, 4   # four elements handled per iteration
 21987  	cmp	r9, rsi
 21988  	jne	.LBB3_847   # loop until the index reaches the count
 21989  	jmp	.LBB3_923
 21990  .LBB3_185:   # Case: absolute value of signed 64-bit integers; reads [rdx], writes [rcx], element count in r8d
 21991  	test	r8d, r8d
 21992  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 21993  # %bb.186:
 21994  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 21995  	cmp	r8d, 4
 21996  	jb	.LBB3_187   # fewer than 4 elements: take the scalar path
 21997  # %bb.332:
 21998  	lea	rax, [rdx + 8*r9]   # rax = one past the last input element
 21999  	cmp	rax, rcx
 22000  	jbe	.LBB3_492   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 22001  # %bb.333:
 22002  	lea	rax, [rcx + 8*r9]   # rax = one past the last output element
 22003  	cmp	rax, rdx
 22004  	jbe	.LBB3_492   # output ends at or before the input start: no overlap
 22005  .LBB3_187:   # count < 4 or the buffers overlap: scalar processing from index 0
 22006  	xor	esi, esi
 22007  .LBB3_852:   # rsi = index of the next element to process
 22008  	mov	rax, rsi
 22009  	not	rax   # rax = -rsi - 1; r9 is added at .LBB3_854
 22010  	test	r9b, 1
 22011  	je	.LBB3_854   # even count: no single leading element to handle
 22012  # %bb.853:
 22013  	mov	r8, qword ptr [rdx + 8*rsi]   # r8 = x
 22014  	mov	rdi, r8
 22015  	neg	rdi   # rdi = -x, flags set from the negation
 22016  	cmovl	rdi, r8   # -x is negative (x > 0): keep x; the minimum value wraps to itself
 22017  	mov	qword ptr [rcx + 8*rsi], rdi
 22018  	or	rsi, 1   # sets bit 0 of the index (advances by one when rsi was even)
 22019  .LBB3_854:
 22020  	add	rax, r9   # rax = r9 - (entry rsi) - 1
 22021  	je	.LBB3_923   # zero: only one element was pending at entry, so we are done
 22022  .LBB3_855:                              # =>This Inner Loop Header: Depth=1
 22023  	mov	rax, qword ptr [rdx + 8*rsi]
 22024  	mov	rdi, rax
 22025  	neg	rdi
 22026  	cmovl	rdi, rax   # pick x when -x is negative, otherwise -x
 22027  	mov	qword ptr [rcx + 8*rsi], rdi
 22028  	mov	rax, qword ptr [rdx + 8*rsi + 8]
 22029  	mov	rdi, rax
 22030  	neg	rdi
 22031  	cmovl	rdi, rax
 22032  	mov	qword ptr [rcx + 8*rsi + 8], rdi
 22033  	add	rsi, 2   # two elements handled per iteration
 22034  	cmp	r9, rsi
 22035  	jne	.LBB3_855   # loop until the index reaches the count
 22036  	jmp	.LBB3_923
 22037  .LBB3_188:   # Case: clear bit 31 of each 32-bit element (presumably float32 absolute value); reads [rdx], writes [rcx], count in r8d
 22038  	test	r8d, r8d
 22039  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 22040  # %bb.189:
 22041  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 22042  	cmp	r8d, 8
 22043  	jb	.LBB3_190   # fewer than 8 elements: take the scalar path
 22044  # %bb.335:
 22045  	lea	rax, [rdx + 4*r9]   # rax = one past the last input element
 22046  	cmp	rax, rcx
 22047  	jbe	.LBB3_495   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 22048  # %bb.336:
 22049  	lea	rax, [rcx + 4*r9]   # rax = one past the last output element
 22050  	cmp	rax, rdx
 22051  	jbe	.LBB3_495   # output ends at or before the input start: no overlap
 22052  .LBB3_190:   # count < 8 or the buffers overlap: scalar processing from index 0
 22053  	xor	esi, esi
 22054  .LBB3_860:   # rsi = index of the next element to process
 22055  	mov	r8, rsi
 22056  	not	r8
 22057  	add	r8, r9   # r8 = r9 - rsi - 1
 22058  	mov	rdi, r9
 22059  	and	rdi, 3   # rdi = count & 3 = iterations of the one-element loop
 22060  	je	.LBB3_863
 22061  # %bb.861:
 22062  	mov	r10d, 2147483647   # 0x7FFFFFFF: every bit set except bit 31
 22063  .LBB3_862:                              # =>This Inner Loop Header: Depth=1
 22064  	mov	eax, dword ptr [rdx + 4*rsi]
 22065  	and	eax, r10d   # drop bit 31, keep the remaining 31 bits
 22066  	mov	dword ptr [rcx + 4*rsi], eax
 22067  	add	rsi, 1
 22068  	add	rdi, -1
 22069  	jne	.LBB3_862
 22070  .LBB3_863:
 22071  	cmp	r8, 3   # r8 still holds r9 - (entry rsi) - 1
 22072  	jb	.LBB3_923   # fewer than 4 elements were pending at entry: skip the unrolled loop
 22073  # %bb.864:
 22074  	mov	eax, 2147483647   # same 0x7FFFFFFF mask, now kept in eax
 22075  .LBB3_865:                              # =>This Inner Loop Header: Depth=1
 22076  	mov	edi, dword ptr [rdx + 4*rsi]
 22077  	and	edi, eax
 22078  	mov	dword ptr [rcx + 4*rsi], edi
 22079  	mov	edi, dword ptr [rdx + 4*rsi + 4]
 22080  	and	edi, eax
 22081  	mov	dword ptr [rcx + 4*rsi + 4], edi
 22082  	mov	edi, dword ptr [rdx + 4*rsi + 8]
 22083  	and	edi, eax
 22084  	mov	dword ptr [rcx + 4*rsi + 8], edi
 22085  	mov	edi, dword ptr [rdx + 4*rsi + 12]
 22086  	and	edi, eax
 22087  	mov	dword ptr [rcx + 4*rsi + 12], edi
 22088  	add	rsi, 4   # four elements handled per iteration
 22089  	cmp	r9, rsi
 22090  	jne	.LBB3_865   # loop until the index reaches the count
 22091  	jmp	.LBB3_923
 22092  .LBB3_191:   # Case stub: picks between two paths by element count in r8d; both targets are outside or later in this chunk
 22093  	test	r8d, r8d
 22094  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 22095  # %bb.192:
 22096  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 22097  	cmp	r8d, 32
 22098  	jae	.LBB3_338   # 32 or more elements: go to the 32-per-group setup at .LBB3_338
 22099  # %bb.193:
 22100  	xor	edx, edx   # rdx is overwritten with 0 before the jump (its previous value is discarded)
 22101  	jmp	.LBB3_504   # target outside this chunk
 22102  .LBB3_194:   # Case: out[i] = 0 - in[i] on single bytes; reads [rdx], writes [rcx], element count in r8d
 22103  	test	r8d, r8d
 22104  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 22105  # %bb.195:
 22106  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 22107  	cmp	r8d, 32
 22108  	jb	.LBB3_196   # fewer than 32 elements: take the scalar path
 22109  # %bb.340:
 22110  	lea	rax, [rdx + r9]   # rax = one past the last input byte
 22111  	cmp	rax, rcx
 22112  	jbe	.LBB3_505   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 22113  # %bb.341:
 22114  	lea	rax, [rcx + r9]   # rax = one past the last output byte
 22115  	cmp	rax, rdx
 22116  	jbe	.LBB3_505   # output ends at or before the input start: no overlap
 22117  .LBB3_196:   # count < 32 or the buffers overlap: scalar processing from index 0
 22118  	xor	esi, esi
 22119  .LBB3_870:   # rsi = index of the next element to process
 22120  	mov	r8, rsi
 22121  	not	r8
 22122  	add	r8, r9   # r8 = r9 - rsi - 1
 22123  	mov	rdi, r9
 22124  	and	rdi, 3   # rdi = count & 3 = iterations of the one-element loop
 22125  	je	.LBB3_872
 22126  .LBB3_871:                              # =>This Inner Loop Header: Depth=1
 22127  	movzx	r10d, byte ptr [rdx + rsi]
 22128  	xor	eax, eax
 22129  	sub	al, r10b   # al = 0 - in[rsi] (8-bit wraparound)
 22130  	mov	byte ptr [rcx + rsi], al
 22131  	add	rsi, 1
 22132  	add	rdi, -1
 22133  	jne	.LBB3_871
 22134  .LBB3_872:
 22135  	cmp	r8, 3   # r8 still holds r9 - (entry rsi) - 1
 22136  	jb	.LBB3_923   # fewer than 4 elements were pending at entry: skip the unrolled loop
 22137  .LBB3_873:                              # =>This Inner Loop Header: Depth=1
 22138  	xor	eax, eax
 22139  	sub	al, byte ptr [rdx + rsi]
 22140  	mov	byte ptr [rcx + rsi], al
 22141  	xor	eax, eax
 22142  	sub	al, byte ptr [rdx + rsi + 1]
 22143  	mov	byte ptr [rcx + rsi + 1], al
 22144  	xor	eax, eax
 22145  	sub	al, byte ptr [rdx + rsi + 2]
 22146  	mov	byte ptr [rcx + rsi + 2], al
 22147  	movzx	eax, byte ptr [rdx + rsi + 3]   # fourth byte: loaded first, then negated into dil
 22148  	xor	edi, edi
 22149  	sub	dil, al
 22150  	mov	byte ptr [rcx + rsi + 3], dil
 22151  	add	rsi, 4   # four elements handled per iteration
 22152  	cmp	r9, rsi
 22153  	jne	.LBB3_873   # loop until the index reaches the count
 22154  	jmp	.LBB3_923
 22155  .LBB3_197:   # Case: out[i] = (in[i] != 0) ? 1 : 0 on single bytes; reads [rdx], writes [rcx], element count in r8d
 22156  	test	r8d, r8d
 22157  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 22158  # %bb.198:
 22159  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 22160  	cmp	r8d, 32
 22161  	jb	.LBB3_199   # fewer than 32 elements: take the scalar path
 22162  # %bb.343:
 22163  	lea	rax, [rdx + r9]   # rax = one past the last input byte
 22164  	cmp	rax, rcx
 22165  	jbe	.LBB3_508   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 22166  # %bb.344:
 22167  	lea	rax, [rcx + r9]   # rax = one past the last output byte
 22168  	cmp	rax, rdx
 22169  	jbe	.LBB3_508   # output ends at or before the input start: no overlap
 22170  .LBB3_199:   # count < 32 or the buffers overlap: scalar processing from index 0
 22171  	xor	esi, esi
 22172  .LBB3_878:   # rsi = index of the next element to process
 22173  	mov	rax, rsi
 22174  	not	rax
 22175  	add	rax, r9   # rax = r9 - rsi - 1
 22176  	mov	rdi, r9
 22177  	and	rdi, 3   # rdi = count & 3 = iterations of the one-element loop
 22178  	je	.LBB3_880
 22179  .LBB3_879:                              # =>This Inner Loop Header: Depth=1
 22180  	cmp	byte ptr [rdx + rsi], 0
 22181  	setne	byte ptr [rcx + rsi]   # store 1 when the input byte is nonzero, else 0
 22182  	add	rsi, 1
 22183  	add	rdi, -1
 22184  	jne	.LBB3_879
 22185  .LBB3_880:
 22186  	cmp	rax, 3   # rax still holds r9 - (entry rsi) - 1
 22187  	jb	.LBB3_923   # fewer than 4 elements were pending at entry: skip the unrolled loop
 22188  .LBB3_881:                              # =>This Inner Loop Header: Depth=1
 22189  	cmp	byte ptr [rdx + rsi], 0
 22190  	setne	byte ptr [rcx + rsi]
 22191  	cmp	byte ptr [rdx + rsi + 1], 0
 22192  	setne	byte ptr [rcx + rsi + 1]
 22193  	cmp	byte ptr [rdx + rsi + 2], 0
 22194  	setne	byte ptr [rcx + rsi + 2]
 22195  	cmp	byte ptr [rdx + rsi + 3], 0
 22196  	setne	byte ptr [rcx + rsi + 3]
 22197  	add	rsi, 4   # four elements handled per iteration
 22198  	cmp	r9, rsi
 22199  	jne	.LBB3_881   # loop until the index reaches the count
 22200  	jmp	.LBB3_923
 22201  .LBB3_200:   # Case: copy bytes unchanged from [rdx] to [rcx]; element count in r8d
 22202  	test	r8d, r8d
 22203  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 22204  # %bb.201:
 22205  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 22206  	cmp	r8d, 32
 22207  	jb	.LBB3_202   # fewer than 32 elements: take the scalar path
 22208  # %bb.346:
 22209  	lea	rax, [rdx + r9]   # rax = one past the last input byte
 22210  	cmp	rax, rcx
 22211  	jbe	.LBB3_511   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 22212  # %bb.347:
 22213  	lea	rax, [rcx + r9]   # rax = one past the last output byte
 22214  	cmp	rax, rdx
 22215  	jbe	.LBB3_511   # output ends at or before the input start: no overlap
 22216  .LBB3_202:   # count < 32 or the buffers overlap: scalar processing from index 0
 22217  	xor	esi, esi
 22218  .LBB3_596:   # rsi = index of the next element to process
 22219  	mov	r8, rsi
 22220  	not	r8
 22221  	add	r8, r9   # r8 = r9 - rsi - 1
 22222  	mov	rdi, r9
 22223  	and	rdi, 3   # rdi = count & 3 = iterations of the one-element loop
 22224  	je	.LBB3_598
 22225  .LBB3_597:                              # =>This Inner Loop Header: Depth=1
 22226  	movzx	eax, byte ptr [rdx + rsi]
 22227  	mov	byte ptr [rcx + rsi], al   # the byte is stored exactly as loaded
 22228  	add	rsi, 1
 22229  	add	rdi, -1
 22230  	jne	.LBB3_597
 22231  .LBB3_598:
 22232  	cmp	r8, 3   # r8 still holds r9 - (entry rsi) - 1
 22233  	jb	.LBB3_923   # fewer than 4 elements were pending at entry: skip the unrolled loop
 22234  .LBB3_599:                              # =>This Inner Loop Header: Depth=1
 22235  	movzx	eax, byte ptr [rdx + rsi]
 22236  	mov	byte ptr [rcx + rsi], al
 22237  	movzx	eax, byte ptr [rdx + rsi + 1]
 22238  	mov	byte ptr [rcx + rsi + 1], al
 22239  	movzx	eax, byte ptr [rdx + rsi + 2]
 22240  	mov	byte ptr [rcx + rsi + 2], al
 22241  	movzx	eax, byte ptr [rdx + rsi + 3]
 22242  	mov	byte ptr [rcx + rsi + 3], al
 22243  	add	rsi, 4   # four elements handled per iteration
 22244  	cmp	r9, rsi
 22245  	jne	.LBB3_599   # loop until the index reaches the count
 22246  	jmp	.LBB3_923
 22247  .LBB3_203:   # Case: copy bytes unchanged from [rdx] to [rcx]; element count in r8d
 22248  	test	r8d, r8d
 22249  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 22250  # %bb.204:
 22251  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 22252  	cmp	r8d, 32
 22253  	jb	.LBB3_205   # fewer than 32 elements: take the scalar path
 22254  # %bb.349:
 22255  	lea	rax, [rdx + r9]   # rax = one past the last input byte
 22256  	cmp	rax, rcx
 22257  	jbe	.LBB3_513   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 22258  # %bb.350:
 22259  	lea	rax, [rcx + r9]   # rax = one past the last output byte
 22260  	cmp	rax, rdx
 22261  	jbe	.LBB3_513   # output ends at or before the input start: no overlap
 22262  .LBB3_205:   # count < 32 or the buffers overlap: scalar processing from index 0
 22263  	xor	esi, esi
 22264  .LBB3_606:   # rsi = index of the next element to process
 22265  	mov	r8, rsi
 22266  	not	r8
 22267  	add	r8, r9   # r8 = r9 - rsi - 1
 22268  	mov	rdi, r9
 22269  	and	rdi, 3   # rdi = count & 3 = iterations of the one-element loop
 22270  	je	.LBB3_608
 22271  .LBB3_607:                              # =>This Inner Loop Header: Depth=1
 22272  	movzx	eax, byte ptr [rdx + rsi]
 22273  	mov	byte ptr [rcx + rsi], al   # the byte is stored exactly as loaded
 22274  	add	rsi, 1
 22275  	add	rdi, -1
 22276  	jne	.LBB3_607
 22277  .LBB3_608:
 22278  	cmp	r8, 3   # r8 still holds r9 - (entry rsi) - 1
 22279  	jb	.LBB3_923   # fewer than 4 elements were pending at entry: skip the unrolled loop
 22280  .LBB3_609:                              # =>This Inner Loop Header: Depth=1
 22281  	movzx	eax, byte ptr [rdx + rsi]
 22282  	mov	byte ptr [rcx + rsi], al
 22283  	movzx	eax, byte ptr [rdx + rsi + 1]
 22284  	mov	byte ptr [rcx + rsi + 1], al
 22285  	movzx	eax, byte ptr [rdx + rsi + 2]
 22286  	mov	byte ptr [rcx + rsi + 2], al
 22287  	movzx	eax, byte ptr [rdx + rsi + 3]
 22288  	mov	byte ptr [rcx + rsi + 3], al
 22289  	add	rsi, 4   # four elements handled per iteration
 22290  	cmp	r9, rsi
 22291  	jne	.LBB3_609   # loop until the index reaches the count
 22292  	jmp	.LBB3_923
 22293  .LBB3_206:   # Case: out[i] = 0 - in[i] on 32-bit integers; reads [rdx], writes [rcx], element count in r8d
 22294  	test	r8d, r8d
 22295  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 22296  # %bb.207:
 22297  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 22298  	cmp	r8d, 8
 22299  	jb	.LBB3_208   # fewer than 8 elements: take the scalar path
 22300  # %bb.352:
 22301  	lea	rax, [rdx + 4*r9]   # rax = one past the last input element
 22302  	cmp	rax, rcx
 22303  	jbe	.LBB3_515   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 22304  # %bb.353:
 22305  	lea	rax, [rcx + 4*r9]   # rax = one past the last output element
 22306  	cmp	rax, rdx
 22307  	jbe	.LBB3_515   # output ends at or before the input start: no overlap
 22308  .LBB3_208:   # count < 8 or the buffers overlap: scalar processing from index 0
 22309  	xor	esi, esi
 22310  .LBB3_886:   # rsi = index of the next element to process
 22311  	mov	r8, rsi
 22312  	not	r8
 22313  	add	r8, r9   # r8 = r9 - rsi - 1
 22314  	mov	rdi, r9
 22315  	and	rdi, 3   # rdi = count & 3 = iterations of the one-element loop
 22316  	je	.LBB3_888
 22317  .LBB3_887:                              # =>This Inner Loop Header: Depth=1
 22318  	xor	eax, eax
 22319  	sub	eax, dword ptr [rdx + 4*rsi]   # eax = 0 - in[rsi]
 22320  	mov	dword ptr [rcx + 4*rsi], eax
 22321  	add	rsi, 1
 22322  	add	rdi, -1
 22323  	jne	.LBB3_887
 22324  .LBB3_888:
 22325  	cmp	r8, 3   # r8 still holds r9 - (entry rsi) - 1
 22326  	jb	.LBB3_923   # fewer than 4 elements were pending at entry: skip the unrolled loop
 22327  .LBB3_889:                              # =>This Inner Loop Header: Depth=1
 22328  	xor	eax, eax
 22329  	sub	eax, dword ptr [rdx + 4*rsi]
 22330  	mov	dword ptr [rcx + 4*rsi], eax
 22331  	xor	eax, eax
 22332  	sub	eax, dword ptr [rdx + 4*rsi + 4]
 22333  	mov	dword ptr [rcx + 4*rsi + 4], eax
 22334  	xor	eax, eax
 22335  	sub	eax, dword ptr [rdx + 4*rsi + 8]
 22336  	mov	dword ptr [rcx + 4*rsi + 8], eax
 22337  	xor	eax, eax
 22338  	sub	eax, dword ptr [rdx + 4*rsi + 12]
 22339  	mov	dword ptr [rcx + 4*rsi + 12], eax
 22340  	add	rsi, 4   # four elements handled per iteration
 22341  	cmp	r9, rsi
 22342  	jne	.LBB3_889   # loop until the index reaches the count
 22343  	jmp	.LBB3_923
 22344  .LBB3_209:   # Case: out[i] = 0 - in[i] on 32-bit integers; reads [rdx], writes [rcx], element count in r8d
 22345  	test	r8d, r8d
 22346  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 22347  # %bb.210:
 22348  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 22349  	cmp	r8d, 8
 22350  	jb	.LBB3_211   # fewer than 8 elements: take the scalar path
 22351  # %bb.355:
 22352  	lea	rax, [rdx + 4*r9]   # rax = one past the last input element
 22353  	cmp	rax, rcx
 22354  	jbe	.LBB3_518   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 22355  # %bb.356:
 22356  	lea	rax, [rcx + 4*r9]   # rax = one past the last output element
 22357  	cmp	rax, rdx
 22358  	jbe	.LBB3_518   # output ends at or before the input start: no overlap
 22359  .LBB3_211:   # count < 8 or the buffers overlap: scalar processing from index 0
 22360  	xor	esi, esi
 22361  .LBB3_894:   # rsi = index of the next element to process
 22362  	mov	r8, rsi
 22363  	not	r8
 22364  	add	r8, r9   # r8 = r9 - rsi - 1
 22365  	mov	rdi, r9
 22366  	and	rdi, 3   # rdi = count & 3 = iterations of the one-element loop
 22367  	je	.LBB3_896
 22368  .LBB3_895:                              # =>This Inner Loop Header: Depth=1
 22369  	xor	eax, eax
 22370  	sub	eax, dword ptr [rdx + 4*rsi]   # eax = 0 - in[rsi]
 22371  	mov	dword ptr [rcx + 4*rsi], eax
 22372  	add	rsi, 1
 22373  	add	rdi, -1
 22374  	jne	.LBB3_895
 22375  .LBB3_896:
 22376  	cmp	r8, 3   # r8 still holds r9 - (entry rsi) - 1
 22377  	jb	.LBB3_923   # fewer than 4 elements were pending at entry: skip the unrolled loop
 22378  .LBB3_897:                              # =>This Inner Loop Header: Depth=1
 22379  	xor	eax, eax
 22380  	sub	eax, dword ptr [rdx + 4*rsi]
 22381  	mov	dword ptr [rcx + 4*rsi], eax
 22382  	xor	eax, eax
 22383  	sub	eax, dword ptr [rdx + 4*rsi + 4]
 22384  	mov	dword ptr [rcx + 4*rsi + 4], eax
 22385  	xor	eax, eax
 22386  	sub	eax, dword ptr [rdx + 4*rsi + 8]
 22387  	mov	dword ptr [rcx + 4*rsi + 8], eax
 22388  	xor	eax, eax
 22389  	sub	eax, dword ptr [rdx + 4*rsi + 12]
 22390  	mov	dword ptr [rcx + 4*rsi + 12], eax
 22391  	add	rsi, 4   # four elements handled per iteration
 22392  	cmp	r9, rsi
 22393  	jne	.LBB3_897   # loop until the index reaches the count
 22394  	jmp	.LBB3_923
 22395  .LBB3_212:   # Case: sign of signed 32-bit integers (1 if x > 0, -1 if x < 0, 0 if x == 0); reads [rdx], writes [rcx], count in r8d
 22396  	test	r8d, r8d
 22397  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 22398  # %bb.213:
 22399  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 22400  	cmp	r8d, 8
 22401  	jb	.LBB3_214   # fewer than 8 elements: take the scalar path
 22402  # %bb.358:
 22403  	lea	rax, [rdx + 4*r9]   # rax = one past the last input element
 22404  	cmp	rax, rcx
 22405  	jbe	.LBB3_521   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 22406  # %bb.359:
 22407  	lea	rax, [rcx + 4*r9]   # rax = one past the last output element
 22408  	cmp	rax, rdx
 22409  	jbe	.LBB3_521   # output ends at or before the input start: no overlap
 22410  .LBB3_214:   # count < 8 or the buffers overlap: scalar processing from index 0
 22411  	xor	esi, esi
 22412  .LBB3_902:   # rsi = index of the next element to process
 22413  	mov	rax, rsi
 22414  	not	rax   # rax = -rsi - 1; r9 is added at .LBB3_904
 22415  	test	r9b, 1
 22416  	je	.LBB3_904   # even count: no single leading element to handle
 22417  # %bb.903:
 22418  	mov	r8d, dword ptr [rdx + 4*rsi]   # r8d = x
 22419  	xor	r10d, r10d
 22420  	test	r8d, r8d
 22421  	setne	r10b
 22422  	neg	r10d   # r10d = -1 if x != 0, else 0
 22423  	test	r8d, r8d
 22424  	mov	edi, 1
 22425  	cmovle	edi, r10d   # keep 1 only when x > 0 (signed); otherwise use r10d
 22426  	mov	dword ptr [rcx + 4*rsi], edi
 22427  	or	rsi, 1   # sets bit 0 of the index (advances by one when rsi was even)
 22428  .LBB3_904:
 22429  	add	rax, r9   # rax = r9 - (entry rsi) - 1
 22430  	je	.LBB3_923   # zero: only one element was pending at entry, so we are done
 22431  # %bb.905:
 22432  	mov	r8d, 1   # r8d = constant 1 used by the cmovg instructions below
 22433  .LBB3_906:                              # =>This Inner Loop Header: Depth=1
 22434  	mov	edi, dword ptr [rdx + 4*rsi]
 22435  	xor	eax, eax
 22436  	test	edi, edi
 22437  	setne	al
 22438  	neg	eax   # eax = -1 if x != 0, else 0
 22439  	test	edi, edi
 22440  	cmovg	eax, r8d   # x > 0 (signed): result becomes 1
 22441  	mov	dword ptr [rcx + 4*rsi], eax
 22442  	mov	eax, dword ptr [rdx + 4*rsi + 4]   # second element of the pair, same computation
 22443  	xor	edi, edi
 22444  	test	eax, eax
 22445  	setne	dil
 22446  	neg	edi
 22447  	test	eax, eax
 22448  	cmovg	edi, r8d
 22449  	mov	dword ptr [rcx + 4*rsi + 4], edi
 22450  	add	rsi, 2   # two elements handled per iteration
 22451  	cmp	r9, rsi
 22452  	jne	.LBB3_906   # loop until the index reaches the count
 22453  	jmp	.LBB3_923
 22454  .LBB3_215:   # Case: absolute value of signed 32-bit integers; reads [rdx], writes [rcx], element count in r8d
 22455  	test	r8d, r8d
 22456  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 22457  # %bb.216:
 22458  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 22459  	cmp	r8d, 8
 22460  	jb	.LBB3_217   # fewer than 8 elements: take the scalar path
 22461  # %bb.361:
 22462  	lea	rax, [rdx + 4*r9]   # rax = one past the last input element
 22463  	cmp	rax, rcx
 22464  	jbe	.LBB3_524   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 22465  # %bb.362:
 22466  	lea	rax, [rcx + 4*r9]   # rax = one past the last output element
 22467  	cmp	rax, rdx
 22468  	jbe	.LBB3_524   # output ends at or before the input start: no overlap
 22469  .LBB3_217:   # count < 8 or the buffers overlap: scalar processing from index 0
 22470  	xor	esi, esi
 22471  .LBB3_911:   # rsi = index of the next element to process
 22472  	mov	rax, rsi
 22473  	not	rax   # rax = -rsi - 1; r9 is added at .LBB3_913
 22474  	test	r9b, 1
 22475  	je	.LBB3_913   # even count: no single leading element to handle
 22476  # %bb.912:
 22477  	mov	r8d, dword ptr [rdx + 4*rsi]   # r8d = x
 22478  	mov	edi, r8d
 22479  	neg	edi   # edi = -x, flags set from the negation
 22480  	cmovl	edi, r8d   # -x is negative (x > 0): keep x; the minimum value wraps to itself
 22481  	mov	dword ptr [rcx + 4*rsi], edi
 22482  	or	rsi, 1   # sets bit 0 of the index (advances by one when rsi was even)
 22483  .LBB3_913:
 22484  	add	rax, r9   # rax = r9 - (entry rsi) - 1
 22485  	je	.LBB3_923   # zero: only one element was pending at entry, so we are done
 22486  .LBB3_914:                              # =>This Inner Loop Header: Depth=1
 22487  	mov	eax, dword ptr [rdx + 4*rsi]
 22488  	mov	edi, eax
 22489  	neg	edi
 22490  	cmovl	edi, eax   # pick x when -x is negative, otherwise -x
 22491  	mov	dword ptr [rcx + 4*rsi], edi
 22492  	mov	eax, dword ptr [rdx + 4*rsi + 4]
 22493  	mov	edi, eax
 22494  	neg	edi
 22495  	cmovl	edi, eax
 22496  	mov	dword ptr [rcx + 4*rsi + 4], edi
 22497  	add	rsi, 2   # two elements handled per iteration
 22498  	cmp	r9, rsi
 22499  	jne	.LBB3_914   # loop until the index reaches the count
 22500  	jmp	.LBB3_923
 22501  .LBB3_218:   # Case: absolute value of signed 32-bit integers; reads [rdx], writes [rcx], element count in r8d
 22502  	test	r8d, r8d
 22503  	jle	.LBB3_923   # count <= 0: nothing to do (.LBB3_923 is outside this chunk; presumably the shared exit)
 22504  # %bb.219:
 22505  	mov	r9d, r8d   # r9 = count, zero-extended to 64 bits
 22506  	cmp	r8d, 8
 22507  	jb	.LBB3_220   # fewer than 8 elements: take the scalar path
 22508  # %bb.364:
 22509  	lea	rax, [rdx + 4*r9]   # rax = one past the last input element
 22510  	cmp	rax, rcx
 22511  	jbe	.LBB3_527   # input ends at or before the output start: no overlap (target outside this chunk; presumably the SIMD path)
 22512  # %bb.365:
 22513  	lea	rax, [rcx + 4*r9]   # rax = one past the last output element
 22514  	cmp	rax, rdx
 22515  	jbe	.LBB3_527   # output ends at or before the input start: no overlap
 22516  .LBB3_220:   # count < 8 or the buffers overlap: scalar processing from index 0
 22517  	xor	esi, esi
 22518  .LBB3_919:   # rsi = index of the next element to process
 22519  	mov	rax, rsi
 22520  	not	rax   # rax = -rsi - 1; r9 is added at .LBB3_921
 22521  	test	r9b, 1
 22522  	je	.LBB3_921   # even count: no single leading element to handle
 22523  # %bb.920:
 22524  	mov	r8d, dword ptr [rdx + 4*rsi]   # r8d = x
 22525  	mov	edi, r8d
 22526  	neg	edi   # edi = -x, flags set from the negation
 22527  	cmovl	edi, r8d   # -x is negative (x > 0): keep x; the minimum value wraps to itself
 22528  	mov	dword ptr [rcx + 4*rsi], edi
 22529  	or	rsi, 1   # sets bit 0 of the index (advances by one when rsi was even)
 22530  .LBB3_921:
 22531  	add	rax, r9   # rax = r9 - (entry rsi) - 1
 22532  	je	.LBB3_923   # zero: only one element was pending at entry, so we are done
 22533  .LBB3_922:                              # =>This Inner Loop Header: Depth=1
 22534  	mov	eax, dword ptr [rdx + 4*rsi]
 22535  	mov	edi, eax
 22536  	neg	edi
 22537  	cmovl	edi, eax   # pick x when -x is negative, otherwise -x
 22538  	mov	dword ptr [rcx + 4*rsi], edi
 22539  	mov	eax, dword ptr [rdx + 4*rsi + 4]
 22540  	mov	edi, eax
 22541  	neg	edi
 22542  	cmovl	edi, eax
 22543  	mov	dword ptr [rcx + 4*rsi + 4], edi
 22544  	add	rsi, 2   # two elements handled per iteration
 22545  	cmp	r9, rsi
 22546  	jne	.LBB3_922   # loop until the index reaches the count
 22547  	jmp	.LBB3_923
 22548  .LBB3_221:   # Setup for a loop that works in groups of 8 elements; the loop itself is outside this chunk
 22549  	mov	edx, r9d   # rdx is overwritten here with the value of r9d
 22550  	and	edx, -8   # rdx = r9d rounded down to a multiple of 8
 22551  	lea	rax, [rdx - 8]
 22552  	mov	rdi, rax
 22553  	shr	rdi, 3
 22554  	add	rdi, 1   # rdi = (rdx - 8) / 8 + 1 = number of 8-element groups
 22555  	mov	esi, edi
 22556  	and	esi, 7   # esi = group count modulo 8
 22557  	cmp	rax, 56
 22558  	jae	.LBB3_367   # rdx >= 64 (at least 8 groups): branch to .LBB3_367 (outside this chunk)
 22559  # %bb.222:
 22560  	xor	eax, eax   # fewer than 8 groups: rax = 0 before continuing at .LBB3_369
 22561  	jmp	.LBB3_369
 22562  .LBB3_265:   # Setup for a loop that works in groups of 4 elements; the loop itself is outside this chunk
 22563  	mov	edx, r9d   # rdx is overwritten here with the value of r9d
 22564  	and	edx, -4   # rdx = r9d rounded down to a multiple of 4
 22565  	lea	rax, [rdx - 4]
 22566  	mov	rdi, rax
 22567  	shr	rdi, 2
 22568  	add	rdi, 1   # rdi = (rdx - 4) / 4 + 1 = number of 4-element groups
 22569  	mov	esi, edi
 22570  	and	esi, 7   # esi = group count modulo 8
 22571  	cmp	rax, 28
 22572  	jae	.LBB3_414   # rdx >= 32 (at least 8 groups): branch to .LBB3_414 (outside this chunk)
 22573  # %bb.266:
 22574  	xor	eax, eax   # fewer than 8 groups: rax = 0 before continuing at .LBB3_416
 22575  	jmp	.LBB3_416
 22576  .LBB3_279:   # Setup for a loop that works in groups of 16 elements; the loop itself is outside this chunk
 22577  	mov	edx, r9d   # rdx is overwritten here with the value of r9d
 22578  	and	edx, -16   # rdx = r9d rounded down to a multiple of 16
 22579  	lea	rax, [rdx - 16]
 22580  	mov	rdi, rax
 22581  	shr	rdi, 4
 22582  	add	rdi, 1   # rdi = (rdx - 16) / 16 + 1 = number of 16-element groups
 22583  	mov	esi, edi
 22584  	and	esi, 7   # esi = group count modulo 8
 22585  	cmp	rax, 112
 22586  	jae	.LBB3_431   # rdx >= 128 (at least 8 groups): branch to .LBB3_431 (outside this chunk)
 22587  # %bb.280:
 22588  	xor	eax, eax   # fewer than 8 groups: rax = 0 before continuing at .LBB3_433
 22589  	jmp	.LBB3_433
 22590  .LBB3_338:   # Setup for a loop that works in groups of 32 elements; the loop itself is outside this chunk
 22591  	mov	edx, r9d   # rdx is overwritten here with the value of r9d
 22592  	and	edx, -32   # rdx = r9d rounded down to a multiple of 32
 22593  	lea	rax, [rdx - 32]
 22594  	mov	rdi, rax
 22595  	shr	rdi, 5
 22596  	add	rdi, 1   # rdi = (rdx - 32) / 32 + 1 = number of 32-element groups
 22597  	mov	esi, edi
 22598  	and	esi, 7   # esi = group count modulo 8
 22599  	cmp	rax, 224
 22600  	jae	.LBB3_498   # rdx >= 256 (at least 8 groups): branch to .LBB3_498 (outside this chunk)
 22601  # %bb.339:
 22602  	xor	eax, eax   # fewer than 8 groups: rax = 0 before continuing at .LBB3_500
 22603  	jmp	.LBB3_500
 22604  .LBB3_374:   # SSE path: out[i] = 0 - in[i] on 32-bit integers, 8 elements per group; reads [rdx], writes [rcx]
 22605  	mov	esi, r9d
 22606  	and	esi, -8   # rsi = r9d rounded down to a multiple of 8
 22607  	lea	rax, [rsi - 8]
 22608  	mov	r8, rax
 22609  	shr	r8, 3
 22610  	add	r8, 1   # r8 = number of 8-element groups
 22611  	test	rax, rax
 22612  	je	.LBB3_610   # exactly one group: skip the two-groups-per-iteration loop (target outside this chunk)
 22613  # %bb.375:
 22614  	mov	rax, r8
 22615  	and	rax, -2
 22616  	neg	rax   # rax = -(group count rounded down to even); counts up to zero
 22617  	xor	edi, edi   # rdi = element index
 22618  .LBB3_376:                              # =>This Inner Loop Header: Depth=1
 22619  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
 22620  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 22621  	pxor	xmm2, xmm2
 22622  	psubd	xmm2, xmm0   # xmm2 = 0 - first four elements
 22623  	pxor	xmm0, xmm0
 22624  	psubd	xmm0, xmm1   # xmm0 = 0 - next four elements
 22625  	movdqu	xmmword ptr [rcx + 4*rdi], xmm2
 22626  	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm0
 22627  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]   # second group of the iteration, same computation
 22628  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
 22629  	pxor	xmm2, xmm2
 22630  	psubd	xmm2, xmm0
 22631  	pxor	xmm0, xmm0
 22632  	psubd	xmm0, xmm1
 22633  	movdqu	xmmword ptr [rcx + 4*rdi + 32], xmm2
 22634  	movdqu	xmmword ptr [rcx + 4*rdi + 48], xmm0
 22635  	add	rdi, 16   # 16 elements (two groups) handled per iteration
 22636  	add	rax, 2
 22637  	jne	.LBB3_376   # loop until the negative group counter reaches zero
 22638  	jmp	.LBB3_611   # continues outside this chunk (presumably odd-group and remainder handling)
 22639  .LBB3_377:   # SSE path: out[i] = (in[i] != 0) ? 1 : 0 on 32-bit integers, 8 elements per group; reads [rdx], writes [rcx]
 22640  	mov	esi, r9d
 22641  	and	esi, -8   # rsi = r9d rounded down to a multiple of 8
 22642  	lea	rax, [rsi - 8]
 22643  	mov	r8, rax
 22644  	shr	r8, 3
 22645  	add	r8, 1   # r8 = number of 8-element groups
 22646  	test	rax, rax
 22647  	je	.LBB3_618   # exactly one group: skip the two-groups-per-iteration loop (target outside this chunk)
 22648  # %bb.378:
 22649  	mov	rax, r8
 22650  	and	rax, -2
 22651  	neg	rax   # rax = -(group count rounded down to even); counts up to zero
 22652  	xor	edi, edi   # rdi = element index
 22653  	pxor	xmm0, xmm0   # xmm0 = all zeros, the comparison operand
 22654  	movdqa	xmm1, xmmword ptr [rip + .LCPI3_3] # xmm1 = [1,1,1,1]
 22655  .LBB3_379:                              # =>This Inner Loop Header: Depth=1
 22656  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi]
 22657  	movdqu	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 22658  	pcmpeqd	xmm2, xmm0   # all-ones in lanes where x == 0
 22659  	pandn	xmm2, xmm1   # (~mask) & 1: 1 where x != 0, 0 where x == 0
 22660  	pcmpeqd	xmm3, xmm0
 22661  	pandn	xmm3, xmm1
 22662  	movdqu	xmmword ptr [rcx + 4*rdi], xmm2
 22663  	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm3
 22664  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 32]   # second group of the iteration, same computation
 22665  	movdqu	xmm3, xmmword ptr [rdx + 4*rdi + 48]
 22666  	pcmpeqd	xmm2, xmm0
 22667  	pandn	xmm2, xmm1
 22668  	pcmpeqd	xmm3, xmm0
 22669  	pandn	xmm3, xmm1
 22670  	movdqu	xmmword ptr [rcx + 4*rdi + 32], xmm2
 22671  	movdqu	xmmword ptr [rcx + 4*rdi + 48], xmm3
 22672  	add	rdi, 16   # 16 elements (two groups) handled per iteration
 22673  	add	rax, 2
 22674  	jne	.LBB3_379   # loop until the negative group counter reaches zero
 22675  	jmp	.LBB3_619   # continues outside this chunk (presumably odd-group and remainder handling)
 22676  .LBB3_380:   # Setup for a loop that works in groups of 8 elements; the loop itself is outside this chunk
 22677  	mov	esi, r9d
 22678  	and	esi, -8   # rsi = r9d rounded down to a multiple of 8
 22679  	lea	rax, [rsi - 8]
 22680  	mov	rdi, rax
 22681  	shr	rdi, 3
 22682  	add	rdi, 1   # rdi = (rsi - 8) / 8 + 1 = number of 8-element groups
 22683  	mov	r8d, edi
 22684  	and	r8d, 3   # r8d = group count modulo 4
 22685  	cmp	rax, 24
 22686  	jae	.LBB3_530   # rsi >= 32 (at least 4 groups): branch to .LBB3_530 (outside this chunk)
 22687  # %bb.381:
 22688  	xor	eax, eax   # fewer than 4 groups: rax = 0 before continuing at .LBB3_532
 22689  	jmp	.LBB3_532
 22690  .LBB3_382:   # Setup for a loop that works in groups of 8 elements; the loop itself is outside this chunk
 22691  	mov	esi, r9d
 22692  	and	esi, -8   # rsi = r9d rounded down to a multiple of 8
 22693  	lea	rax, [rsi - 8]
 22694  	mov	rdi, rax
 22695  	shr	rdi, 3
 22696  	add	rdi, 1   # rdi = (rsi - 8) / 8 + 1 = number of 8-element groups
 22697  	mov	r8d, edi
 22698  	and	r8d, 3   # r8d = group count modulo 4
 22699  	cmp	rax, 24
 22700  	jae	.LBB3_540   # rsi >= 32 (at least 4 groups): branch to .LBB3_540 (outside this chunk)
 22701  # %bb.383:
 22702  	xor	eax, eax   # fewer than 4 groups: rax = 0 before continuing at .LBB3_542
 22703  	jmp	.LBB3_542
 22704  .LBB3_384:   # SSE path: flip the sign bit of 64-bit floats, 4 elements per group; reads [rdx], writes [rcx]
 22705  	mov	esi, r9d
 22706  	and	esi, -4   # rsi = r9d rounded down to a multiple of 4
 22707  	lea	rax, [rsi - 4]
 22708  	mov	r8, rax
 22709  	shr	r8, 2
 22710  	add	r8, 1   # r8 = number of 4-element groups
 22711  	test	rax, rax
 22712  	je	.LBB3_626   # exactly one group: skip the two-groups-per-iteration loop (target outside this chunk)
 22713  # %bb.385:
 22714  	mov	rax, r8
 22715  	and	rax, -2
 22716  	neg	rax   # rax = -(group count rounded down to even); counts up to zero
 22717  	xor	edi, edi   # rdi = element index
 22718  	movapd	xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0]
 22719  .LBB3_386:                              # =>This Inner Loop Header: Depth=1
 22720  	movupd	xmm1, xmmword ptr [rdx + 8*rdi]
 22721  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 22722  	xorpd	xmm1, xmm0   # XOR with the -0.0 mask toggles only the sign bits
 22723  	xorpd	xmm2, xmm0
 22724  	movupd	xmmword ptr [rcx + 8*rdi], xmm1
 22725  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm2
 22726  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 32]   # second group of the iteration, same computation
 22727  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 22728  	xorpd	xmm1, xmm0
 22729  	xorpd	xmm2, xmm0
 22730  	movupd	xmmword ptr [rcx + 8*rdi + 32], xmm1
 22731  	movupd	xmmword ptr [rcx + 8*rdi + 48], xmm2
 22732  	add	rdi, 8   # 8 elements (two groups) handled per iteration
 22733  	add	rax, 2
 22734  	jne	.LBB3_386   # loop until the negative group counter reaches zero
 22735  	jmp	.LBB3_627   # continues outside this chunk (presumably odd-group and remainder handling)
 22736  .LBB3_387:   # SSE path: flip the sign bit of 64-bit floats, 4 elements per group; reads [rdx], writes [rcx]
 22737  	mov	esi, r9d
 22738  	and	esi, -4   # rsi = r9d rounded down to a multiple of 4
 22739  	lea	rax, [rsi - 4]
 22740  	mov	r8, rax
 22741  	shr	r8, 2
 22742  	add	r8, 1   # r8 = number of 4-element groups
 22743  	test	rax, rax
 22744  	je	.LBB3_636   # exactly one group: skip the two-groups-per-iteration loop (target outside this chunk)
 22745  # %bb.388:
 22746  	mov	rax, r8
 22747  	and	rax, -2
 22748  	neg	rax   # rax = -(group count rounded down to even); counts up to zero
 22749  	xor	edi, edi   # rdi = element index
 22750  	movapd	xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0]
 22751  .LBB3_389:                              # =>This Inner Loop Header: Depth=1
 22752  	movupd	xmm1, xmmword ptr [rdx + 8*rdi]
 22753  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 22754  	xorpd	xmm1, xmm0   # XOR with the -0.0 mask toggles only the sign bits
 22755  	xorpd	xmm2, xmm0
 22756  	movupd	xmmword ptr [rcx + 8*rdi], xmm1
 22757  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm2
 22758  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 32]   # second group of the iteration, same computation
 22759  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 22760  	xorpd	xmm1, xmm0
 22761  	xorpd	xmm2, xmm0
 22762  	movupd	xmmword ptr [rcx + 8*rdi + 32], xmm1
 22763  	movupd	xmmword ptr [rcx + 8*rdi + 48], xmm2
 22764  	add	rdi, 8   # 8 elements (two groups) handled per iteration
 22765  	add	rax, 2
 22766  	jne	.LBB3_389   # loop until the negative group counter reaches zero
 22767  	jmp	.LBB3_637   # continues outside this chunk (presumably odd-group and remainder handling)
 22768  .LBB3_390:   # SSE path: sign of 64-bit floats (0.0 when x == 0, otherwise 1.0 carrying x's sign bit), 4 elements per group; reads [rdx], writes [rcx]
 22769  	mov	esi, r9d
 22770  	and	esi, -4   # rsi = r9d rounded down to a multiple of 4
 22771  	lea	rax, [rsi - 4]
 22772  	mov	r8, rax
 22773  	shr	r8, 2
 22774  	add	r8, 1   # r8 = number of 4-element groups
 22775  	test	rax, rax
 22776  	je	.LBB3_646   # exactly one group: skip the two-groups-per-iteration loop (target outside this chunk)
 22777  # %bb.391:
 22778  	mov	rax, r8
 22779  	and	rax, -2
 22780  	neg	rax   # rax = -(group count rounded down to even); counts up to zero
 22781  	xor	edi, edi   # rdi = element index
 22782  	xorpd	xmm0, xmm0   # xmm0 = 0.0, the comparison operand
 22783  	movapd	xmm1, xmmword ptr [rip + .LCPI3_0] # xmm1 = [-0.0E+0,-0.0E+0]
 22784  	movapd	xmm2, xmmword ptr [rip + .LCPI3_1] # xmm2 = [1.0E+0,1.0E+0]
 22785  .LBB3_392:                              # =>This Inner Loop Header: Depth=1
 22786  	movupd	xmm3, xmmword ptr [rdx + 8*rdi]
 22787  	movupd	xmm4, xmmword ptr [rdx + 8*rdi + 16]
 22788  	movapd	xmm5, xmm3
 22789  	andpd	xmm5, xmm1   # isolate the sign bit of x
 22790  	orpd	xmm5, xmm2   # merge it into 1.0: +1.0 or -1.0
 22791  	movapd	xmm6, xmm4
 22792  	andpd	xmm6, xmm1
 22793  	orpd	xmm6, xmm2
 22794  	cmpneqpd	xmm3, xmm0   # all-ones in lanes where x != 0.0 (also true for NaN)
 22795  	andpd	xmm3, xmm5   # keep +/-1.0 where x != 0, zero elsewhere
 22796  	cmpneqpd	xmm4, xmm0
 22797  	andpd	xmm4, xmm6
 22798  	movupd	xmmword ptr [rcx + 8*rdi], xmm3
 22799  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm4
 22800  	movupd	xmm3, xmmword ptr [rdx + 8*rdi + 32]   # second group of the iteration, same computation
 22801  	movupd	xmm4, xmmword ptr [rdx + 8*rdi + 48]
 22802  	movapd	xmm5, xmm3
 22803  	andpd	xmm5, xmm1
 22804  	orpd	xmm5, xmm2
 22805  	movapd	xmm6, xmm4
 22806  	andpd	xmm6, xmm1
 22807  	orpd	xmm6, xmm2
 22808  	cmpneqpd	xmm3, xmm0
 22809  	andpd	xmm3, xmm5
 22810  	cmpneqpd	xmm4, xmm0
 22811  	andpd	xmm4, xmm6
 22812  	movupd	xmmword ptr [rcx + 8*rdi + 32], xmm3
 22813  	movupd	xmmword ptr [rcx + 8*rdi + 48], xmm4
 22814  	add	rdi, 8   # 8 elements (two groups) handled per iteration
 22815  	add	rax, 2
 22816  	jne	.LBB3_392   # loop until the negative group counter reaches zero
 22817  	jmp	.LBB3_647   # continues outside this chunk (presumably odd-group and remainder handling)
 22818  .LBB3_393:                              # vector path: AND every 64-bit lane read from [rdx] with the .LCPI3_8 mask, store to [rcx]
 22819  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 22820  	and	esi, -4                         # rsi = r9d rounded down to a multiple of 4 (one group = two 16-byte vectors)
 22821  	lea	rax, [rsi - 4]
 22822  	mov	r8, rax
 22823  	shr	r8, 2
 22824  	add	r8, 1                           # r8 = (rsi - 4) / 4 + 1 (= rsi / 4 when rsi >= 4): number of groups
 22825  	test	rax, rax
 22826  	je	.LBB3_655                       # rsi == 4: only one group, so the two-groups-per-iteration loop is skipped
 22827  # %bb.394:
 22828  	mov	rax, r8
 22829  	and	rax, -2
 22830  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 22831  	xor	edi, edi                        # rdi = element index, starts at 0
 22832  	movapd	xmm0, xmmword ptr [rip + .LCPI3_8] # xmm0 = [9223372036854775807,9223372036854775807]
 22833  .LBB3_395:                              # =>This Inner Loop Header: Depth=1
 22834  	movupd	xmm1, xmmword ptr [rdx + 8*rdi]    # first group: elements rdi .. rdi+3
 22835  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 22836  	andpd	xmm1, xmm0                      # mask is 0x7FFF...F per lane: clears bit 63 (the sign bit of a double)
 22837  	andpd	xmm2, xmm0
 22838  	movupd	xmmword ptr [rcx + 8*rdi], xmm1
 22839  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm2
 22840  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 32]    # second group: elements rdi+4 .. rdi+7
 22841  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 22842  	andpd	xmm1, xmm0
 22843  	andpd	xmm2, xmm0
 22844  	movupd	xmmword ptr [rcx + 8*rdi + 32], xmm1
 22845  	movupd	xmmword ptr [rcx + 8*rdi + 48], xmm2
 22846  	add	rdi, 8                          # 8 elements handled per iteration
 22847  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 22848  	jne	.LBB3_395
 22849  	jmp	.LBB3_656                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 22850  .LBB3_396:                              # vector path: AND every 64-bit lane read from [rdx] with the .LCPI3_8 mask, store to [rcx]
 22851  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 22852  	and	esi, -4                         # rsi = r9d rounded down to a multiple of 4 (one group = two 16-byte vectors)
 22853  	lea	rax, [rsi - 4]
 22854  	mov	r8, rax
 22855  	shr	r8, 2
 22856  	add	r8, 1                           # r8 = (rsi - 4) / 4 + 1 (= rsi / 4 when rsi >= 4): number of groups
 22857  	test	rax, rax
 22858  	je	.LBB3_663                       # rsi == 4: only one group, so the two-groups-per-iteration loop is skipped
 22859  # %bb.397:
 22860  	mov	rax, r8
 22861  	and	rax, -2
 22862  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 22863  	xor	edi, edi                        # rdi = element index, starts at 0
 22864  	movapd	xmm0, xmmword ptr [rip + .LCPI3_8] # xmm0 = [9223372036854775807,9223372036854775807]
 22865  .LBB3_398:                              # =>This Inner Loop Header: Depth=1
 22866  	movupd	xmm1, xmmword ptr [rdx + 8*rdi]    # first group: elements rdi .. rdi+3
 22867  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 22868  	andpd	xmm1, xmm0                      # mask is 0x7FFF...F per lane: clears bit 63 (the sign bit of a double)
 22869  	andpd	xmm2, xmm0
 22870  	movupd	xmmword ptr [rcx + 8*rdi], xmm1
 22871  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm2
 22872  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 32]    # second group: elements rdi+4 .. rdi+7
 22873  	movupd	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 22874  	andpd	xmm1, xmm0
 22875  	andpd	xmm2, xmm0
 22876  	movupd	xmmword ptr [rcx + 8*rdi + 32], xmm1
 22877  	movupd	xmmword ptr [rcx + 8*rdi + 48], xmm2
 22878  	add	rdi, 8                          # 8 elements handled per iteration
 22879  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 22880  	jne	.LBB3_398
 22881  	jmp	.LBB3_664                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 22882  .LBB3_399:                              # vector path: byte-wise out = 0 - x (wrapping), reading [rdx] and writing [rcx]
 22883  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 22884  	and	esi, -32                        # rsi = r9d rounded down to a multiple of 32 (one group = two 16-byte vectors)
 22885  	lea	rax, [rsi - 32]
 22886  	mov	r8, rax
 22887  	shr	r8, 5
 22888  	add	r8, 1                           # r8 = (rsi - 32) / 32 + 1 (= rsi / 32 when rsi >= 32): number of groups
 22889  	test	rax, rax
 22890  	je	.LBB3_671                       # rsi == 32: only one group, so the two-groups-per-iteration loop is skipped
 22891  # %bb.400:
 22892  	mov	rax, r8
 22893  	and	rax, -2
 22894  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 22895  	xor	edi, edi                        # rdi = byte index, starts at 0
 22896  .LBB3_401:                              # =>This Inner Loop Header: Depth=1
 22897  	movdqu	xmm0, xmmword ptr [rdx + rdi]
 22898  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
 22899  	pxor	xmm2, xmm2                      # xmm2 = 0
 22900  	psubb	xmm2, xmm0                      # xmm2 = 0 - x for each byte
 22901  	pxor	xmm0, xmm0                      # xmm0 is free again; reuse it as the zero for the second vector
 22902  	psubb	xmm0, xmm1
 22903  	movdqu	xmmword ptr [rcx + rdi], xmm2
 22904  	movdqu	xmmword ptr [rcx + rdi + 16], xmm0
 22905  	movdqu	xmm0, xmmword ptr [rdx + rdi + 32]    # second group: bytes rdi+32 .. rdi+63
 22906  	movdqu	xmm1, xmmword ptr [rdx + rdi + 48]
 22907  	pxor	xmm2, xmm2
 22908  	psubb	xmm2, xmm0
 22909  	pxor	xmm0, xmm0
 22910  	psubb	xmm0, xmm1
 22911  	movdqu	xmmword ptr [rcx + rdi + 32], xmm2
 22912  	movdqu	xmmword ptr [rcx + rdi + 48], xmm0
 22913  	add	rdi, 64                         # 64 bytes handled per iteration
 22914  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 22915  	jne	.LBB3_401
 22916  	jmp	.LBB3_672                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 22917  .LBB3_402:                              # vector path: byte-wise out = 0 - x (wrapping), reading [rdx] and writing [rcx]
 22918  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 22919  	and	esi, -32                        # rsi = r9d rounded down to a multiple of 32 (one group = two 16-byte vectors)
 22920  	lea	rax, [rsi - 32]
 22921  	mov	r8, rax
 22922  	shr	r8, 5
 22923  	add	r8, 1                           # r8 = (rsi - 32) / 32 + 1 (= rsi / 32 when rsi >= 32): number of groups
 22924  	test	rax, rax
 22925  	je	.LBB3_679                       # rsi == 32: only one group, so the two-groups-per-iteration loop is skipped
 22926  # %bb.403:
 22927  	mov	rax, r8
 22928  	and	rax, -2
 22929  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 22930  	xor	edi, edi                        # rdi = byte index, starts at 0
 22931  .LBB3_404:                              # =>This Inner Loop Header: Depth=1
 22932  	movdqu	xmm0, xmmword ptr [rdx + rdi]
 22933  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
 22934  	pxor	xmm2, xmm2                      # xmm2 = 0
 22935  	psubb	xmm2, xmm0                      # xmm2 = 0 - x for each byte
 22936  	pxor	xmm0, xmm0                      # xmm0 is free again; reuse it as the zero for the second vector
 22937  	psubb	xmm0, xmm1
 22938  	movdqu	xmmword ptr [rcx + rdi], xmm2
 22939  	movdqu	xmmword ptr [rcx + rdi + 16], xmm0
 22940  	movdqu	xmm0, xmmword ptr [rdx + rdi + 32]    # second group: bytes rdi+32 .. rdi+63
 22941  	movdqu	xmm1, xmmword ptr [rdx + rdi + 48]
 22942  	pxor	xmm2, xmm2
 22943  	psubb	xmm2, xmm0
 22944  	pxor	xmm0, xmm0
 22945  	psubb	xmm0, xmm1
 22946  	movdqu	xmmword ptr [rcx + rdi + 32], xmm2
 22947  	movdqu	xmmword ptr [rcx + rdi + 48], xmm0
 22948  	add	rdi, 64                         # 64 bytes handled per iteration
 22949  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 22950  	jne	.LBB3_404
 22951  	jmp	.LBB3_680                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 22952  .LBB3_405:                              # vector path: per signed byte, out = x > 0 ? 1 : (x != 0 ? -1 : 0)
 22953  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 22954  	and	esi, -32                        # rsi = r9d rounded down to a multiple of 32 (one group = two 16-byte vectors)
 22955  	lea	rax, [rsi - 32]
 22956  	mov	r8, rax
 22957  	shr	r8, 5
 22958  	add	r8, 1                           # r8 = (rsi - 32) / 32 + 1 (= rsi / 32 when rsi >= 32): number of groups
 22959  	test	rax, rax
 22960  	je	.LBB3_687                       # rsi == 32: only one group, so the two-groups-per-iteration loop is skipped
 22961  # %bb.406:
 22962  	mov	rax, r8
 22963  	and	rax, -2
 22964  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 22965  	xor	edi, edi                        # rdi = byte index, starts at 0
 22966  	pxor	xmm2, xmm2                      # xmm2 = all zeros
 22967  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 22968  	movdqa	xmm4, xmmword ptr [rip + .LCPI3_6] # xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 22969  .LBB3_407:                              # =>This Inner Loop Header: Depth=1
 22970  	movdqu	xmm5, xmmword ptr [rdx + rdi]
 22971  	movdqu	xmm6, xmmword ptr [rdx + rdi + 16]
 22972  	movdqa	xmm0, xmm4
 22973  	pcmpgtb	xmm0, xmm5                      # xmm0 = 0xFF where 1 > x (signed), i.e. x <= 0
 22974  	pcmpeqb	xmm5, xmm2                      # 0xFF where x == 0
 22975  	pxor	xmm5, xmm3                      # inverted: 0xFF (-1) where x != 0, 0 where x == 0
 22976  	movdqa	xmm1, xmm4
 22977  	pcmpgtb	xmm1, xmm6
 22978  	pcmpeqb	xmm6, xmm2
 22979  	pxor	xmm6, xmm3
 22980  	movdqa	xmm7, xmm4                      # start from 1 in every byte
 22981  	pblendvb	xmm7, xmm5, xmm0        # where x <= 0 take xmm5 (-1 or 0); the selector is xmm0
 22982  	movdqa	xmm5, xmm4
 22983  	movdqa	xmm0, xmm1                      # move the second selector into xmm0 for the next blend
 22984  	pblendvb	xmm5, xmm6, xmm0
 22985  	movdqu	xmmword ptr [rcx + rdi], xmm7
 22986  	movdqu	xmmword ptr [rcx + rdi + 16], xmm5
 22987  	movdqu	xmm5, xmmword ptr [rdx + rdi + 32]    # second group: same sequence on bytes rdi+32 .. rdi+63
 22988  	movdqu	xmm6, xmmword ptr [rdx + rdi + 48]
 22989  	movdqa	xmm0, xmm4
 22990  	pcmpgtb	xmm0, xmm5
 22991  	pcmpeqb	xmm5, xmm2
 22992  	pxor	xmm5, xmm3
 22993  	movdqa	xmm1, xmm4
 22994  	pcmpgtb	xmm1, xmm6
 22995  	pcmpeqb	xmm6, xmm2
 22996  	pxor	xmm6, xmm3
 22997  	movdqa	xmm7, xmm4
 22998  	pblendvb	xmm7, xmm5, xmm0
 22999  	movdqa	xmm5, xmm4
 23000  	movdqa	xmm0, xmm1
 23001  	pblendvb	xmm5, xmm6, xmm0
 23002  	movdqu	xmmword ptr [rcx + rdi + 32], xmm7
 23003  	movdqu	xmmword ptr [rcx + rdi + 48], xmm5
 23004  	add	rdi, 64                         # 64 bytes handled per iteration
 23005  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23006  	jne	.LBB3_407
 23007  	jmp	.LBB3_688                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23008  .LBB3_408:                              # vector path: per signed byte, out = low 8 bits of |x|, computed in 32-bit lanes
 23009  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23010  	and	esi, -16                        # rsi = r9d rounded down to a multiple of 16 (one group = one 16-byte output vector)
 23011  	lea	rax, [rsi - 16]
 23012  	mov	r8, rax
 23013  	shr	r8, 4
 23014  	add	r8, 1                           # r8 = (rsi - 16) / 16 + 1 (= rsi / 16 when rsi >= 16): number of groups
 23015  	test	rax, rax
 23016  	je	.LBB3_696                       # rsi == 16: only one group, so the two-groups-per-iteration loop is skipped
 23017  # %bb.409:
 23018  	mov	rax, r8
 23019  	and	rax, -2
 23020  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23021  	xor	edi, edi                        # rdi = byte index, starts at 0
 23022  	movdqa	xmm8, xmmword ptr [rip + .LCPI3_10] # xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 23023  .LBB3_410:                              # =>This Inner Loop Header: Depth=1
 23024  	pmovsxbd	xmm4, dword ptr [rdx + rdi + 12]    # sign-extend 4 bytes to 4 dwords (bytes 12..15)
 23025  	pmovsxbd	xmm1, dword ptr [rdx + rdi + 8]
 23026  	pmovsxbd	xmm3, dword ptr [rdx + rdi + 4]
 23027  	pmovsxbd	xmm2, dword ptr [rdx + rdi]
 23028  	movdqa	xmm5, xmm2
 23029  	psrad	xmm5, 7                         # s = x >> 7 (arithmetic): 0 for x >= 0, -1 for x < 0
 23030  	movdqa	xmm6, xmm3
 23031  	psrad	xmm6, 7
 23032  	movdqa	xmm7, xmm1
 23033  	psrad	xmm7, 7
 23034  	movdqa	xmm0, xmm4
 23035  	psrad	xmm0, 7
 23036  	paddd	xmm4, xmm0                      # x + s ...
 23037  	paddd	xmm1, xmm7
 23038  	paddd	xmm3, xmm6
 23039  	paddd	xmm2, xmm5
 23040  	pxor	xmm2, xmm5                      # ... then (x + s) ^ s = |x|
 23041  	pxor	xmm3, xmm6
 23042  	pxor	xmm1, xmm7
 23043  	pxor	xmm4, xmm0
 23044  	pand	xmm4, xmm8                      # keep the low byte of each dword so the saturating packs below cannot clamp
 23045  	pand	xmm1, xmm8
 23046  	packusdw	xmm1, xmm4              # dwords -> words (bytes 8..15)
 23047  	pand	xmm3, xmm8
 23048  	pand	xmm2, xmm8
 23049  	packusdw	xmm2, xmm3              # dwords -> words (bytes 0..7)
 23050  	packuswb	xmm2, xmm1              # words -> 16 result bytes
 23051  	movdqu	xmmword ptr [rcx + rdi], xmm2
 23052  	pmovsxbd	xmm4, dword ptr [rdx + rdi + 28]    # second group: same sequence on bytes rdi+16 .. rdi+31
 23053  	pmovsxbd	xmm1, dword ptr [rdx + rdi + 24]
 23054  	pmovsxbd	xmm3, dword ptr [rdx + rdi + 20]
 23055  	pmovsxbd	xmm2, dword ptr [rdx + rdi + 16]
 23056  	movdqa	xmm0, xmm2
 23057  	psrad	xmm0, 7
 23058  	movdqa	xmm5, xmm3
 23059  	psrad	xmm5, 7
 23060  	movdqa	xmm6, xmm1
 23061  	psrad	xmm6, 7
 23062  	movdqa	xmm7, xmm4
 23063  	psrad	xmm7, 7
 23064  	paddd	xmm4, xmm7
 23065  	paddd	xmm1, xmm6
 23066  	paddd	xmm3, xmm5
 23067  	paddd	xmm2, xmm0
 23068  	pxor	xmm2, xmm0
 23069  	pxor	xmm3, xmm5
 23070  	pxor	xmm1, xmm6
 23071  	pxor	xmm4, xmm7
 23072  	pand	xmm4, xmm8
 23073  	pand	xmm1, xmm8
 23074  	packusdw	xmm1, xmm4
 23075  	pand	xmm3, xmm8
 23076  	pand	xmm2, xmm8
 23077  	packusdw	xmm2, xmm3
 23078  	packuswb	xmm2, xmm1
 23079  	movdqu	xmmword ptr [rcx + rdi + 16], xmm2
 23080  	add	rdi, 32                         # 32 bytes handled per iteration
 23081  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23082  	jne	.LBB3_410
 23083  	jmp	.LBB3_697                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23084  .LBB3_411:                              # vector path: per signed byte, out = low 8 bits of |x|, computed in 32-bit lanes
 23085  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23086  	and	esi, -16                        # rsi = r9d rounded down to a multiple of 16 (one group = one 16-byte output vector)
 23087  	lea	rax, [rsi - 16]
 23088  	mov	r8, rax
 23089  	shr	r8, 4
 23090  	add	r8, 1                           # r8 = (rsi - 16) / 16 + 1 (= rsi / 16 when rsi >= 16): number of groups
 23091  	test	rax, rax
 23092  	je	.LBB3_704                       # rsi == 16: only one group, so the two-groups-per-iteration loop is skipped
 23093  # %bb.412:
 23094  	mov	rax, r8
 23095  	and	rax, -2
 23096  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23097  	xor	edi, edi                        # rdi = byte index, starts at 0
 23098  	movdqa	xmm8, xmmword ptr [rip + .LCPI3_10] # xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 23099  .LBB3_413:                              # =>This Inner Loop Header: Depth=1
 23100  	pmovsxbd	xmm4, dword ptr [rdx + rdi + 12]    # sign-extend 4 bytes to 4 dwords (bytes 12..15)
 23101  	pmovsxbd	xmm1, dword ptr [rdx + rdi + 8]
 23102  	pmovsxbd	xmm3, dword ptr [rdx + rdi + 4]
 23103  	pmovsxbd	xmm2, dword ptr [rdx + rdi]
 23104  	movdqa	xmm5, xmm2
 23105  	psrad	xmm5, 7                         # s = x >> 7 (arithmetic): 0 for x >= 0, -1 for x < 0
 23106  	movdqa	xmm6, xmm3
 23107  	psrad	xmm6, 7
 23108  	movdqa	xmm7, xmm1
 23109  	psrad	xmm7, 7
 23110  	movdqa	xmm0, xmm4
 23111  	psrad	xmm0, 7
 23112  	paddd	xmm4, xmm0                      # x + s ...
 23113  	paddd	xmm1, xmm7
 23114  	paddd	xmm3, xmm6
 23115  	paddd	xmm2, xmm5
 23116  	pxor	xmm2, xmm5                      # ... then (x + s) ^ s = |x|
 23117  	pxor	xmm3, xmm6
 23118  	pxor	xmm1, xmm7
 23119  	pxor	xmm4, xmm0
 23120  	pand	xmm4, xmm8                      # keep the low byte of each dword so the saturating packs below cannot clamp
 23121  	pand	xmm1, xmm8
 23122  	packusdw	xmm1, xmm4              # dwords -> words (bytes 8..15)
 23123  	pand	xmm3, xmm8
 23124  	pand	xmm2, xmm8
 23125  	packusdw	xmm2, xmm3              # dwords -> words (bytes 0..7)
 23126  	packuswb	xmm2, xmm1              # words -> 16 result bytes
 23127  	movdqu	xmmword ptr [rcx + rdi], xmm2
 23128  	pmovsxbd	xmm4, dword ptr [rdx + rdi + 28]    # second group: same sequence on bytes rdi+16 .. rdi+31
 23129  	pmovsxbd	xmm1, dword ptr [rdx + rdi + 24]
 23130  	pmovsxbd	xmm3, dword ptr [rdx + rdi + 20]
 23131  	pmovsxbd	xmm2, dword ptr [rdx + rdi + 16]
 23132  	movdqa	xmm0, xmm2
 23133  	psrad	xmm0, 7
 23134  	movdqa	xmm5, xmm3
 23135  	psrad	xmm5, 7
 23136  	movdqa	xmm6, xmm1
 23137  	psrad	xmm6, 7
 23138  	movdqa	xmm7, xmm4
 23139  	psrad	xmm7, 7
 23140  	paddd	xmm4, xmm7
 23141  	paddd	xmm1, xmm6
 23142  	paddd	xmm3, xmm5
 23143  	paddd	xmm2, xmm0
 23144  	pxor	xmm2, xmm0
 23145  	pxor	xmm3, xmm5
 23146  	pxor	xmm1, xmm6
 23147  	pxor	xmm4, xmm7
 23148  	pand	xmm4, xmm8
 23149  	pand	xmm1, xmm8
 23150  	packusdw	xmm1, xmm4
 23151  	pand	xmm3, xmm8
 23152  	pand	xmm2, xmm8
 23153  	packusdw	xmm2, xmm3
 23154  	packuswb	xmm2, xmm1
 23155  	movdqu	xmmword ptr [rcx + rdi + 16], xmm2
 23156  	add	rdi, 32                         # 32 bytes handled per iteration
 23157  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23158  	jne	.LBB3_413
 23159  	jmp	.LBB3_705                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23160  .LBB3_421:                              # vector path: per 64-bit integer lane, out = 0 - x (wrapping), reading [rdx] and writing [rcx]
 23161  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23162  	and	esi, -4                         # rsi = r9d rounded down to a multiple of 4 (one group = two 16-byte vectors)
 23163  	lea	rax, [rsi - 4]
 23164  	mov	r8, rax
 23165  	shr	r8, 2
 23166  	add	r8, 1                           # r8 = (rsi - 4) / 4 + 1 (= rsi / 4 when rsi >= 4): number of groups
 23167  	test	rax, rax
 23168  	je	.LBB3_712                       # rsi == 4: only one group, so the two-groups-per-iteration loop is skipped
 23169  # %bb.422:
 23170  	mov	rax, r8
 23171  	and	rax, -2
 23172  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23173  	xor	edi, edi                        # rdi = element index, starts at 0
 23174  .LBB3_423:                              # =>This Inner Loop Header: Depth=1
 23175  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
 23176  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
 23177  	pxor	xmm2, xmm2                      # xmm2 = 0
 23178  	psubq	xmm2, xmm0                      # xmm2 = 0 - x for each qword
 23179  	pxor	xmm0, xmm0                      # xmm0 is free again; reuse it as the zero for the second vector
 23180  	psubq	xmm0, xmm1
 23181  	movdqu	xmmword ptr [rcx + 8*rdi], xmm2
 23182  	movdqu	xmmword ptr [rcx + 8*rdi + 16], xmm0
 23183  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi + 32]    # second group: elements rdi+4 .. rdi+7
 23184  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 48]
 23185  	pxor	xmm2, xmm2
 23186  	psubq	xmm2, xmm0
 23187  	pxor	xmm0, xmm0
 23188  	psubq	xmm0, xmm1
 23189  	movdqu	xmmword ptr [rcx + 8*rdi + 32], xmm2
 23190  	movdqu	xmmword ptr [rcx + 8*rdi + 48], xmm0
 23191  	add	rdi, 8                          # 8 elements handled per iteration
 23192  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23193  	jne	.LBB3_423
 23194  	jmp	.LBB3_713                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23195  .LBB3_424:                              # vector path: per 64-bit integer lane, out = (x != 0) ? 1 : 0
 23196  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23197  	and	esi, -4                         # rsi = r9d rounded down to a multiple of 4 (one group = two 16-byte vectors)
 23198  	lea	rax, [rsi - 4]
 23199  	mov	r8, rax
 23200  	shr	r8, 2
 23201  	add	r8, 1                           # r8 = (rsi - 4) / 4 + 1 (= rsi / 4 when rsi >= 4): number of groups
 23202  	test	rax, rax
 23203  	je	.LBB3_720                       # rsi == 4: only one group, so the two-groups-per-iteration loop is skipped
 23204  # %bb.425:
 23205  	mov	rax, r8
 23206  	and	rax, -2
 23207  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23208  	xor	edi, edi                        # rdi = element index, starts at 0
 23209  	pxor	xmm0, xmm0                      # xmm0 = 0, the comparison operand
 23210  	movdqa	xmm1, xmmword ptr [rip + .LCPI3_4] # xmm1 = [1,1]
 23211  .LBB3_426:                              # =>This Inner Loop Header: Depth=1
 23212  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi]
 23213  	movdqu	xmm3, xmmword ptr [rdx + 8*rdi + 16]
 23214  	pcmpeqq	xmm2, xmm0                      # all-ones where x == 0
 23215  	pandn	xmm2, xmm1                      # (~mask) & 1: 1 where x != 0, 0 where x == 0
 23216  	pcmpeqq	xmm3, xmm0
 23217  	pandn	xmm3, xmm1
 23218  	movdqu	xmmword ptr [rcx + 8*rdi], xmm2
 23219  	movdqu	xmmword ptr [rcx + 8*rdi + 16], xmm3
 23220  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 32]    # second group: elements rdi+4 .. rdi+7
 23221  	movdqu	xmm3, xmmword ptr [rdx + 8*rdi + 48]
 23222  	pcmpeqq	xmm2, xmm0
 23223  	pandn	xmm2, xmm1
 23224  	pcmpeqq	xmm3, xmm0
 23225  	pandn	xmm3, xmm1
 23226  	movdqu	xmmword ptr [rcx + 8*rdi + 32], xmm2
 23227  	movdqu	xmmword ptr [rcx + 8*rdi + 48], xmm3
 23228  	add	rdi, 8                          # 8 elements handled per iteration
 23229  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23230  	jne	.LBB3_426
 23231  	jmp	.LBB3_721                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23232  .LBB3_427:                              # set-up only: computes group counts, then branches to a loop at .LBB3_550 / .LBB3_552
 23233  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23234  	and	esi, -4                         # rsi = r9d rounded down to a multiple of 4
 23235  	lea	rax, [rsi - 4]
 23236  	mov	rdi, rax
 23237  	shr	rdi, 2
 23238  	add	rdi, 1                          # rdi = (rsi - 4) / 4 + 1 (= rsi / 4 when rsi >= 4): number of 4-element groups
 23239  	mov	r8d, edi
 23240  	and	r8d, 3                          # r8 = group count mod 4
 23241  	cmp	rax, 12
 23242  	jae	.LBB3_550                       # rax >= 12 (unsigned), i.e. at least four groups; presumably a 4x-unrolled loop -- TODO confirm
 23243  # %bb.428:
 23244  	xor	eax, eax                        # fewer than four groups: enter .LBB3_552 with rax = 0
 23245  	jmp	.LBB3_552
 23246  .LBB3_429:                              # set-up only: computes group counts, then branches to a loop at .LBB3_560 / .LBB3_562
 23247  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23248  	and	esi, -4                         # rsi = r9d rounded down to a multiple of 4
 23249  	lea	rax, [rsi - 4]
 23250  	mov	rdi, rax
 23251  	shr	rdi, 2
 23252  	add	rdi, 1                          # rdi = (rsi - 4) / 4 + 1 (= rsi / 4 when rsi >= 4): number of 4-element groups
 23253  	mov	r8d, edi
 23254  	and	r8d, 3                          # r8 = group count mod 4
 23255  	cmp	rax, 12
 23256  	jae	.LBB3_560                       # rax >= 12 (unsigned), i.e. at least four groups; presumably a 4x-unrolled loop -- TODO confirm
 23257  # %bb.430:
 23258  	xor	eax, eax                        # fewer than four groups: enter .LBB3_562 with rax = 0
 23259  	jmp	.LBB3_562
 23260  .LBB3_438:                              # vector path: per 16-bit lane, out = 0 - x (wrapping), reading [rdx] and writing [rcx]
 23261  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23262  	and	esi, -16                        # rsi = r9d rounded down to a multiple of 16 (one group = two 16-byte vectors)
 23263  	lea	rax, [rsi - 16]
 23264  	mov	r8, rax
 23265  	shr	r8, 4
 23266  	add	r8, 1                           # r8 = (rsi - 16) / 16 + 1 (= rsi / 16 when rsi >= 16): number of groups
 23267  	test	rax, rax
 23268  	je	.LBB3_728                       # rsi == 16: only one group, so the two-groups-per-iteration loop is skipped
 23269  # %bb.439:
 23270  	mov	rax, r8
 23271  	and	rax, -2
 23272  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23273  	xor	edi, edi                        # rdi = element index, starts at 0
 23274  .LBB3_440:                              # =>This Inner Loop Header: Depth=1
 23275  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]
 23276  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
 23277  	pxor	xmm2, xmm2                      # xmm2 = 0
 23278  	psubw	xmm2, xmm0                      # xmm2 = 0 - x for each word
 23279  	pxor	xmm0, xmm0                      # xmm0 is free again; reuse it as the zero for the second vector
 23280  	psubw	xmm0, xmm1
 23281  	movdqu	xmmword ptr [rcx + 2*rdi], xmm2
 23282  	movdqu	xmmword ptr [rcx + 2*rdi + 16], xmm0
 23283  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32]    # second group: elements rdi+16 .. rdi+31
 23284  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
 23285  	pxor	xmm2, xmm2
 23286  	psubw	xmm2, xmm0
 23287  	pxor	xmm0, xmm0
 23288  	psubw	xmm0, xmm1
 23289  	movdqu	xmmword ptr [rcx + 2*rdi + 32], xmm2
 23290  	movdqu	xmmword ptr [rcx + 2*rdi + 48], xmm0
 23291  	add	rdi, 32                         # 32 elements handled per iteration
 23292  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23293  	jne	.LBB3_440
 23294  	jmp	.LBB3_729                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23295  .LBB3_441:                              # vector path: per 16-bit lane, out = 0 - x (wrapping), reading [rdx] and writing [rcx]
 23296  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23297  	and	esi, -16                        # rsi = r9d rounded down to a multiple of 16 (one group = two 16-byte vectors)
 23298  	lea	rax, [rsi - 16]
 23299  	mov	r8, rax
 23300  	shr	r8, 4
 23301  	add	r8, 1                           # r8 = (rsi - 16) / 16 + 1 (= rsi / 16 when rsi >= 16): number of groups
 23302  	test	rax, rax
 23303  	je	.LBB3_736                       # rsi == 16: only one group, so the two-groups-per-iteration loop is skipped
 23304  # %bb.442:
 23305  	mov	rax, r8
 23306  	and	rax, -2
 23307  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23308  	xor	edi, edi                        # rdi = element index, starts at 0
 23309  .LBB3_443:                              # =>This Inner Loop Header: Depth=1
 23310  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]
 23311  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
 23312  	pxor	xmm2, xmm2                      # xmm2 = 0
 23313  	psubw	xmm2, xmm0                      # xmm2 = 0 - x for each word
 23314  	pxor	xmm0, xmm0                      # xmm0 is free again; reuse it as the zero for the second vector
 23315  	psubw	xmm0, xmm1
 23316  	movdqu	xmmword ptr [rcx + 2*rdi], xmm2
 23317  	movdqu	xmmword ptr [rcx + 2*rdi + 16], xmm0
 23318  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32]    # second group: elements rdi+16 .. rdi+31
 23319  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
 23320  	pxor	xmm2, xmm2
 23321  	psubw	xmm2, xmm0
 23322  	pxor	xmm0, xmm0
 23323  	psubw	xmm0, xmm1
 23324  	movdqu	xmmword ptr [rcx + 2*rdi + 32], xmm2
 23325  	movdqu	xmmword ptr [rcx + 2*rdi + 48], xmm0
 23326  	add	rdi, 32                         # 32 elements handled per iteration
 23327  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23328  	jne	.LBB3_443
 23329  	jmp	.LBB3_737                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23330  .LBB3_444:                              # vector path: per 16-bit lane, out = 0 - x (wrapping), reading [rdx] and writing [rcx]
 23331  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23332  	and	esi, -16                        # rsi = r9d rounded down to a multiple of 16 (one group = two 16-byte vectors)
 23333  	lea	rax, [rsi - 16]
 23334  	mov	r8, rax
 23335  	shr	r8, 4
 23336  	add	r8, 1                           # r8 = (rsi - 16) / 16 + 1 (= rsi / 16 when rsi >= 16): number of groups
 23337  	test	rax, rax
 23338  	je	.LBB3_744                       # rsi == 16: only one group, so the two-groups-per-iteration loop is skipped
 23339  # %bb.445:
 23340  	mov	rax, r8
 23341  	and	rax, -2
 23342  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23343  	xor	edi, edi                        # rdi = element index, starts at 0
 23344  .LBB3_446:                              # =>This Inner Loop Header: Depth=1
 23345  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]
 23346  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
 23347  	pxor	xmm2, xmm2                      # xmm2 = 0
 23348  	psubw	xmm2, xmm0                      # xmm2 = 0 - x for each word
 23349  	pxor	xmm0, xmm0                      # xmm0 is free again; reuse it as the zero for the second vector
 23350  	psubw	xmm0, xmm1
 23351  	movdqu	xmmword ptr [rcx + 2*rdi], xmm2
 23352  	movdqu	xmmword ptr [rcx + 2*rdi + 16], xmm0
 23353  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi + 32]    # second group: elements rdi+16 .. rdi+31
 23354  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 48]
 23355  	pxor	xmm2, xmm2
 23356  	psubw	xmm2, xmm0
 23357  	pxor	xmm0, xmm0
 23358  	psubw	xmm0, xmm1
 23359  	movdqu	xmmword ptr [rcx + 2*rdi + 32], xmm2
 23360  	movdqu	xmmword ptr [rcx + 2*rdi + 48], xmm0
 23361  	add	rdi, 32                         # 32 elements handled per iteration
 23362  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23363  	jne	.LBB3_446
 23364  	jmp	.LBB3_745                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23365  .LBB3_447:                              # vector path: per 16-bit lane, out = (x != 0) ? 1 : 0
 23366  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23367  	and	esi, -16                        # rsi = r9d rounded down to a multiple of 16 (one group = two 16-byte vectors)
 23368  	lea	rax, [rsi - 16]
 23369  	mov	r8, rax
 23370  	shr	r8, 4
 23371  	add	r8, 1                           # r8 = (rsi - 16) / 16 + 1 (= rsi / 16 when rsi >= 16): number of groups
 23372  	test	rax, rax
 23373  	je	.LBB3_752                       # rsi == 16: only one group, so the two-groups-per-iteration loop is skipped
 23374  # %bb.448:
 23375  	mov	rax, r8
 23376  	and	rax, -2
 23377  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23378  	xor	edi, edi                        # rdi = element index, starts at 0
 23379  	pxor	xmm0, xmm0                      # xmm0 = 0, the comparison operand
 23380  	movdqa	xmm1, xmmword ptr [rip + .LCPI3_5] # xmm1 = [1,1,1,1,1,1,1,1]
 23381  .LBB3_449:                              # =>This Inner Loop Header: Depth=1
 23382  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi]
 23383  	movdqu	xmm3, xmmword ptr [rdx + 2*rdi + 16]
 23384  	pcmpeqw	xmm2, xmm0                      # all-ones where x == 0
 23385  	pandn	xmm2, xmm1                      # (~mask) & 1: 1 where x != 0, 0 where x == 0
 23386  	pcmpeqw	xmm3, xmm0
 23387  	pandn	xmm3, xmm1
 23388  	movdqu	xmmword ptr [rcx + 2*rdi], xmm2
 23389  	movdqu	xmmword ptr [rcx + 2*rdi + 16], xmm3
 23390  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 32]    # second group: elements rdi+16 .. rdi+31
 23391  	movdqu	xmm3, xmmword ptr [rdx + 2*rdi + 48]
 23392  	pcmpeqw	xmm2, xmm0
 23393  	pandn	xmm2, xmm1
 23394  	pcmpeqw	xmm3, xmm0
 23395  	pandn	xmm3, xmm1
 23396  	movdqu	xmmword ptr [rcx + 2*rdi + 32], xmm2
 23397  	movdqu	xmmword ptr [rcx + 2*rdi + 48], xmm3
 23398  	add	rdi, 32                         # 32 elements handled per iteration
 23399  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23400  	jne	.LBB3_449
 23401  	jmp	.LBB3_753                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23402  .LBB3_450:                              # vector path: per signed 16-bit lane, out = x > 0 ? 1 : (x != 0 ? -1 : 0)
 23403  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23404  	and	esi, -16                        # rsi = r9d rounded down to a multiple of 16 (one group = two 16-byte vectors)
 23405  	lea	rax, [rsi - 16]
 23406  	mov	r8, rax
 23407  	shr	r8, 4
 23408  	add	r8, 1                           # r8 = (rsi - 16) / 16 + 1 (= rsi / 16 when rsi >= 16): number of groups
 23409  	test	rax, rax
 23410  	je	.LBB3_760                       # rsi == 16: only one group, so the two-groups-per-iteration loop is skipped
 23411  # %bb.451:
 23412  	mov	rax, r8
 23413  	and	rax, -2
 23414  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23415  	xor	edi, edi                        # rdi = element index, starts at 0
 23416  	pxor	xmm2, xmm2                      # xmm2 = all zeros
 23417  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 23418  	movdqa	xmm4, xmmword ptr [rip + .LCPI3_5] # xmm4 = [1,1,1,1,1,1,1,1]
 23419  .LBB3_452:                              # =>This Inner Loop Header: Depth=1
 23420  	movdqu	xmm5, xmmword ptr [rdx + 2*rdi]
 23421  	movdqu	xmm6, xmmword ptr [rdx + 2*rdi + 16]
 23422  	movdqa	xmm0, xmm4
 23423  	pcmpgtw	xmm0, xmm5                      # xmm0 = all-ones where 1 > x (signed), i.e. x <= 0
 23424  	pcmpeqw	xmm5, xmm2                      # all-ones where x == 0
 23425  	pxor	xmm5, xmm3                      # inverted: -1 where x != 0, 0 where x == 0
 23426  	movdqa	xmm1, xmm4
 23427  	pcmpgtw	xmm1, xmm6
 23428  	pcmpeqw	xmm6, xmm2
 23429  	pxor	xmm6, xmm3
 23430  	movdqa	xmm7, xmm4                      # start from 1 in every word
 23431  	pblendvb	xmm7, xmm5, xmm0        # byte-wise blend; the word-wide selector in xmm0 picks both bytes of a lane together
 23432  	movdqa	xmm5, xmm4
 23433  	movdqa	xmm0, xmm1                      # move the second selector into xmm0 for the next blend
 23434  	pblendvb	xmm5, xmm6, xmm0
 23435  	movdqu	xmmword ptr [rcx + 2*rdi], xmm7
 23436  	movdqu	xmmword ptr [rcx + 2*rdi + 16], xmm5
 23437  	movdqu	xmm5, xmmword ptr [rdx + 2*rdi + 32]    # second group: same sequence on elements rdi+16 .. rdi+31
 23438  	movdqu	xmm6, xmmword ptr [rdx + 2*rdi + 48]
 23439  	movdqa	xmm0, xmm4
 23440  	pcmpgtw	xmm0, xmm5
 23441  	pcmpeqw	xmm5, xmm2
 23442  	pxor	xmm5, xmm3
 23443  	movdqa	xmm1, xmm4
 23444  	pcmpgtw	xmm1, xmm6
 23445  	pcmpeqw	xmm6, xmm2
 23446  	pxor	xmm6, xmm3
 23447  	movdqa	xmm7, xmm4
 23448  	pblendvb	xmm7, xmm5, xmm0
 23449  	movdqa	xmm5, xmm4
 23450  	movdqa	xmm0, xmm1
 23451  	pblendvb	xmm5, xmm6, xmm0
 23452  	movdqu	xmmword ptr [rcx + 2*rdi + 32], xmm7
 23453  	movdqu	xmmword ptr [rcx + 2*rdi + 48], xmm5
 23454  	add	rdi, 32                         # 32 elements handled per iteration
 23455  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23456  	jne	.LBB3_452
 23457  	jmp	.LBB3_761                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23458  .LBB3_453:                              # set-up only: computes group counts, then branches to a loop at .LBB3_570 / .LBB3_572
 23459  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23460  	and	esi, -16                        # rsi = r9d rounded down to a multiple of 16
 23461  	lea	rax, [rsi - 16]
 23462  	mov	rdi, rax
 23463  	shr	rdi, 4
 23464  	add	rdi, 1                          # rdi = (rsi - 16) / 16 + 1 (= rsi / 16 when rsi >= 16): number of 16-element groups
 23465  	mov	r8d, edi
 23466  	and	r8d, 3                          # r8 = group count mod 4
 23467  	cmp	rax, 48
 23468  	jae	.LBB3_570                       # rax >= 48 (unsigned), i.e. at least four groups; presumably a 4x-unrolled loop -- TODO confirm
 23469  # %bb.454:
 23470  	xor	eax, eax                        # fewer than four groups: enter .LBB3_572 with rax = 0
 23471  	jmp	.LBB3_572
 23472  .LBB3_455:                              # vector path: per signed 16-bit lane, out = low 16 bits of |x|, computed in 32-bit lanes
 23473  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23474  	and	esi, -8                         # rsi = r9d rounded down to a multiple of 8 (one group = one 16-byte output vector)
 23475  	lea	rax, [rsi - 8]
 23476  	mov	r8, rax
 23477  	shr	r8, 3
 23478  	add	r8, 1                           # r8 = (rsi - 8) / 8 + 1 (= rsi / 8 when rsi >= 8): number of groups
 23479  	test	rax, rax
 23480  	je	.LBB3_769                       # rsi == 8: only one group, so the two-groups-per-iteration loop is skipped
 23481  # %bb.456:
 23482  	mov	rax, r8
 23483  	and	rax, -2
 23484  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23485  	xor	edi, edi                        # rdi = element index, starts at 0
 23486  	pxor	xmm0, xmm0                      # xmm0 = 0, blended in below to clear the upper word of each dword
 23487  .LBB3_457:                              # =>This Inner Loop Header: Depth=1
 23488  	pmovsxwd	xmm1, qword ptr [rdx + 2*rdi + 8]    # sign-extend 4 words to 4 dwords (elements rdi+4 .. rdi+7)
 23489  	pmovsxwd	xmm2, qword ptr [rdx + 2*rdi]
 23490  	movdqa	xmm3, xmm2
 23491  	psrad	xmm3, 15                        # s = x >> 15 (arithmetic): 0 for x >= 0, -1 for x < 0
 23492  	movdqa	xmm4, xmm1
 23493  	psrad	xmm4, 15
 23494  	paddd	xmm1, xmm4                      # x + s ...
 23495  	paddd	xmm2, xmm3
 23496  	pxor	xmm2, xmm3                      # ... then (x + s) ^ s = |x|
 23497  	pxor	xmm1, xmm4
 23498  	pblendw	xmm1, xmm0, 170                 # xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
 23499  	pblendw	xmm2, xmm0, 170                 # xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
 23500  	packusdw	xmm2, xmm1              # dwords -> 8 result words; upper words were zeroed above, so values fit 16 bits unsigned and are not clamped
 23501  	movdqu	xmmword ptr [rcx + 2*rdi], xmm2
 23502  	pmovsxwd	xmm1, qword ptr [rdx + 2*rdi + 24]    # second group: same sequence on elements rdi+8 .. rdi+15
 23503  	pmovsxwd	xmm2, qword ptr [rdx + 2*rdi + 16]
 23504  	movdqa	xmm3, xmm2
 23505  	psrad	xmm3, 15
 23506  	movdqa	xmm4, xmm1
 23507  	psrad	xmm4, 15
 23508  	paddd	xmm1, xmm4
 23509  	paddd	xmm2, xmm3
 23510  	pxor	xmm2, xmm3
 23511  	pxor	xmm1, xmm4
 23512  	pblendw	xmm1, xmm0, 170                 # xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
 23513  	pblendw	xmm2, xmm0, 170                 # xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
 23514  	packusdw	xmm2, xmm1
 23515  	movdqu	xmmword ptr [rcx + 2*rdi + 16], xmm2
 23516  	add	rdi, 16                         # 16 elements handled per iteration
 23517  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23518  	jne	.LBB3_457
 23519  	jmp	.LBB3_770                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23520  .LBB3_458:                              # set-up only: computes group counts, then branches to a loop at .LBB3_580 / .LBB3_582
 23521  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23522  	and	esi, -16                        # rsi = r9d rounded down to a multiple of 16
 23523  	lea	rax, [rsi - 16]
 23524  	mov	rdi, rax
 23525  	shr	rdi, 4
 23526  	add	rdi, 1                          # rdi = (rsi - 16) / 16 + 1 (= rsi / 16 when rsi >= 16): number of 16-element groups
 23527  	mov	r8d, edi
 23528  	and	r8d, 3                          # r8 = group count mod 4
 23529  	cmp	rax, 48
 23530  	jae	.LBB3_580                       # rax >= 48 (unsigned), i.e. at least four groups; presumably a 4x-unrolled loop -- TODO confirm
 23531  # %bb.459:
 23532  	xor	eax, eax                        # fewer than four groups: enter .LBB3_582 with rax = 0
 23533  	jmp	.LBB3_582
 23534  .LBB3_460:                              # vector path: per signed 16-bit lane, out = low 16 bits of |x|, computed in 32-bit lanes
 23535  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23536  	and	esi, -8                         # rsi = r9d rounded down to a multiple of 8 (one group = one 16-byte output vector)
 23537  	lea	rax, [rsi - 8]
 23538  	mov	r8, rax
 23539  	shr	r8, 3
 23540  	add	r8, 1                           # r8 = (rsi - 8) / 8 + 1 (= rsi / 8 when rsi >= 8): number of groups
 23541  	test	rax, rax
 23542  	je	.LBB3_777                       # rsi == 8: only one group, so the two-groups-per-iteration loop is skipped
 23543  # %bb.461:
 23544  	mov	rax, r8
 23545  	and	rax, -2
 23546  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23547  	xor	edi, edi                        # rdi = element index, starts at 0
 23548  	pxor	xmm0, xmm0                      # xmm0 = 0, blended in below to clear the upper word of each dword
 23549  .LBB3_462:                              # =>This Inner Loop Header: Depth=1
 23550  	pmovsxwd	xmm1, qword ptr [rdx + 2*rdi + 8]    # sign-extend 4 words to 4 dwords (elements rdi+4 .. rdi+7)
 23551  	pmovsxwd	xmm2, qword ptr [rdx + 2*rdi]
 23552  	movdqa	xmm3, xmm2
 23553  	psrad	xmm3, 15                        # s = x >> 15 (arithmetic): 0 for x >= 0, -1 for x < 0
 23554  	movdqa	xmm4, xmm1
 23555  	psrad	xmm4, 15
 23556  	paddd	xmm1, xmm4                      # x + s ...
 23557  	paddd	xmm2, xmm3
 23558  	pxor	xmm2, xmm3                      # ... then (x + s) ^ s = |x|
 23559  	pxor	xmm1, xmm4
 23560  	pblendw	xmm1, xmm0, 170                 # xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
 23561  	pblendw	xmm2, xmm0, 170                 # xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
 23562  	packusdw	xmm2, xmm1              # dwords -> 8 result words; upper words were zeroed above, so values fit 16 bits unsigned and are not clamped
 23563  	movdqu	xmmword ptr [rcx + 2*rdi], xmm2
 23564  	pmovsxwd	xmm1, qword ptr [rdx + 2*rdi + 24]    # second group: same sequence on elements rdi+8 .. rdi+15
 23565  	pmovsxwd	xmm2, qword ptr [rdx + 2*rdi + 16]
 23566  	movdqa	xmm3, xmm2
 23567  	psrad	xmm3, 15
 23568  	movdqa	xmm4, xmm1
 23569  	psrad	xmm4, 15
 23570  	paddd	xmm1, xmm4
 23571  	paddd	xmm2, xmm3
 23572  	pxor	xmm2, xmm3
 23573  	pxor	xmm1, xmm4
 23574  	pblendw	xmm1, xmm0, 170                 # xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
 23575  	pblendw	xmm2, xmm0, 170                 # xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
 23576  	packusdw	xmm2, xmm1
 23577  	movdqu	xmmword ptr [rcx + 2*rdi + 16], xmm2
 23578  	add	rdi, 16                         # 16 elements handled per iteration
 23579  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23580  	jne	.LBB3_462
 23581  	jmp	.LBB3_778                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23582  .LBB3_463:                              # vector path: per 64-bit integer lane, out = 0 - x (wrapping), reading [rdx] and writing [rcx]
 23583  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23584  	and	esi, -4                         # rsi = r9d rounded down to a multiple of 4 (one group = two 16-byte vectors)
 23585  	lea	rax, [rsi - 4]
 23586  	mov	r8, rax
 23587  	shr	r8, 2
 23588  	add	r8, 1                           # r8 = (rsi - 4) / 4 + 1 (= rsi / 4 when rsi >= 4): number of groups
 23589  	test	rax, rax
 23590  	je	.LBB3_785                       # rsi == 4: only one group, so the two-groups-per-iteration loop is skipped
 23591  # %bb.464:
 23592  	mov	rax, r8
 23593  	and	rax, -2
 23594  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23595  	xor	edi, edi                        # rdi = element index, starts at 0
 23596  .LBB3_465:                              # =>This Inner Loop Header: Depth=1
 23597  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
 23598  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
 23599  	pxor	xmm2, xmm2                      # xmm2 = 0
 23600  	psubq	xmm2, xmm0                      # xmm2 = 0 - x for each qword
 23601  	pxor	xmm0, xmm0                      # xmm0 is free again; reuse it as the zero for the second vector
 23602  	psubq	xmm0, xmm1
 23603  	movdqu	xmmword ptr [rcx + 8*rdi], xmm2
 23604  	movdqu	xmmword ptr [rcx + 8*rdi + 16], xmm0
 23605  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi + 32]    # second group: elements rdi+4 .. rdi+7
 23606  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 48]
 23607  	pxor	xmm2, xmm2
 23608  	psubq	xmm2, xmm0
 23609  	pxor	xmm0, xmm0
 23610  	psubq	xmm0, xmm1
 23611  	movdqu	xmmword ptr [rcx + 8*rdi + 32], xmm2
 23612  	movdqu	xmmword ptr [rcx + 8*rdi + 48], xmm0
 23613  	add	rdi, 8                          # 8 elements handled per iteration
 23614  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23615  	jne	.LBB3_465
 23616  	jmp	.LBB3_786                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23617  .LBB3_466:                              # vector path: XOR every 32-bit lane read from [rdx] with the .LCPI3_7 mask, store to [rcx]
 23618  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23619  	and	esi, -8                         # rsi = r9d rounded down to a multiple of 8 (one group = two 16-byte vectors)
 23620  	lea	rax, [rsi - 8]
 23621  	mov	r8, rax
 23622  	shr	r8, 3
 23623  	add	r8, 1                           # r8 = (rsi - 8) / 8 + 1 (= rsi / 8 when rsi >= 8): number of groups
 23624  	test	rax, rax
 23625  	je	.LBB3_793                       # rsi == 8: only one group, so the two-groups-per-iteration loop is skipped
 23626  # %bb.467:
 23627  	mov	rax, r8
 23628  	and	rax, -2
 23629  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23630  	xor	edi, edi                        # rdi = element index, starts at 0
 23631  	movapd	xmm0, xmmword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 23632  .LBB3_468:                              # =>This Inner Loop Header: Depth=1
 23633  	movupd	xmm1, xmmword ptr [rdx + 4*rdi]    # first group: elements rdi .. rdi+7
 23634  	movupd	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 23635  	xorpd	xmm1, xmm0                      # bitwise XOR; flips the bits selected by the mask (the sign bit of each 32-bit lane, per the annotation)
 23636  	xorpd	xmm2, xmm0
 23637  	movupd	xmmword ptr [rcx + 4*rdi], xmm1
 23638  	movupd	xmmword ptr [rcx + 4*rdi + 16], xmm2
 23639  	movupd	xmm1, xmmword ptr [rdx + 4*rdi + 32]    # second group: elements rdi+8 .. rdi+15
 23640  	movupd	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 23641  	xorpd	xmm1, xmm0
 23642  	xorpd	xmm2, xmm0
 23643  	movupd	xmmword ptr [rcx + 4*rdi + 32], xmm1
 23644  	movupd	xmmword ptr [rcx + 4*rdi + 48], xmm2
 23645  	add	rdi, 16                         # 16 elements handled per iteration
 23646  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23647  	jne	.LBB3_468
 23648  	jmp	.LBB3_794                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23649  .LBB3_469:                              # vector path: per 64-bit integer lane, out = 0 - x (wrapping), reading [rdx] and writing [rcx]
 23650  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23651  	and	esi, -4                         # rsi = r9d rounded down to a multiple of 4 (one group = two 16-byte vectors)
 23652  	lea	rax, [rsi - 4]
 23653  	mov	r8, rax
 23654  	shr	r8, 2
 23655  	add	r8, 1                           # r8 = (rsi - 4) / 4 + 1 (= rsi / 4 when rsi >= 4): number of groups
 23656  	test	rax, rax
 23657  	je	.LBB3_803                       # rsi == 4: only one group, so the two-groups-per-iteration loop is skipped
 23658  # %bb.470:
 23659  	mov	rax, r8
 23660  	and	rax, -2
 23661  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23662  	xor	edi, edi                        # rdi = element index, starts at 0
 23663  .LBB3_471:                              # =>This Inner Loop Header: Depth=1
 23664  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
 23665  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
 23666  	pxor	xmm2, xmm2                      # xmm2 = 0
 23667  	psubq	xmm2, xmm0                      # xmm2 = 0 - x for each qword
 23668  	pxor	xmm0, xmm0                      # xmm0 is free again; reuse it as the zero for the second vector
 23669  	psubq	xmm0, xmm1
 23670  	movdqu	xmmword ptr [rcx + 8*rdi], xmm2
 23671  	movdqu	xmmword ptr [rcx + 8*rdi + 16], xmm0
 23672  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi + 32]    # second group: elements rdi+4 .. rdi+7
 23673  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 48]
 23674  	pxor	xmm2, xmm2
 23675  	psubq	xmm2, xmm0
 23676  	pxor	xmm0, xmm0
 23677  	psubq	xmm0, xmm1
 23678  	movdqu	xmmword ptr [rcx + 8*rdi + 32], xmm2
 23679  	movdqu	xmmword ptr [rcx + 8*rdi + 48], xmm0
 23680  	add	rdi, 8                          # 8 elements handled per iteration
 23681  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23682  	jne	.LBB3_471
 23683  	jmp	.LBB3_804                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23684  .LBB3_472:                              # vector path: XOR every 32-bit lane read from [rdx] with the .LCPI3_7 mask, store to [rcx]
 23685  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23686  	and	esi, -8                         # rsi = r9d rounded down to a multiple of 8 (one group = two 16-byte vectors)
 23687  	lea	rax, [rsi - 8]
 23688  	mov	r8, rax
 23689  	shr	r8, 3
 23690  	add	r8, 1                           # r8 = (rsi - 8) / 8 + 1 (= rsi / 8 when rsi >= 8): number of groups
 23691  	test	rax, rax
 23692  	je	.LBB3_811                       # rsi == 8: only one group, so the two-groups-per-iteration loop is skipped
 23693  # %bb.473:
 23694  	mov	rax, r8
 23695  	and	rax, -2
 23696  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23697  	xor	edi, edi                        # rdi = element index, starts at 0
 23698  	movapd	xmm0, xmmword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 23699  .LBB3_474:                              # =>This Inner Loop Header: Depth=1
 23700  	movupd	xmm1, xmmword ptr [rdx + 4*rdi]    # first group: elements rdi .. rdi+7
 23701  	movupd	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 23702  	xorpd	xmm1, xmm0                      # bitwise XOR; flips the bits selected by the mask (the sign bit of each 32-bit lane, per the annotation)
 23703  	xorpd	xmm2, xmm0
 23704  	movupd	xmmword ptr [rcx + 4*rdi], xmm1
 23705  	movupd	xmmword ptr [rcx + 4*rdi + 16], xmm2
 23706  	movupd	xmm1, xmmword ptr [rdx + 4*rdi + 32]    # second group: elements rdi+8 .. rdi+15
 23707  	movupd	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 23708  	xorpd	xmm1, xmm0
 23709  	xorpd	xmm2, xmm0
 23710  	movupd	xmmword ptr [rcx + 4*rdi + 32], xmm1
 23711  	movupd	xmmword ptr [rcx + 4*rdi + 48], xmm2
 23712  	add	rdi, 16                         # 16 elements handled per iteration
 23713  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23714  	jne	.LBB3_474
 23715  	jmp	.LBB3_812                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23716  .LBB3_475:                              # vector path: per signed 64-bit lane, out = x > 0 ? 1 : (x != 0 ? -1 : 0)
 23717  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23718  	and	esi, -4                         # rsi = r9d rounded down to a multiple of 4 (one group = two 16-byte vectors)
 23719  	lea	rax, [rsi - 4]
 23720  	mov	r8, rax
 23721  	shr	r8, 2
 23722  	add	r8, 1                           # r8 = (rsi - 4) / 4 + 1 (= rsi / 4 when rsi >= 4): number of groups
 23723  	test	rax, rax
 23724  	je	.LBB3_821                       # rsi == 4: only one group, so the two-groups-per-iteration loop is skipped
 23725  # %bb.476:
 23726  	mov	rax, r8
 23727  	and	rax, -2
 23728  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23729  	xor	edi, edi                        # rdi = element index, starts at 0
 23730  	pxor	xmm2, xmm2                      # xmm2 = all zeros
 23731  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 23732  	movdqa	xmm4, xmmword ptr [rip + .LCPI3_4] # xmm4 = [1,1]
 23733  .LBB3_477:                              # =>This Inner Loop Header: Depth=1
 23734  	movdqu	xmm5, xmmword ptr [rdx + 8*rdi]
 23735  	movdqu	xmm6, xmmword ptr [rdx + 8*rdi + 16]
 23736  	movdqa	xmm0, xmm4
 23737  	pcmpgtq	xmm0, xmm5                      # xmm0 = all-ones where 1 > x (signed), i.e. x <= 0
 23738  	pcmpeqq	xmm5, xmm2                      # all-ones where x == 0
 23739  	pxor	xmm5, xmm3                      # inverted: -1 where x != 0, 0 where x == 0
 23740  	movdqa	xmm1, xmm4
 23741  	pcmpgtq	xmm1, xmm6
 23742  	pcmpeqq	xmm6, xmm2
 23743  	pxor	xmm6, xmm3
 23744  	movdqa	xmm7, xmm4                      # start from 1 in every qword
 23745  	blendvpd	xmm7, xmm5, xmm0        # where x <= 0 take xmm5 (-1 or 0); selects per 64-bit lane on the top bit of xmm0
 23746  	movdqa	xmm5, xmm4
 23747  	movdqa	xmm0, xmm1                      # move the second selector into xmm0 for the next blend
 23748  	blendvpd	xmm5, xmm6, xmm0
 23749  	movupd	xmmword ptr [rcx + 8*rdi], xmm7
 23750  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm5
 23751  	movdqu	xmm5, xmmword ptr [rdx + 8*rdi + 32]    # second group: same sequence on elements rdi+4 .. rdi+7
 23752  	movdqu	xmm6, xmmword ptr [rdx + 8*rdi + 48]
 23753  	movdqa	xmm0, xmm4
 23754  	pcmpgtq	xmm0, xmm5
 23755  	pcmpeqq	xmm5, xmm2
 23756  	pxor	xmm5, xmm3
 23757  	movdqa	xmm1, xmm4
 23758  	pcmpgtq	xmm1, xmm6
 23759  	pcmpeqq	xmm6, xmm2
 23760  	pxor	xmm6, xmm3
 23761  	movdqa	xmm7, xmm4
 23762  	blendvpd	xmm7, xmm5, xmm0
 23763  	movdqa	xmm5, xmm4
 23764  	movdqa	xmm0, xmm1
 23765  	blendvpd	xmm5, xmm6, xmm0
 23766  	movupd	xmmword ptr [rcx + 8*rdi + 32], xmm7
 23767  	movupd	xmmword ptr [rcx + 8*rdi + 48], xmm5
 23768  	add	rdi, 8                          # 8 elements handled per iteration
 23769  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23770  	jne	.LBB3_477
 23771  	jmp	.LBB3_822                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23772  .LBB3_478:                              # vector path: per 32-bit float lane, out = (x != 0) ? (+1.0 or -1.0 by x's sign bit) : 0.0
 23773  	mov	esi, eax                        # rax presumably holds the element count on entry -- TODO confirm
 23774  	and	esi, -8                         # rsi = eax rounded down to a multiple of 8 (two 4-lane vectors per iteration)
 23775  	xor	edi, edi                        # rdi = element index, starts at 0
 23776  	xorps	xmm0, xmm0                      # xmm0 = 0.0 in all lanes
 23777  	movdqa	xmm1, xmmword ptr [rip + .LCPI3_3] # xmm1 = [1,1,1,1]
 23778  .LBB3_479:                              # =>This Inner Loop Header: Depth=1
 23779  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi]
 23780  	movdqu	xmm3, xmmword ptr [rdx + 4*rdi + 16]
 23781  	movdqa	xmm4, xmm2
 23782  	psrad	xmm4, 31                        # broadcast the sign bit of the raw float bits: 0 or -1 per lane
 23783  	por	xmm4, xmm1                      # integer +1 (sign clear) or -1 (sign set)
 23784  	movdqa	xmm5, xmm3
 23785  	psrad	xmm5, 31
 23786  	por	xmm5, xmm1
 23787  	cvtdq2ps	xmm4, xmm4              # convert to float: +1.0 or -1.0
 23788  	cvtdq2ps	xmm5, xmm5
 23789  	cmpneqps	xmm2, xmm0              # all-ones where x != 0.0; also true for NaN (unordered compares as not-equal)
 23790  	andps	xmm2, xmm4                      # zero the lane where x == 0.0, otherwise keep the signed 1.0
 23791  	cmpneqps	xmm3, xmm0
 23792  	andps	xmm3, xmm5
 23793  	movups	xmmword ptr [rcx + 4*rdi], xmm2
 23794  	movups	xmmword ptr [rcx + 4*rdi + 16], xmm3
 23795  	add	rdi, 8                          # 8 elements handled per iteration
 23796  	cmp	rsi, rdi
 23797  	jne	.LBB3_479                       # loop until the index reaches the rounded-down count
 23798  # %bb.480:
 23799  	cmp	rsi, rax
 23800  	je	.LBB3_923                       # rounded-down count equals rax: no elements left over
 23801  	jmp	.LBB3_481                       # otherwise continue at .LBB3_481 (presumably the scalar tail -- TODO confirm)
 23802  .LBB3_486:                              # vector path: per signed 64-bit lane, out = x < 0 ? 0 - x : x
 23803  	mov	esi, r9d                        # r9d is presumably the element count -- TODO confirm against caller
 23804  	and	esi, -4                         # rsi = r9d rounded down to a multiple of 4 (one group = two 16-byte vectors)
 23805  	lea	rax, [rsi - 4]
 23806  	mov	r8, rax
 23807  	shr	r8, 2
 23808  	add	r8, 1                           # r8 = (rsi - 4) / 4 + 1 (= rsi / 4 when rsi >= 4): number of groups
 23809  	test	rax, rax
 23810  	je	.LBB3_830                       # rsi == 4: only one group, so the two-groups-per-iteration loop is skipped
 23811  # %bb.487:
 23812  	mov	rax, r8
 23813  	and	rax, -2
 23814  	neg	rax                             # rax = -(group count rounded down to even); the loop counts it up to 0
 23815  	xor	edi, edi                        # rdi = element index, starts at 0
 23816  .LBB3_488:                              # =>This Inner Loop Header: Depth=1
 23817  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 23818  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 23819  	pxor	xmm3, xmm3
 23820  	psubq	xmm3, xmm1                      # xmm3 = 0 - x
 23821  	movdqa	xmm0, xmm1                      # selector = x itself: its top bit is set exactly when x < 0
 23822  	blendvpd	xmm1, xmm3, xmm0        # take 0 - x in lanes where x is negative, keep x elsewhere
 23823  	pxor	xmm3, xmm3
 23824  	psubq	xmm3, xmm2
 23825  	movdqa	xmm0, xmm2
 23826  	blendvpd	xmm2, xmm3, xmm0
 23827  	movupd	xmmword ptr [rcx + 8*rdi], xmm1
 23828  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm2
 23829  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 32]    # second group: same sequence on elements rdi+4 .. rdi+7
 23830  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 23831  	pxor	xmm3, xmm3
 23832  	psubq	xmm3, xmm1
 23833  	movdqa	xmm0, xmm1
 23834  	blendvpd	xmm1, xmm3, xmm0
 23835  	pxor	xmm3, xmm3
 23836  	psubq	xmm3, xmm2
 23837  	movdqa	xmm0, xmm2
 23838  	blendvpd	xmm2, xmm3, xmm0
 23839  	movupd	xmmword ptr [rcx + 8*rdi + 32], xmm1
 23840  	movupd	xmmword ptr [rcx + 8*rdi + 48], xmm2
 23841  	add	rdi, 8                          # 8 elements handled per iteration
 23842  	add	rax, 2                          # two groups done; sets ZF when the counter reaches 0
 23843  	jne	.LBB3_488
 23844  	jmp	.LBB3_831                       # presumably handles a leftover odd group and the scalar tail -- TODO confirm
 23845  .LBB3_489:                              # Vector loop: clear bit 31 of every 32-bit lane (AND with 0x7fffffff)
        	# Loads come from [rdx + 4*rdi] and stores go to [rcx + 4*rdi]; 8 lanes per group, two groups per iteration.
        	# NOTE(review): clearing the top bit looks like a float32 absolute value -- confirm against the dispatch code.
        	# r9d is set before this view; it is used below as the lane count -- TODO confirm.
 23846  	mov	esi, r9d
 23847  	and	esi, -8 # esi = count rounded down to a multiple of 8
 23848  	lea	rax, [rsi - 8]
 23849  	mov	r8, rax
 23850  	shr	r8, 3
 23851  	add	r8, 1 # r8 = number of 8-lane groups
 23852  	test	rax, rax
 23853  	je	.LBB3_838 # exactly one group: bypass the 2x-unrolled loop (target outside this view)
 23854  # %bb.490:
 23855  	mov	rax, r8
 23856  	and	rax, -2
 23857  	neg	rax # rax = -(r8 rounded down to even); stepped by +2 until it reaches zero
 23858  	xor	edi, edi # rdi = lane index
 23859  	movapd	xmm0, xmmword ptr [rip + .LCPI3_9] # xmm0 = [2147483647,2147483647,2147483647,2147483647]
 23860  .LBB3_491:                              # =>This Inner Loop Header: Depth=1
 23861  	movupd	xmm1, xmmword ptr [rdx + 4*rdi]
 23862  	movupd	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 23863  	andpd	xmm1, xmm0
 23864  	andpd	xmm2, xmm0
 23865  	movupd	xmmword ptr [rcx + 4*rdi], xmm1
 23866  	movupd	xmmword ptr [rcx + 4*rdi + 16], xmm2
 23867  	movupd	xmm1, xmmword ptr [rdx + 4*rdi + 32]
 23868  	movupd	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 23869  	andpd	xmm1, xmm0
 23870  	andpd	xmm2, xmm0
 23871  	movupd	xmmword ptr [rcx + 4*rdi + 32], xmm1
 23872  	movupd	xmmword ptr [rcx + 4*rdi + 48], xmm2
 23873  	add	rdi, 16 # 16 lanes (two groups) consumed
 23874  	add	rax, 2
 23875  	jne	.LBB3_491
 23876  	jmp	.LBB3_839 # continues at .LBB3_839 (outside this view)
 23877  .LBB3_492:                              # Vector loop: per 64-bit lane, replace x with 0 - x when its sign bit is set
        	# Same instruction sequence as the loop at .LBB3_488; only the exit labels differ.
        	# Loads come from [rdx + 8*rdi] and stores go to [rcx + 8*rdi]; 4 lanes per group, two groups per iteration.
        	# r9d is set before this view; it is used below as the lane count -- TODO confirm.
 23878  	mov	esi, r9d
 23879  	and	esi, -4 # esi = count rounded down to a multiple of 4
 23880  	lea	rax, [rsi - 4]
 23881  	mov	r8, rax
 23882  	shr	r8, 2
 23883  	add	r8, 1 # r8 = number of 4-lane groups
 23884  	test	rax, rax
 23885  	je	.LBB3_848 # exactly one group: bypass the 2x-unrolled loop (target outside this view)
 23886  # %bb.493:
 23887  	mov	rax, r8
 23888  	and	rax, -2
 23889  	neg	rax # rax = -(r8 rounded down to even); stepped by +2 until it reaches zero
 23890  	xor	edi, edi # rdi = lane index
 23891  .LBB3_494:                              # =>This Inner Loop Header: Depth=1
 23892  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 23893  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 23894  	pxor	xmm3, xmm3
 23895  	psubq	xmm3, xmm1 # xmm3 = 0 - x
 23896  	movdqa	xmm0, xmm1 # xmm0 is blendvpd's implicit mask (top bit of each 64-bit lane)
 23897  	blendvpd	xmm1, xmm3, xmm0 # take 0 - x in lanes whose sign bit is set
 23898  	pxor	xmm3, xmm3
 23899  	psubq	xmm3, xmm2
 23900  	movdqa	xmm0, xmm2
 23901  	blendvpd	xmm2, xmm3, xmm0
 23902  	movupd	xmmword ptr [rcx + 8*rdi], xmm1
 23903  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm2
 23904  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 32]
 23905  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 48]
 23906  	pxor	xmm3, xmm3
 23907  	psubq	xmm3, xmm1
 23908  	movdqa	xmm0, xmm1
 23909  	blendvpd	xmm1, xmm3, xmm0
 23910  	pxor	xmm3, xmm3
 23911  	psubq	xmm3, xmm2
 23912  	movdqa	xmm0, xmm2
 23913  	blendvpd	xmm2, xmm3, xmm0
 23914  	movupd	xmmword ptr [rcx + 8*rdi + 32], xmm1
 23915  	movupd	xmmword ptr [rcx + 8*rdi + 48], xmm2
 23916  	add	rdi, 8 # 8 lanes (two groups) consumed
 23917  	add	rax, 2
 23918  	jne	.LBB3_494
 23919  	jmp	.LBB3_849 # continues at .LBB3_849 (outside this view)
 23920  .LBB3_495:                              # Vector loop: clear bit 31 of every 32-bit lane (AND with 0x7fffffff)
        	# Same instruction sequence as the loop at .LBB3_491; only the exit labels differ.
        	# Loads come from [rdx + 4*rdi] and stores go to [rcx + 4*rdi]; 8 lanes per group, two groups per iteration.
        	# r9d is set before this view; it is used below as the lane count -- TODO confirm.
 23921  	mov	esi, r9d
 23922  	and	esi, -8 # esi = count rounded down to a multiple of 8
 23923  	lea	rax, [rsi - 8]
 23924  	mov	r8, rax
 23925  	shr	r8, 3
 23926  	add	r8, 1 # r8 = number of 8-lane groups
 23927  	test	rax, rax
 23928  	je	.LBB3_856 # exactly one group: bypass the 2x-unrolled loop (target outside this view)
 23929  # %bb.496:
 23930  	mov	rax, r8
 23931  	and	rax, -2
 23932  	neg	rax # rax = -(r8 rounded down to even); stepped by +2 until it reaches zero
 23933  	xor	edi, edi # rdi = lane index
 23934  	movapd	xmm0, xmmword ptr [rip + .LCPI3_9] # xmm0 = [2147483647,2147483647,2147483647,2147483647]
 23935  .LBB3_497:                              # =>This Inner Loop Header: Depth=1
 23936  	movupd	xmm1, xmmword ptr [rdx + 4*rdi]
 23937  	movupd	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 23938  	andpd	xmm1, xmm0
 23939  	andpd	xmm2, xmm0
 23940  	movupd	xmmword ptr [rcx + 4*rdi], xmm1
 23941  	movupd	xmmword ptr [rcx + 4*rdi + 16], xmm2
 23942  	movupd	xmm1, xmmword ptr [rdx + 4*rdi + 32]
 23943  	movupd	xmm2, xmmword ptr [rdx + 4*rdi + 48]
 23944  	andpd	xmm1, xmm0
 23945  	andpd	xmm2, xmm0
 23946  	movupd	xmmword ptr [rcx + 4*rdi + 32], xmm1
 23947  	movupd	xmmword ptr [rcx + 4*rdi + 48], xmm2
 23948  	add	rdi, 16 # 16 lanes (two groups) consumed
 23949  	add	rax, 2
 23950  	jne	.LBB3_497
 23951  	jmp	.LBB3_857 # continues at .LBB3_857 (outside this view)
 23952  .LBB3_505:                              # Vector loop: per byte, store 0 - x (wrapping negate via psubb)
        	# Loads come from [rdx + rdi] and stores go to [rcx + rdi]; 32 bytes per group, two groups per iteration.
        	# r9d is set before this view; it is used below as the byte count -- TODO confirm.
 23953  	mov	esi, r9d
 23954  	and	esi, -32 # esi = count rounded down to a multiple of 32
 23955  	lea	rax, [rsi - 32]
 23956  	mov	r8, rax
 23957  	shr	r8, 5
 23958  	add	r8, 1 # r8 = number of 32-byte groups
 23959  	test	rax, rax
 23960  	je	.LBB3_866 # exactly one group: bypass the 2x-unrolled loop (target outside this view)
 23961  # %bb.506:
 23962  	mov	rax, r8
 23963  	and	rax, -2
 23964  	neg	rax # rax = -(r8 rounded down to even); stepped by +2 until it reaches zero
 23965  	xor	edi, edi # rdi = byte index
 23966  .LBB3_507:                              # =>This Inner Loop Header: Depth=1
 23967  	movdqu	xmm0, xmmword ptr [rdx + rdi]
 23968  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
 23969  	pxor	xmm2, xmm2
 23970  	psubb	xmm2, xmm0 # xmm2 = 0 - first 16 bytes
 23971  	pxor	xmm0, xmm0
 23972  	psubb	xmm0, xmm1 # xmm0 = 0 - second 16 bytes
 23973  	movdqu	xmmword ptr [rcx + rdi], xmm2
 23974  	movdqu	xmmword ptr [rcx + rdi + 16], xmm0
 23975  	movdqu	xmm0, xmmword ptr [rdx + rdi + 32]
 23976  	movdqu	xmm1, xmmword ptr [rdx + rdi + 48]
 23977  	pxor	xmm2, xmm2
 23978  	psubb	xmm2, xmm0
 23979  	pxor	xmm0, xmm0
 23980  	psubb	xmm0, xmm1
 23981  	movdqu	xmmword ptr [rcx + rdi + 32], xmm2
 23982  	movdqu	xmmword ptr [rcx + rdi + 48], xmm0
 23983  	add	rdi, 64 # 64 bytes (two groups) consumed
 23984  	add	rax, 2
 23985  	jne	.LBB3_507
 23986  	jmp	.LBB3_867 # continues at .LBB3_867 (outside this view)
 23987  .LBB3_508:                              # Vector loop: per byte, store 1 if the byte is nonzero, else 0
        	# Loads come from [rdx + rdi] and stores go to [rcx + rdi]; 32 bytes per group, two groups per iteration.
        	# r9d is set before this view; it is used below as the byte count -- TODO confirm.
 23988  	mov	esi, r9d
 23989  	and	esi, -32 # esi = count rounded down to a multiple of 32
 23990  	lea	rax, [rsi - 32]
 23991  	mov	r8, rax
 23992  	shr	r8, 5
 23993  	add	r8, 1 # r8 = number of 32-byte groups
 23994  	test	rax, rax
 23995  	je	.LBB3_874 # exactly one group: bypass the 2x-unrolled loop (target outside this view)
 23996  # %bb.509:
 23997  	mov	rax, r8
 23998  	and	rax, -2
 23999  	neg	rax # rax = -(r8 rounded down to even); stepped by +2 until it reaches zero
 24000  	xor	edi, edi # rdi = byte index
 24001  	pxor	xmm0, xmm0 # xmm0 = all-zero comparison operand
 24002  	movdqa	xmm1, xmmword ptr [rip + .LCPI3_6] # xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 24003  .LBB3_510:                              # =>This Inner Loop Header: Depth=1
 24004  	movdqu	xmm2, xmmword ptr [rdx + rdi]
 24005  	movdqu	xmm3, xmmword ptr [rdx + rdi + 16]
 24006  	pcmpeqb	xmm2, xmm0 # 0xFF where the byte is zero
 24007  	pandn	xmm2, xmm1 # (~mask) & 1 -> 1 where the byte was nonzero
 24008  	pcmpeqb	xmm3, xmm0
 24009  	pandn	xmm3, xmm1
 24010  	movdqu	xmmword ptr [rcx + rdi], xmm2
 24011  	movdqu	xmmword ptr [rcx + rdi + 16], xmm3
 24012  	movdqu	xmm2, xmmword ptr [rdx + rdi + 32]
 24013  	movdqu	xmm3, xmmword ptr [rdx + rdi + 48]
 24014  	pcmpeqb	xmm2, xmm0
 24015  	pandn	xmm2, xmm1
 24016  	pcmpeqb	xmm3, xmm0
 24017  	pandn	xmm3, xmm1
 24018  	movdqu	xmmword ptr [rcx + rdi + 32], xmm2
 24019  	movdqu	xmmword ptr [rcx + rdi + 48], xmm3
 24020  	add	rdi, 64 # 64 bytes (two groups) consumed
 24021  	add	rax, 2
 24022  	jne	.LBB3_510
 24023  	jmp	.LBB3_875 # continues at .LBB3_875 (outside this view)
 24024  .LBB3_511:                              # Setup for the byte copy at .LBB3_590 / .LBB3_592: derive chunk counts from r9d
        	# r9d is set before this view; it is used below as the byte count -- TODO confirm.
 24025  	mov	esi, r9d
 24026  	and	esi, -32 # esi = count rounded down to a multiple of 32
 24027  	lea	rax, [rsi - 32]
 24028  	mov	rdi, rax
 24029  	shr	rdi, 5
 24030  	add	rdi, 1 # rdi = number of 32-byte chunks
 24031  	mov	r8d, edi
 24032  	and	r8d, 3 # r8 = chunks left after groups of four
 24033  	cmp	rax, 96
 24034  	jae	.LBB3_590 # four or more chunks: run the 4x-unrolled copy loop
 24035  # %bb.512:
 24036  	xor	eax, eax # fewer than four chunks: start the remainder loop at offset 0
 24037  	jmp	.LBB3_592
 24038  .LBB3_513:                              # Setup for the byte copy at .LBB3_600 / .LBB3_602: derive chunk counts from r9d
        	# r9d is set before this view; it is used below as the byte count -- TODO confirm.
 24039  	mov	esi, r9d
 24040  	and	esi, -32 # esi = count rounded down to a multiple of 32
 24041  	lea	rax, [rsi - 32]
 24042  	mov	rdi, rax
 24043  	shr	rdi, 5
 24044  	add	rdi, 1 # rdi = number of 32-byte chunks
 24045  	mov	r8d, edi
 24046  	and	r8d, 3 # r8 = chunks left after groups of four
 24047  	cmp	rax, 96
 24048  	jae	.LBB3_600 # four or more chunks: run the 4x-unrolled copy loop
 24049  # %bb.514:
 24050  	xor	eax, eax # fewer than four chunks: start the remainder loop at offset 0
 24051  	jmp	.LBB3_602
 24052  .LBB3_515:                              # Vector loop: per 32-bit lane, store 0 - x (wrapping negate via psubd)
        	# Loads come from [rdx + 4*rdi] and stores go to [rcx + 4*rdi]; 8 lanes per group, two groups per iteration.
        	# r9d is set before this view; it is used below as the lane count -- TODO confirm.
 24053  	mov	esi, r9d
 24054  	and	esi, -8 # esi = count rounded down to a multiple of 8
 24055  	lea	rax, [rsi - 8]
 24056  	mov	r8, rax
 24057  	shr	r8, 3
 24058  	add	r8, 1 # r8 = number of 8-lane groups
 24059  	test	rax, rax
 24060  	je	.LBB3_882 # exactly one group: bypass the 2x-unrolled loop (target outside this view)
 24061  # %bb.516:
 24062  	mov	rax, r8
 24063  	and	rax, -2
 24064  	neg	rax # rax = -(r8 rounded down to even); stepped by +2 until it reaches zero
 24065  	xor	edi, edi # rdi = lane index
 24066  .LBB3_517:                              # =>This Inner Loop Header: Depth=1
 24067  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
 24068  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 24069  	pxor	xmm2, xmm2
 24070  	psubd	xmm2, xmm0 # xmm2 = 0 - first four lanes
 24071  	pxor	xmm0, xmm0
 24072  	psubd	xmm0, xmm1 # xmm0 = 0 - next four lanes
 24073  	movdqu	xmmword ptr [rcx + 4*rdi], xmm2
 24074  	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm0
 24075  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]
 24076  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
 24077  	pxor	xmm2, xmm2
 24078  	psubd	xmm2, xmm0
 24079  	pxor	xmm0, xmm0
 24080  	psubd	xmm0, xmm1
 24081  	movdqu	xmmword ptr [rcx + 4*rdi + 32], xmm2
 24082  	movdqu	xmmword ptr [rcx + 4*rdi + 48], xmm0
 24083  	add	rdi, 16 # 16 lanes (two groups) consumed
 24084  	add	rax, 2
 24085  	jne	.LBB3_517
 24086  	jmp	.LBB3_883 # continues at .LBB3_883 (outside this view)
 24087  .LBB3_518:                              # Vector loop: per 32-bit lane, store 0 - x (wrapping negate via psubd)
        	# Same instruction sequence as the loop at .LBB3_517; only the exit labels differ.
        	# Loads come from [rdx + 4*rdi] and stores go to [rcx + 4*rdi]; 8 lanes per group, two groups per iteration.
        	# r9d is set before this view; it is used below as the lane count -- TODO confirm.
 24088  	mov	esi, r9d
 24089  	and	esi, -8 # esi = count rounded down to a multiple of 8
 24090  	lea	rax, [rsi - 8]
 24091  	mov	r8, rax
 24092  	shr	r8, 3
 24093  	add	r8, 1 # r8 = number of 8-lane groups
 24094  	test	rax, rax
 24095  	je	.LBB3_890 # exactly one group: bypass the 2x-unrolled loop (target outside this view)
 24096  # %bb.519:
 24097  	mov	rax, r8
 24098  	and	rax, -2
 24099  	neg	rax # rax = -(r8 rounded down to even); stepped by +2 until it reaches zero
 24100  	xor	edi, edi # rdi = lane index
 24101  .LBB3_520:                              # =>This Inner Loop Header: Depth=1
 24102  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
 24103  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 24104  	pxor	xmm2, xmm2
 24105  	psubd	xmm2, xmm0 # xmm2 = 0 - first four lanes
 24106  	pxor	xmm0, xmm0
 24107  	psubd	xmm0, xmm1 # xmm0 = 0 - next four lanes
 24108  	movdqu	xmmword ptr [rcx + 4*rdi], xmm2
 24109  	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm0
 24110  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]
 24111  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
 24112  	pxor	xmm2, xmm2
 24113  	psubd	xmm2, xmm0
 24114  	pxor	xmm0, xmm0
 24115  	psubd	xmm0, xmm1
 24116  	movdqu	xmmword ptr [rcx + 4*rdi + 32], xmm2
 24117  	movdqu	xmmword ptr [rcx + 4*rdi + 48], xmm0
 24118  	add	rdi, 16 # 16 lanes (two groups) consumed
 24119  	add	rax, 2
 24120  	jne	.LBB3_520
 24121  	jmp	.LBB3_891 # continues at .LBB3_891 (outside this view)
 24122  .LBB3_521:                              # Vector loop: per signed 32-bit lane, store 1 if x > 0, -1 if x < 0, 0 if x == 0
        	# Loads come from [rdx + 4*rdi] and stores go to [rcx + 4*rdi]; 8 lanes per group, two groups per iteration.
        	# r9d is set before this view; it is used below as the lane count -- TODO confirm.
 24123  	mov	esi, r9d
 24124  	and	esi, -8 # esi = count rounded down to a multiple of 8
 24125  	lea	rax, [rsi - 8]
 24126  	mov	r8, rax
 24127  	shr	r8, 3
 24128  	add	r8, 1 # r8 = number of 8-lane groups
 24129  	test	rax, rax
 24130  	je	.LBB3_898 # exactly one group: bypass the 2x-unrolled loop (target outside this view)
 24131  # %bb.522:
 24132  	mov	rax, r8
 24133  	and	rax, -2
 24134  	neg	rax # rax = -(r8 rounded down to even); stepped by +2 until it reaches zero
 24135  	xor	edi, edi # rdi = lane index
 24136  	pxor	xmm2, xmm2 # xmm2 = 0 in every lane
 24137  	pcmpeqd	xmm3, xmm3 # xmm3 = all-ones in every lane
 24138  	movdqa	xmm4, xmmword ptr [rip + .LCPI3_3] # xmm4 = [1,1,1,1]
 24139  .LBB3_523:                              # =>This Inner Loop Header: Depth=1
 24140  	movdqu	xmm5, xmmword ptr [rdx + 4*rdi]
 24141  	movdqu	xmm6, xmmword ptr [rdx + 4*rdi + 16]
 24142  	movdqa	xmm0, xmm4
 24143  	pcmpgtd	xmm0, xmm5 # mask: 1 > x, i.e. x <= 0 (signed)
 24144  	pcmpeqd	xmm5, xmm2 # all-ones where x == 0
 24145  	pxor	xmm5, xmm3 # inverted: -1 where x != 0, 0 where x == 0
 24146  	movdqa	xmm1, xmm4
 24147  	pcmpgtd	xmm1, xmm6
 24148  	pcmpeqd	xmm6, xmm2
 24149  	pxor	xmm6, xmm3
 24150  	movdqa	xmm7, xmm4 # default result 1
 24151  	blendvps	xmm7, xmm5, xmm0 # where x <= 0 take xmm5 (-1 or 0); xmm0 is the implicit mask
 24152  	movdqa	xmm5, xmm4
 24153  	movdqa	xmm0, xmm1
 24154  	blendvps	xmm5, xmm6, xmm0
 24155  	movups	xmmword ptr [rcx + 4*rdi], xmm7
 24156  	movups	xmmword ptr [rcx + 4*rdi + 16], xmm5
 24157  	movdqu	xmm5, xmmword ptr [rdx + 4*rdi + 32]
 24158  	movdqu	xmm6, xmmword ptr [rdx + 4*rdi + 48]
 24159  	movdqa	xmm0, xmm4
 24160  	pcmpgtd	xmm0, xmm5
 24161  	pcmpeqd	xmm5, xmm2
 24162  	pxor	xmm5, xmm3
 24163  	movdqa	xmm1, xmm4
 24164  	pcmpgtd	xmm1, xmm6
 24165  	pcmpeqd	xmm6, xmm2
 24166  	pxor	xmm6, xmm3
 24167  	movdqa	xmm7, xmm4
 24168  	blendvps	xmm7, xmm5, xmm0
 24169  	movdqa	xmm5, xmm4
 24170  	movdqa	xmm0, xmm1
 24171  	blendvps	xmm5, xmm6, xmm0
 24172  	movups	xmmword ptr [rcx + 4*rdi + 32], xmm7
 24173  	movups	xmmword ptr [rcx + 4*rdi + 48], xmm5
 24174  	add	rdi, 16 # 16 lanes (two groups) consumed
 24175  	add	rax, 2
 24176  	jne	.LBB3_523
 24177  	jmp	.LBB3_899 # continues at .LBB3_899 (outside this view)
 24178  .LBB3_524:                              # Vector loop: per 32-bit lane, store the absolute value (pabsd)
        	# Loads come from [rdx + 4*rdi] and stores go to [rcx + 4*rdi]; 8 lanes per group, two groups per iteration.
        	# r9d is set before this view; it is used below as the lane count -- TODO confirm.
 24179  	mov	esi, r9d
 24180  	and	esi, -8 # esi = count rounded down to a multiple of 8
 24181  	lea	rax, [rsi - 8]
 24182  	mov	r8, rax
 24183  	shr	r8, 3
 24184  	add	r8, 1 # r8 = number of 8-lane groups
 24185  	test	rax, rax
 24186  	je	.LBB3_907 # exactly one group: bypass the 2x-unrolled loop (target outside this view)
 24187  # %bb.525:
 24188  	mov	rax, r8
 24189  	and	rax, -2
 24190  	neg	rax # rax = -(r8 rounded down to even); stepped by +2 until it reaches zero
 24191  	xor	edi, edi # rdi = lane index
 24192  .LBB3_526:                              # =>This Inner Loop Header: Depth=1
 24193  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
 24194  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 24195  	pabsd	xmm0, xmm0
 24196  	pabsd	xmm1, xmm1
 24197  	movdqu	xmmword ptr [rcx + 4*rdi], xmm0
 24198  	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm1
 24199  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]
 24200  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
 24201  	pabsd	xmm0, xmm0
 24202  	pabsd	xmm1, xmm1
 24203  	movdqu	xmmword ptr [rcx + 4*rdi + 32], xmm0
 24204  	movdqu	xmmword ptr [rcx + 4*rdi + 48], xmm1
 24205  	add	rdi, 16 # 16 lanes (two groups) consumed
 24206  	add	rax, 2
 24207  	jne	.LBB3_526
 24208  	jmp	.LBB3_908 # continues at .LBB3_908 (outside this view)
 24209  .LBB3_527:                              # Vector loop: per 32-bit lane, store the absolute value (pabsd)
        	# Same instruction sequence as the loop at .LBB3_526; only the exit labels differ.
        	# Loads come from [rdx + 4*rdi] and stores go to [rcx + 4*rdi]; 8 lanes per group, two groups per iteration.
        	# r9d is set before this view; it is used below as the lane count -- TODO confirm.
 24210  	mov	esi, r9d
 24211  	and	esi, -8 # esi = count rounded down to a multiple of 8
 24212  	lea	rax, [rsi - 8]
 24213  	mov	r8, rax
 24214  	shr	r8, 3
 24215  	add	r8, 1 # r8 = number of 8-lane groups
 24216  	test	rax, rax
 24217  	je	.LBB3_915 # exactly one group: bypass the 2x-unrolled loop (target outside this view)
 24218  # %bb.528:
 24219  	mov	rax, r8
 24220  	and	rax, -2
 24221  	neg	rax # rax = -(r8 rounded down to even); stepped by +2 until it reaches zero
 24222  	xor	edi, edi # rdi = lane index
 24223  .LBB3_529:                              # =>This Inner Loop Header: Depth=1
 24224  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
 24225  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 24226  	pabsd	xmm0, xmm0
 24227  	pabsd	xmm1, xmm1
 24228  	movdqu	xmmword ptr [rcx + 4*rdi], xmm0
 24229  	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm1
 24230  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi + 32]
 24231  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 48]
 24232  	pabsd	xmm0, xmm0
 24233  	pabsd	xmm1, xmm1
 24234  	movdqu	xmmword ptr [rcx + 4*rdi + 32], xmm0
 24235  	movdqu	xmmword ptr [rcx + 4*rdi + 48], xmm1
 24236  	add	rdi, 16 # 16 lanes (two groups) consumed
 24237  	add	rax, 2
 24238  	jne	.LBB3_529
 24239  	jmp	.LBB3_916 # continues at .LBB3_916 (outside this view)
 24240  .LBB3_367:                              # Zero fill of the buffer at rcx, 4-byte elements: 256-byte loop, 32-byte loop, scalar tail
        	# rdi, rsi, rdx and r9 are set before this view. From their use below: rdi counts 32-byte chunks
        	# for the main loop, rsi counts leftover 32-byte chunks, rdx is the first element index of the
        	# scalar tail and r9 is the end index -- TODO confirm against the setup code.
 24241  	and	rdi, -8
 24242  	neg	rdi # negative counter; stepped by +8 until it reaches zero
 24243  	xor	eax, eax # rax = element index
 24244  	xorpd	xmm0, xmm0 # xmm0 = 16 zero bytes
 24245  .LBB3_368:                              # =>This Inner Loop Header: Depth=1
 24246  	movupd	xmmword ptr [rcx + 4*rax], xmm0
 24247  	movupd	xmmword ptr [rcx + 4*rax + 16], xmm0
 24248  	movupd	xmmword ptr [rcx + 4*rax + 32], xmm0
 24249  	movupd	xmmword ptr [rcx + 4*rax + 48], xmm0
 24250  	movupd	xmmword ptr [rcx + 4*rax + 64], xmm0
 24251  	movupd	xmmword ptr [rcx + 4*rax + 80], xmm0
 24252  	movupd	xmmword ptr [rcx + 4*rax + 96], xmm0
 24253  	movupd	xmmword ptr [rcx + 4*rax + 112], xmm0
 24254  	movupd	xmmword ptr [rcx + 4*rax + 128], xmm0
 24255  	movupd	xmmword ptr [rcx + 4*rax + 144], xmm0
 24256  	movupd	xmmword ptr [rcx + 4*rax + 160], xmm0
 24257  	movupd	xmmword ptr [rcx + 4*rax + 176], xmm0
 24258  	movupd	xmmword ptr [rcx + 4*rax + 192], xmm0
 24259  	movupd	xmmword ptr [rcx + 4*rax + 208], xmm0
 24260  	movupd	xmmword ptr [rcx + 4*rax + 224], xmm0
 24261  	movupd	xmmword ptr [rcx + 4*rax + 240], xmm0
 24262  	add	rax, 64 # 64 elements * 4 bytes = the 256 bytes stored above
 24263  	add	rdi, 8
 24264  	jne	.LBB3_368
 24265  .LBB3_369:
 24266  	test	rsi, rsi
 24267  	je	.LBB3_372 # no leftover 32-byte chunks
 24268  # %bb.370:
 24269  	lea	rax, [rcx + 4*rax]
 24270  	add	rax, 16 # rax = byte address biased by 16 for the [rax - 16] / [rax] store pair
 24271  	neg	rsi # negative counter; stepped by +1 until it reaches zero
 24272  	xorpd	xmm0, xmm0
 24273  .LBB3_371:                              # =>This Inner Loop Header: Depth=1
 24274  	movupd	xmmword ptr [rax - 16], xmm0
 24275  	movupd	xmmword ptr [rax], xmm0
 24276  	add	rax, 32
 24277  	inc	rsi
 24278  	jne	.LBB3_371
 24279  .LBB3_372:
 24280  	cmp	rdx, r9
 24281  	je	.LBB3_923 # no scalar tail: go to the shared epilogue
 24282  	.p2align	4, 0x90
 24283  .LBB3_373:                              # =>This Inner Loop Header: Depth=1
 24284  	mov	dword ptr [rcx + 4*rdx], 0 # one element per iteration
 24285  	add	rdx, 1
 24286  	cmp	r9, rdx
 24287  	jne	.LBB3_373
 24288  	jmp	.LBB3_923
 24289  .LBB3_414:                              # Zero fill of the buffer at rcx, 8-byte elements: 256-byte loop, 32-byte loop, scalar tail
        	# rdi, rsi, rdx and r9 are set before this view. From their use below: rdi counts 32-byte chunks
        	# for the main loop, rsi counts leftover 32-byte chunks, rdx is the first element index of the
        	# scalar tail and r9 is the end index -- TODO confirm against the setup code.
 24290  	and	rdi, -8
 24291  	neg	rdi # negative counter; stepped by +8 until it reaches zero
 24292  	xor	eax, eax # rax = element index
 24293  	xorpd	xmm0, xmm0 # xmm0 = 16 zero bytes
 24294  .LBB3_415:                              # =>This Inner Loop Header: Depth=1
 24295  	movupd	xmmword ptr [rcx + 8*rax], xmm0
 24296  	movupd	xmmword ptr [rcx + 8*rax + 16], xmm0
 24297  	movupd	xmmword ptr [rcx + 8*rax + 32], xmm0
 24298  	movupd	xmmword ptr [rcx + 8*rax + 48], xmm0
 24299  	movupd	xmmword ptr [rcx + 8*rax + 64], xmm0
 24300  	movupd	xmmword ptr [rcx + 8*rax + 80], xmm0
 24301  	movupd	xmmword ptr [rcx + 8*rax + 96], xmm0
 24302  	movupd	xmmword ptr [rcx + 8*rax + 112], xmm0
 24303  	movupd	xmmword ptr [rcx + 8*rax + 128], xmm0
 24304  	movupd	xmmword ptr [rcx + 8*rax + 144], xmm0
 24305  	movupd	xmmword ptr [rcx + 8*rax + 160], xmm0
 24306  	movupd	xmmword ptr [rcx + 8*rax + 176], xmm0
 24307  	movupd	xmmword ptr [rcx + 8*rax + 192], xmm0
 24308  	movupd	xmmword ptr [rcx + 8*rax + 208], xmm0
 24309  	movupd	xmmword ptr [rcx + 8*rax + 224], xmm0
 24310  	movupd	xmmword ptr [rcx + 8*rax + 240], xmm0
 24311  	add	rax, 32 # 32 elements * 8 bytes = the 256 bytes stored above
 24312  	add	rdi, 8
 24313  	jne	.LBB3_415
 24314  .LBB3_416:
 24315  	test	rsi, rsi
 24316  	je	.LBB3_419 # no leftover 32-byte chunks
 24317  # %bb.417:
 24318  	lea	rax, [rcx + 8*rax]
 24319  	add	rax, 16 # rax = byte address biased by 16 for the [rax - 16] / [rax] store pair
 24320  	neg	rsi # negative counter; stepped by +1 until it reaches zero
 24321  	xorpd	xmm0, xmm0
 24322  .LBB3_418:                              # =>This Inner Loop Header: Depth=1
 24323  	movupd	xmmword ptr [rax - 16], xmm0
 24324  	movupd	xmmword ptr [rax], xmm0
 24325  	add	rax, 32
 24326  	inc	rsi
 24327  	jne	.LBB3_418
 24328  .LBB3_419:
 24329  	cmp	rdx, r9
 24330  	je	.LBB3_923 # no scalar tail: go to the shared epilogue
 24331  	.p2align	4, 0x90
 24332  .LBB3_420:                              # =>This Inner Loop Header: Depth=1
 24333  	mov	qword ptr [rcx + 8*rdx], 0 # one element per iteration
 24334  	add	rdx, 1
 24335  	cmp	r9, rdx
 24336  	jne	.LBB3_420
 24337  	jmp	.LBB3_923
 24338  .LBB3_431:                              # Zero fill of the buffer at rcx, 2-byte elements: 256-byte loop, 32-byte loop, scalar tail
        	# rdi, rsi, rdx and r9 are set before this view. From their use below: rdi counts 32-byte chunks
        	# for the main loop, rsi counts leftover 32-byte chunks, rdx is the first element index of the
        	# scalar tail and r9 is the end index -- TODO confirm against the setup code.
 24339  	and	rdi, -8
 24340  	neg	rdi # negative counter; stepped by +8 until it reaches zero
 24341  	xor	eax, eax # rax = element index
 24342  	xorpd	xmm0, xmm0 # xmm0 = 16 zero bytes
 24343  .LBB3_432:                              # =>This Inner Loop Header: Depth=1
 24344  	movupd	xmmword ptr [rcx + 2*rax], xmm0
 24345  	movupd	xmmword ptr [rcx + 2*rax + 16], xmm0
 24346  	movupd	xmmword ptr [rcx + 2*rax + 32], xmm0
 24347  	movupd	xmmword ptr [rcx + 2*rax + 48], xmm0
 24348  	movupd	xmmword ptr [rcx + 2*rax + 64], xmm0
 24349  	movupd	xmmword ptr [rcx + 2*rax + 80], xmm0
 24350  	movupd	xmmword ptr [rcx + 2*rax + 96], xmm0
 24351  	movupd	xmmword ptr [rcx + 2*rax + 112], xmm0
 24352  	movupd	xmmword ptr [rcx + 2*rax + 128], xmm0
 24353  	movupd	xmmword ptr [rcx + 2*rax + 144], xmm0
 24354  	movupd	xmmword ptr [rcx + 2*rax + 160], xmm0
 24355  	movupd	xmmword ptr [rcx + 2*rax + 176], xmm0
 24356  	movupd	xmmword ptr [rcx + 2*rax + 192], xmm0
 24357  	movupd	xmmword ptr [rcx + 2*rax + 208], xmm0
 24358  	movupd	xmmword ptr [rcx + 2*rax + 224], xmm0
 24359  	movupd	xmmword ptr [rcx + 2*rax + 240], xmm0
 24360  	sub	rax, -128 # rax += 128 elements (* 2 bytes = the 256 bytes stored above)
 24361  	add	rdi, 8
 24362  	jne	.LBB3_432
 24363  .LBB3_433:
 24364  	test	rsi, rsi
 24365  	je	.LBB3_436 # no leftover 32-byte chunks
 24366  # %bb.434:
 24367  	lea	rax, [rcx + 2*rax]
 24368  	add	rax, 16 # rax = byte address biased by 16 for the [rax - 16] / [rax] store pair
 24369  	neg	rsi # negative counter; stepped by +1 until it reaches zero
 24370  	xorpd	xmm0, xmm0
 24371  .LBB3_435:                              # =>This Inner Loop Header: Depth=1
 24372  	movupd	xmmword ptr [rax - 16], xmm0
 24373  	movupd	xmmword ptr [rax], xmm0
 24374  	add	rax, 32
 24375  	inc	rsi
 24376  	jne	.LBB3_435
 24377  .LBB3_436:
 24378  	cmp	rdx, r9
 24379  	je	.LBB3_923 # no scalar tail: go to the shared epilogue
 24380  	.p2align	4, 0x90
 24381  .LBB3_437:                              # =>This Inner Loop Header: Depth=1
 24382  	mov	word ptr [rcx + 2*rdx], 0 # one element per iteration
 24383  	add	rdx, 1
 24384  	cmp	r9, rdx
 24385  	jne	.LBB3_437
 24386  	jmp	.LBB3_923
 24387  .LBB3_498:                              # Zero fill of the buffer at rcx, 1-byte elements: 256-byte loop, 32-byte loop, scalar tail
        	# rdi, rsi, rdx and r9 are set before this view. From their use below: rdi counts 32-byte chunks
        	# for the main loop, rsi counts leftover 32-byte chunks, rdx is the first byte index of the
        	# scalar tail and r9 is the end index -- TODO confirm against the setup code.
 24388  	and	rdi, -8
 24389  	neg	rdi # negative counter; stepped by +8 until it reaches zero
 24390  	xor	eax, eax # rax = byte index
 24391  	xorpd	xmm0, xmm0 # xmm0 = 16 zero bytes
 24392  .LBB3_499:                              # =>This Inner Loop Header: Depth=1
 24393  	movupd	xmmword ptr [rcx + rax], xmm0
 24394  	movupd	xmmword ptr [rcx + rax + 16], xmm0
 24395  	movupd	xmmword ptr [rcx + rax + 32], xmm0
 24396  	movupd	xmmword ptr [rcx + rax + 48], xmm0
 24397  	movupd	xmmword ptr [rcx + rax + 64], xmm0
 24398  	movupd	xmmword ptr [rcx + rax + 80], xmm0
 24399  	movupd	xmmword ptr [rcx + rax + 96], xmm0
 24400  	movupd	xmmword ptr [rcx + rax + 112], xmm0
 24401  	movupd	xmmword ptr [rcx + rax + 128], xmm0
 24402  	movupd	xmmword ptr [rcx + rax + 144], xmm0
 24403  	movupd	xmmword ptr [rcx + rax + 160], xmm0
 24404  	movupd	xmmword ptr [rcx + rax + 176], xmm0
 24405  	movupd	xmmword ptr [rcx + rax + 192], xmm0
 24406  	movupd	xmmword ptr [rcx + rax + 208], xmm0
 24407  	movupd	xmmword ptr [rcx + rax + 224], xmm0
 24408  	movupd	xmmword ptr [rcx + rax + 240], xmm0
 24409  	add	rax, 256 # the 256 bytes stored above
 24410  	add	rdi, 8
 24411  	jne	.LBB3_499
 24412  .LBB3_500:
 24413  	test	rsi, rsi
 24414  	je	.LBB3_503 # no leftover 32-byte chunks
 24415  # %bb.501:
 24416  	add	rax, rcx
 24417  	add	rax, 16 # rax = byte address biased by 16 for the [rax - 16] / [rax] store pair
 24418  	neg	rsi # negative counter; stepped by +1 until it reaches zero
 24419  	xorpd	xmm0, xmm0
 24420  .LBB3_502:                              # =>This Inner Loop Header: Depth=1
 24421  	movupd	xmmword ptr [rax - 16], xmm0
 24422  	movupd	xmmword ptr [rax], xmm0
 24423  	add	rax, 32
 24424  	inc	rsi
 24425  	jne	.LBB3_502
 24426  .LBB3_503:
 24427  	cmp	rdx, r9
 24428  	je	.LBB3_923 # no scalar tail: go to the shared epilogue
 24429  	.p2align	4, 0x90
 24430  .LBB3_504:                              # =>This Inner Loop Header: Depth=1
 24431  	mov	byte ptr [rcx + rdx], 0 # one byte per iteration
 24432  	add	rdx, 1
 24433  	cmp	r9, rdx
 24434  	jne	.LBB3_504 # on exit, execution falls through into .LBB3_923
 24435  .LBB3_923:                              # Shared exit: many paths in this function jump here once their work is done
 24436  	mov	rsp, rbp # restore the stack pointer from the frame pointer
 24437  	pop	rbp # restore the saved rbp (pushed by a prologue outside this view -- presumably push rbp / mov rbp, rsp)
 24438  	ret
 24439  .LBB3_530:                              # Unmodified copy from [rdx] to [rcx], 4-byte elements: 128-byte loop, then 32-byte loop
        	# rdi, r8, rsi and r9 are set before this view. From their use below: rdi counts 32-byte chunks
        	# for the main loop, r8 counts leftover 32-byte chunks, rsi is the number of elements covered by
        	# the vector loops and r9 is the total element count -- TODO confirm against the setup code.
 24440  	and	rdi, -4
 24441  	neg	rdi # negative counter; stepped by +4 until it reaches zero
 24442  	xor	eax, eax # rax = element index
 24443  .LBB3_531:                              # =>This Inner Loop Header: Depth=1
 24444  	movups	xmm0, xmmword ptr [rdx + 4*rax]
 24445  	movups	xmm1, xmmword ptr [rdx + 4*rax + 16]
 24446  	movups	xmmword ptr [rcx + 4*rax], xmm0
 24447  	movups	xmmword ptr [rcx + 4*rax + 16], xmm1
 24448  	movups	xmm0, xmmword ptr [rdx + 4*rax + 32]
 24449  	movups	xmm1, xmmword ptr [rdx + 4*rax + 48]
 24450  	movups	xmmword ptr [rcx + 4*rax + 32], xmm0
 24451  	movups	xmmword ptr [rcx + 4*rax + 48], xmm1
 24452  	movups	xmm0, xmmword ptr [rdx + 4*rax + 64]
 24453  	movups	xmm1, xmmword ptr [rdx + 4*rax + 80]
 24454  	movups	xmmword ptr [rcx + 4*rax + 64], xmm0
 24455  	movups	xmmword ptr [rcx + 4*rax + 80], xmm1
 24456  	movupd	xmm0, xmmword ptr [rdx + 4*rax + 96]
 24457  	movupd	xmm1, xmmword ptr [rdx + 4*rax + 112]
 24458  	movupd	xmmword ptr [rcx + 4*rax + 96], xmm0
 24459  	movupd	xmmword ptr [rcx + 4*rax + 112], xmm1
 24460  	add	rax, 32 # 32 elements * 4 bytes = the 128 bytes copied above
 24461  	add	rdi, 4
 24462  	jne	.LBB3_531
 24463  .LBB3_532:
 24464  	test	r8, r8
 24465  	je	.LBB3_535 # no leftover 32-byte chunks
 24466  # %bb.533:
 24467  	lea	rax, [4*rax + 16] # element index -> byte offset, biased by 16 for the -16 / +0 pair
 24468  	neg	r8 # negative counter; stepped by +1 until it reaches zero
 24469  .LBB3_534:                              # =>This Inner Loop Header: Depth=1
 24470  	movupd	xmm0, xmmword ptr [rdx + rax - 16]
 24471  	movupd	xmm1, xmmword ptr [rdx + rax]
 24472  	movupd	xmmword ptr [rcx + rax - 16], xmm0
 24473  	movupd	xmmword ptr [rcx + rax], xmm1
 24474  	add	rax, 32
 24475  	inc	r8
 24476  	jne	.LBB3_534
 24477  .LBB3_535:
 24478  	cmp	rsi, r9
 24479  	je	.LBB3_923 # everything copied: go to the shared epilogue
 24480  	jmp	.LBB3_536 # leftover elements continue at .LBB3_536 (outside this view)
 24481  .LBB3_540:                              # Unmodified copy from [rdx] to [rcx], 4-byte elements: 128-byte loop, then 32-byte loop
        	# Same instruction sequence as the copy at .LBB3_530; only the labels differ.
        	# rdi, r8, rsi and r9 are set before this view. From their use below: rdi counts 32-byte chunks
        	# for the main loop, r8 counts leftover 32-byte chunks, rsi is the number of elements covered by
        	# the vector loops and r9 is the total element count -- TODO confirm against the setup code.
 24482  	and	rdi, -4
 24483  	neg	rdi # negative counter; stepped by +4 until it reaches zero
 24484  	xor	eax, eax # rax = element index
 24485  .LBB3_541:                              # =>This Inner Loop Header: Depth=1
 24486  	movups	xmm0, xmmword ptr [rdx + 4*rax]
 24487  	movups	xmm1, xmmword ptr [rdx + 4*rax + 16]
 24488  	movups	xmmword ptr [rcx + 4*rax], xmm0
 24489  	movups	xmmword ptr [rcx + 4*rax + 16], xmm1
 24490  	movups	xmm0, xmmword ptr [rdx + 4*rax + 32]
 24491  	movups	xmm1, xmmword ptr [rdx + 4*rax + 48]
 24492  	movups	xmmword ptr [rcx + 4*rax + 32], xmm0
 24493  	movups	xmmword ptr [rcx + 4*rax + 48], xmm1
 24494  	movups	xmm0, xmmword ptr [rdx + 4*rax + 64]
 24495  	movups	xmm1, xmmword ptr [rdx + 4*rax + 80]
 24496  	movups	xmmword ptr [rcx + 4*rax + 64], xmm0
 24497  	movups	xmmword ptr [rcx + 4*rax + 80], xmm1
 24498  	movupd	xmm0, xmmword ptr [rdx + 4*rax + 96]
 24499  	movupd	xmm1, xmmword ptr [rdx + 4*rax + 112]
 24500  	movupd	xmmword ptr [rcx + 4*rax + 96], xmm0
 24501  	movupd	xmmword ptr [rcx + 4*rax + 112], xmm1
 24502  	add	rax, 32 # 32 elements * 4 bytes = the 128 bytes copied above
 24503  	add	rdi, 4
 24504  	jne	.LBB3_541
 24505  .LBB3_542:
 24506  	test	r8, r8
 24507  	je	.LBB3_545 # no leftover 32-byte chunks
 24508  # %bb.543:
 24509  	lea	rax, [4*rax + 16] # element index -> byte offset, biased by 16 for the -16 / +0 pair
 24510  	neg	r8 # negative counter; stepped by +1 until it reaches zero
 24511  .LBB3_544:                              # =>This Inner Loop Header: Depth=1
 24512  	movupd	xmm0, xmmword ptr [rdx + rax - 16]
 24513  	movupd	xmm1, xmmword ptr [rdx + rax]
 24514  	movupd	xmmword ptr [rcx + rax - 16], xmm0
 24515  	movupd	xmmword ptr [rcx + rax], xmm1
 24516  	add	rax, 32
 24517  	inc	r8
 24518  	jne	.LBB3_544
 24519  .LBB3_545:
 24520  	cmp	rsi, r9
 24521  	je	.LBB3_923 # everything copied: go to the shared epilogue
 24522  	jmp	.LBB3_546 # leftover elements continue at .LBB3_546 (outside this view)
 24523  .LBB3_550:                              # Unmodified copy from [rdx] to [rcx], 8-byte elements: 128-byte loop, then 32-byte loop
        	# rdi, r8, rsi and r9 are set before this view. From their use below: rdi counts 32-byte chunks
        	# for the main loop, r8 counts leftover 32-byte chunks, rsi is the number of elements covered by
        	# the vector loops and r9 is the total element count -- TODO confirm against the setup code.
 24524  	and	rdi, -4
 24525  	neg	rdi # negative counter; stepped by +4 until it reaches zero
 24526  	xor	eax, eax # rax = element index
 24527  .LBB3_551:                              # =>This Inner Loop Header: Depth=1
 24528  	movups	xmm0, xmmword ptr [rdx + 8*rax]
 24529  	movups	xmm1, xmmword ptr [rdx + 8*rax + 16]
 24530  	movups	xmmword ptr [rcx + 8*rax], xmm0
 24531  	movups	xmmword ptr [rcx + 8*rax + 16], xmm1
 24532  	movups	xmm0, xmmword ptr [rdx + 8*rax + 32]
 24533  	movups	xmm1, xmmword ptr [rdx + 8*rax + 48]
 24534  	movups	xmmword ptr [rcx + 8*rax + 32], xmm0
 24535  	movups	xmmword ptr [rcx + 8*rax + 48], xmm1
 24536  	movups	xmm0, xmmword ptr [rdx + 8*rax + 64]
 24537  	movups	xmm1, xmmword ptr [rdx + 8*rax + 80]
 24538  	movups	xmmword ptr [rcx + 8*rax + 64], xmm0
 24539  	movups	xmmword ptr [rcx + 8*rax + 80], xmm1
 24540  	movupd	xmm0, xmmword ptr [rdx + 8*rax + 96]
 24541  	movupd	xmm1, xmmword ptr [rdx + 8*rax + 112]
 24542  	movupd	xmmword ptr [rcx + 8*rax + 96], xmm0
 24543  	movupd	xmmword ptr [rcx + 8*rax + 112], xmm1
 24544  	add	rax, 16 # 16 elements * 8 bytes = the 128 bytes copied above
 24545  	add	rdi, 4
 24546  	jne	.LBB3_551
 24547  .LBB3_552:
 24548  	test	r8, r8
 24549  	je	.LBB3_555 # no leftover 32-byte chunks
 24550  # %bb.553:
 24551  	lea	rax, [8*rax + 16] # element index -> byte offset, biased by 16 for the -16 / +0 pair
 24552  	neg	r8 # negative counter; stepped by +1 until it reaches zero
 24553  .LBB3_554:                              # =>This Inner Loop Header: Depth=1
 24554  	movupd	xmm0, xmmword ptr [rdx + rax - 16]
 24555  	movupd	xmm1, xmmword ptr [rdx + rax]
 24556  	movupd	xmmword ptr [rcx + rax - 16], xmm0
 24557  	movupd	xmmword ptr [rcx + rax], xmm1
 24558  	add	rax, 32
 24559  	inc	r8
 24560  	jne	.LBB3_554
 24561  .LBB3_555:
 24562  	cmp	rsi, r9
 24563  	je	.LBB3_923 # everything copied: go to the shared epilogue
 24564  	jmp	.LBB3_556 # leftover elements continue at .LBB3_556 (outside this view)
 24565  .LBB3_560:                              # Unmodified copy from [rdx] to [rcx], 8-byte elements: 128-byte loop, then 32-byte loop
        	# Same instruction sequence as the copy at .LBB3_550; only the labels differ.
        	# rdi, r8, rsi and r9 are set before this view. From their use below: rdi counts 32-byte chunks
        	# for the main loop, r8 counts leftover 32-byte chunks, rsi is the number of elements covered by
        	# the vector loops and r9 is the total element count -- TODO confirm against the setup code.
 24566  	and	rdi, -4
 24567  	neg	rdi # negative counter; stepped by +4 until it reaches zero
 24568  	xor	eax, eax # rax = element index
 24569  .LBB3_561:                              # =>This Inner Loop Header: Depth=1
 24570  	movups	xmm0, xmmword ptr [rdx + 8*rax]
 24571  	movups	xmm1, xmmword ptr [rdx + 8*rax + 16]
 24572  	movups	xmmword ptr [rcx + 8*rax], xmm0
 24573  	movups	xmmword ptr [rcx + 8*rax + 16], xmm1
 24574  	movups	xmm0, xmmword ptr [rdx + 8*rax + 32]
 24575  	movups	xmm1, xmmword ptr [rdx + 8*rax + 48]
 24576  	movups	xmmword ptr [rcx + 8*rax + 32], xmm0
 24577  	movups	xmmword ptr [rcx + 8*rax + 48], xmm1
 24578  	movups	xmm0, xmmword ptr [rdx + 8*rax + 64]
 24579  	movups	xmm1, xmmword ptr [rdx + 8*rax + 80]
 24580  	movups	xmmword ptr [rcx + 8*rax + 64], xmm0
 24581  	movups	xmmword ptr [rcx + 8*rax + 80], xmm1
 24582  	movupd	xmm0, xmmword ptr [rdx + 8*rax + 96]
 24583  	movupd	xmm1, xmmword ptr [rdx + 8*rax + 112]
 24584  	movupd	xmmword ptr [rcx + 8*rax + 96], xmm0
 24585  	movupd	xmmword ptr [rcx + 8*rax + 112], xmm1
 24586  	add	rax, 16 # 16 elements * 8 bytes = the 128 bytes copied above
 24587  	add	rdi, 4
 24588  	jne	.LBB3_561
 24589  .LBB3_562:
 24590  	test	r8, r8
 24591  	je	.LBB3_565 # no leftover 32-byte chunks
 24592  # %bb.563:
 24593  	lea	rax, [8*rax + 16] # element index -> byte offset, biased by 16 for the -16 / +0 pair
 24594  	neg	r8 # negative counter; stepped by +1 until it reaches zero
 24595  .LBB3_564:                              # =>This Inner Loop Header: Depth=1
 24596  	movupd	xmm0, xmmword ptr [rdx + rax - 16]
 24597  	movupd	xmm1, xmmword ptr [rdx + rax]
 24598  	movupd	xmmword ptr [rcx + rax - 16], xmm0
 24599  	movupd	xmmword ptr [rcx + rax], xmm1
 24600  	add	rax, 32
 24601  	inc	r8
 24602  	jne	.LBB3_564
 24603  .LBB3_565:
 24604  	cmp	rsi, r9
 24605  	je	.LBB3_923 # everything copied: go to the shared epilogue
 24606  	jmp	.LBB3_566 # leftover elements continue at .LBB3_566 (outside this view)
 24607  .LBB3_570:                              # Unmodified copy from [rdx] to [rcx], 2-byte elements: 128-byte loop, then 32-byte loop
        	# rdi, r8, rsi and r9 are set before this view. From their use below: rdi counts 32-byte chunks
        	# for the main loop, r8 counts leftover 32-byte chunks, rsi is the number of elements covered by
        	# the vector loops and r9 is the total element count -- TODO confirm against the setup code.
 24608  	and	rdi, -4
 24609  	neg	rdi # negative counter; stepped by +4 until it reaches zero
 24610  	xor	eax, eax # rax = element index
 24611  .LBB3_571:                              # =>This Inner Loop Header: Depth=1
 24612  	movups	xmm0, xmmword ptr [rdx + 2*rax]
 24613  	movups	xmm1, xmmword ptr [rdx + 2*rax + 16]
 24614  	movups	xmmword ptr [rcx + 2*rax], xmm0
 24615  	movups	xmmword ptr [rcx + 2*rax + 16], xmm1
 24616  	movups	xmm0, xmmword ptr [rdx + 2*rax + 32]
 24617  	movups	xmm1, xmmword ptr [rdx + 2*rax + 48]
 24618  	movups	xmmword ptr [rcx + 2*rax + 32], xmm0
 24619  	movups	xmmword ptr [rcx + 2*rax + 48], xmm1
 24620  	movups	xmm0, xmmword ptr [rdx + 2*rax + 64]
 24621  	movups	xmm1, xmmword ptr [rdx + 2*rax + 80]
 24622  	movups	xmmword ptr [rcx + 2*rax + 64], xmm0
 24623  	movups	xmmword ptr [rcx + 2*rax + 80], xmm1
 24624  	movupd	xmm0, xmmword ptr [rdx + 2*rax + 96]
 24625  	movupd	xmm1, xmmword ptr [rdx + 2*rax + 112]
 24626  	movupd	xmmword ptr [rcx + 2*rax + 96], xmm0
 24627  	movupd	xmmword ptr [rcx + 2*rax + 112], xmm1
 24628  	add	rax, 64 # 64 elements * 2 bytes = the 128 bytes copied above
 24629  	add	rdi, 4
 24630  	jne	.LBB3_571
 24631  .LBB3_572:
 24632  	test	r8, r8
 24633  	je	.LBB3_575 # no leftover 32-byte chunks
 24634  # %bb.573:
 24635  	add	rax, rax # element index -> byte offset (* 2)
 24636  	add	rax, 16 # biased by 16 for the -16 / +0 pair
 24637  	neg	r8 # negative counter; stepped by +1 until it reaches zero
 24638  .LBB3_574:                              # =>This Inner Loop Header: Depth=1
 24639  	movupd	xmm0, xmmword ptr [rdx + rax - 16]
 24640  	movupd	xmm1, xmmword ptr [rdx + rax]
 24641  	movupd	xmmword ptr [rcx + rax - 16], xmm0
 24642  	movupd	xmmword ptr [rcx + rax], xmm1
 24643  	add	rax, 32
 24644  	inc	r8
 24645  	jne	.LBB3_574
 24646  .LBB3_575:
 24647  	cmp	rsi, r9
 24648  	je	.LBB3_923 # everything copied: go to the shared epilogue
 24649  	jmp	.LBB3_576 # leftover elements continue at .LBB3_576 (outside this view)
 24650  .LBB3_580:                              # Unmodified copy from [rdx] to [rcx], 2-byte elements: 128-byte loop, then 32-byte loop
        	# Same instruction sequence as the copy at .LBB3_570; only the labels differ.
        	# rdi, r8, rsi and r9 are set before this view. From their use below: rdi counts 32-byte chunks
        	# for the main loop, r8 counts leftover 32-byte chunks, rsi is the number of elements covered by
        	# the vector loops and r9 is the total element count -- TODO confirm against the setup code.
 24651  	and	rdi, -4
 24652  	neg	rdi # negative counter; stepped by +4 until it reaches zero
 24653  	xor	eax, eax # rax = element index
 24654  .LBB3_581:                              # =>This Inner Loop Header: Depth=1
 24655  	movups	xmm0, xmmword ptr [rdx + 2*rax]
 24656  	movups	xmm1, xmmword ptr [rdx + 2*rax + 16]
 24657  	movups	xmmword ptr [rcx + 2*rax], xmm0
 24658  	movups	xmmword ptr [rcx + 2*rax + 16], xmm1
 24659  	movups	xmm0, xmmword ptr [rdx + 2*rax + 32]
 24660  	movups	xmm1, xmmword ptr [rdx + 2*rax + 48]
 24661  	movups	xmmword ptr [rcx + 2*rax + 32], xmm0
 24662  	movups	xmmword ptr [rcx + 2*rax + 48], xmm1
 24663  	movups	xmm0, xmmword ptr [rdx + 2*rax + 64]
 24664  	movups	xmm1, xmmword ptr [rdx + 2*rax + 80]
 24665  	movups	xmmword ptr [rcx + 2*rax + 64], xmm0
 24666  	movups	xmmword ptr [rcx + 2*rax + 80], xmm1
 24667  	movupd	xmm0, xmmword ptr [rdx + 2*rax + 96]
 24668  	movupd	xmm1, xmmword ptr [rdx + 2*rax + 112]
 24669  	movupd	xmmword ptr [rcx + 2*rax + 96], xmm0
 24670  	movupd	xmmword ptr [rcx + 2*rax + 112], xmm1
 24671  	add	rax, 64 # 64 elements * 2 bytes = the 128 bytes copied above
 24672  	add	rdi, 4
 24673  	jne	.LBB3_581
 24674  .LBB3_582:
 24675  	test	r8, r8
 24676  	je	.LBB3_585 # no leftover 32-byte chunks
 24677  # %bb.583:
 24678  	add	rax, rax # element index -> byte offset (* 2)
 24679  	add	rax, 16 # biased by 16 for the -16 / +0 pair
 24680  	neg	r8 # negative counter; stepped by +1 until it reaches zero
 24681  .LBB3_584:                              # =>This Inner Loop Header: Depth=1
 24682  	movupd	xmm0, xmmword ptr [rdx + rax - 16]
 24683  	movupd	xmm1, xmmword ptr [rdx + rax]
 24684  	movupd	xmmword ptr [rcx + rax - 16], xmm0
 24685  	movupd	xmmword ptr [rcx + rax], xmm1
 24686  	add	rax, 32
 24687  	inc	r8
 24688  	jne	.LBB3_584
 24689  .LBB3_585:
 24690  	cmp	rsi, r9
 24691  	je	.LBB3_923 # everything copied: go to the shared epilogue
 24692  	jmp	.LBB3_586 # leftover elements continue at .LBB3_586 (outside this view)
 24693  .LBB3_590:                              # Unmodified copy from [rdx] to [rcx], byte-indexed: 128-byte loop, then 32-byte loop
        	# Entered from the setup at .LBB3_511, which leaves rdi = number of 32-byte chunks,
        	# r8 = rdi & 3 (leftover chunks) and rsi = count rounded down to a multiple of 32.
        	# r9 is set before this view; it is compared with rsi below as the total count -- TODO confirm.
 24694  	and	rdi, -4
 24695  	neg	rdi # negative counter; stepped by +4 until it reaches zero
 24696  	xor	eax, eax # rax = byte offset
 24697  .LBB3_591:                              # =>This Inner Loop Header: Depth=1
 24698  	movups	xmm0, xmmword ptr [rdx + rax]
 24699  	movups	xmm1, xmmword ptr [rdx + rax + 16]
 24700  	movups	xmmword ptr [rcx + rax], xmm0
 24701  	movups	xmmword ptr [rcx + rax + 16], xmm1
 24702  	movups	xmm0, xmmword ptr [rdx + rax + 32]
 24703  	movups	xmm1, xmmword ptr [rdx + rax + 48]
 24704  	movups	xmmword ptr [rcx + rax + 32], xmm0
 24705  	movups	xmmword ptr [rcx + rax + 48], xmm1
 24706  	movups	xmm0, xmmword ptr [rdx + rax + 64]
 24707  	movups	xmm1, xmmword ptr [rdx + rax + 80]
 24708  	movups	xmmword ptr [rcx + rax + 64], xmm0
 24709  	movups	xmmword ptr [rcx + rax + 80], xmm1
 24710  	movupd	xmm0, xmmword ptr [rdx + rax + 96]
 24711  	movupd	xmm1, xmmword ptr [rdx + rax + 112]
 24712  	movupd	xmmword ptr [rcx + rax + 96], xmm0
 24713  	movupd	xmmword ptr [rcx + rax + 112], xmm1
 24714  	sub	rax, -128 # rax += 128, the bytes copied above
 24715  	add	rdi, 4
 24716  	jne	.LBB3_591
 24717  .LBB3_592:
 24718  	test	r8, r8
 24719  	je	.LBB3_595 # no leftover 32-byte chunks
 24720  # %bb.593:
 24721  	add	rax, 16 # bias the offset by 16 for the -16 / +0 pair
 24722  	neg	r8 # negative counter; stepped by +1 until it reaches zero
 24723  .LBB3_594:                              # =>This Inner Loop Header: Depth=1
 24724  	movupd	xmm0, xmmword ptr [rdx + rax - 16]
 24725  	movupd	xmm1, xmmword ptr [rdx + rax]
 24726  	movupd	xmmword ptr [rcx + rax - 16], xmm0
 24727  	movupd	xmmword ptr [rcx + rax], xmm1
 24728  	add	rax, 32
 24729  	inc	r8
 24730  	jne	.LBB3_594
 24731  .LBB3_595:
 24732  	cmp	rsi, r9
 24733  	je	.LBB3_923 # everything copied: go to the shared epilogue
 24734  	jmp	.LBB3_596 # leftover bytes continue at .LBB3_596 (outside this view)
 24735  .LBB3_600:                              # Unmodified copy from [rdx] to [rcx], byte-indexed: 128-byte loop, then 32-byte loop
        	# Entered from the setup at .LBB3_513, which leaves rdi = number of 32-byte chunks,
        	# r8 = rdi & 3 (leftover chunks) and rsi = count rounded down to a multiple of 32.
        	# r9 is set before this view; it is compared with rsi below as the total count -- TODO confirm.
 24736  	and	rdi, -4
 24737  	neg	rdi # negative counter; stepped by +4 until it reaches zero
 24738  	xor	eax, eax # rax = byte offset
 24739  .LBB3_601:                              # =>This Inner Loop Header: Depth=1
 24740  	movups	xmm0, xmmword ptr [rdx + rax]
 24741  	movups	xmm1, xmmword ptr [rdx + rax + 16]
 24742  	movups	xmmword ptr [rcx + rax], xmm0
 24743  	movups	xmmword ptr [rcx + rax + 16], xmm1
 24744  	movups	xmm0, xmmword ptr [rdx + rax + 32]
 24745  	movups	xmm1, xmmword ptr [rdx + rax + 48]
 24746  	movups	xmmword ptr [rcx + rax + 32], xmm0
 24747  	movups	xmmword ptr [rcx + rax + 48], xmm1
 24748  	movups	xmm0, xmmword ptr [rdx + rax + 64]
 24749  	movups	xmm1, xmmword ptr [rdx + rax + 80]
 24750  	movups	xmmword ptr [rcx + rax + 64], xmm0
 24751  	movups	xmmword ptr [rcx + rax + 80], xmm1
 24752  	movupd	xmm0, xmmword ptr [rdx + rax + 96]
 24753  	movupd	xmm1, xmmword ptr [rdx + rax + 112]
 24754  	movupd	xmmword ptr [rcx + rax + 96], xmm0
 24755  	movupd	xmmword ptr [rcx + rax + 112], xmm1
 24756  	sub	rax, -128 # rax += 128, the bytes copied above
 24757  	add	rdi, 4
 24758  	jne	.LBB3_601
 24759  .LBB3_602:
 24760  	test	r8, r8
 24761  	je	.LBB3_605 # no leftover 32-byte chunks
 24762  # %bb.603:
 24763  	add	rax, 16 # bias the offset by 16 for the -16 / +0 pair
 24764  	neg	r8 # negative counter; stepped by +1 until it reaches zero
 24765  .LBB3_604:                              # =>This Inner Loop Header: Depth=1
 24766  	movupd	xmm0, xmmword ptr [rdx + rax - 16]
 24767  	movupd	xmm1, xmmword ptr [rdx + rax]
 24768  	movupd	xmmword ptr [rcx + rax - 16], xmm0
 24769  	movupd	xmmword ptr [rcx + rax], xmm1
 24770  	add	rax, 32
 24771  	inc	r8
 24772  	jne	.LBB3_604
 24773  .LBB3_605:
 24774  	cmp	rsi, r9
 24775  	je	.LBB3_923 # everything copied: go to the shared epilogue
 24776  	jmp	.LBB3_606 # leftover bytes continue at .LBB3_606 (outside this view)
 24777  .LBB3_610:
 24778  	xor	edi, edi
 24779  .LBB3_611:
 24780  	test	r8b, 1
 24781  	je	.LBB3_613
 24782  # %bb.612:
 24783  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
 24784  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 24785  	pxor	xmm2, xmm2
 24786  	pxor	xmm3, xmm3
 24787  	psubd	xmm3, xmm0
 24788  	psubd	xmm2, xmm1
 24789  	movdqu	xmmword ptr [rcx + 4*rdi], xmm3
 24790  	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm2
 24791  .LBB3_613:
 24792  	cmp	rsi, r9
 24793  	je	.LBB3_923
 24794  	jmp	.LBB3_614
 24795  .LBB3_618:
 24796  	xor	edi, edi
 24797  .LBB3_619:
 24798  	test	r8b, 1
 24799  	je	.LBB3_621
 24800  # %bb.620:
 24801  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
 24802  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 24803  	pxor	xmm2, xmm2
 24804  	pcmpeqd	xmm0, xmm2
 24805  	movdqa	xmm3, xmmword ptr [rip + .LCPI3_3] # xmm3 = [1,1,1,1]
 24806  	pandn	xmm0, xmm3
 24807  	pcmpeqd	xmm1, xmm2
 24808  	pandn	xmm1, xmm3
 24809  	movdqu	xmmword ptr [rcx + 4*rdi], xmm0
 24810  	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm1
 24811  .LBB3_621:
 24812  	cmp	rsi, r9
 24813  	je	.LBB3_923
 24814  	jmp	.LBB3_622
 24815  .LBB3_626:
 24816  	xor	edi, edi
 24817  .LBB3_627:
 24818  	test	r8b, 1
 24819  	je	.LBB3_629
 24820  # %bb.628:
 24821  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
 24822  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
 24823  	movapd	xmm2, xmmword ptr [rip + .LCPI3_0] # xmm2 = [-0.0E+0,-0.0E+0]
 24824  	xorpd	xmm0, xmm2
 24825  	xorpd	xmm1, xmm2
 24826  	movupd	xmmword ptr [rcx + 8*rdi], xmm0
 24827  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm1
 24828  .LBB3_629:
 24829  	cmp	rsi, r9
 24830  	je	.LBB3_923
 24831  	jmp	.LBB3_630
 24832  .LBB3_636:
 24833  	xor	edi, edi
 24834  .LBB3_637:
 24835  	test	r8b, 1
 24836  	je	.LBB3_639
 24837  # %bb.638:
 24838  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
 24839  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
 24840  	movapd	xmm2, xmmword ptr [rip + .LCPI3_0] # xmm2 = [-0.0E+0,-0.0E+0]
 24841  	xorpd	xmm0, xmm2
 24842  	xorpd	xmm1, xmm2
 24843  	movupd	xmmword ptr [rcx + 8*rdi], xmm0
 24844  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm1
 24845  .LBB3_639:
 24846  	cmp	rsi, r9
 24847  	je	.LBB3_923
 24848  	jmp	.LBB3_640
 24849  .LBB3_646:
 24850  	xor	edi, edi
 24851  .LBB3_647:
 24852  	test	r8b, 1
 24853  	je	.LBB3_649
 24854  # %bb.648:
 24855  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
 24856  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
 24857  	xorpd	xmm2, xmm2
 24858  	movapd	xmm3, xmmword ptr [rip + .LCPI3_0] # xmm3 = [-0.0E+0,-0.0E+0]
 24859  	movapd	xmm4, xmm0
 24860  	andpd	xmm4, xmm3
 24861  	movapd	xmm5, xmmword ptr [rip + .LCPI3_1] # xmm5 = [1.0E+0,1.0E+0]
 24862  	orpd	xmm4, xmm5
 24863  	andpd	xmm3, xmm1
 24864  	orpd	xmm3, xmm5
 24865  	cmpneqpd	xmm0, xmm2
 24866  	andpd	xmm0, xmm4
 24867  	cmpneqpd	xmm1, xmm2
 24868  	andpd	xmm1, xmm3
 24869  	movupd	xmmword ptr [rcx + 8*rdi], xmm0
 24870  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm1
 24871  .LBB3_649:
 24872  	cmp	rsi, r9
 24873  	je	.LBB3_923
 24874  	jmp	.LBB3_650
 24875  .LBB3_655:
 24876  	xor	edi, edi
 24877  .LBB3_656:
 24878  	test	r8b, 1
 24879  	je	.LBB3_658
 24880  # %bb.657:
 24881  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
 24882  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
 24883  	movapd	xmm2, xmmword ptr [rip + .LCPI3_8] # xmm2 = [9223372036854775807,9223372036854775807]
 24884  	andpd	xmm0, xmm2
 24885  	andpd	xmm1, xmm2
 24886  	movupd	xmmword ptr [rcx + 8*rdi], xmm0
 24887  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm1
 24888  .LBB3_658:
 24889  	cmp	rsi, r9
 24890  	je	.LBB3_923
 24891  	jmp	.LBB3_659
 24892  .LBB3_663:
 24893  	xor	edi, edi
 24894  .LBB3_664:
 24895  	test	r8b, 1
 24896  	je	.LBB3_666
 24897  # %bb.665:
 24898  	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
 24899  	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
 24900  	movapd	xmm2, xmmword ptr [rip + .LCPI3_8] # xmm2 = [9223372036854775807,9223372036854775807]
 24901  	andpd	xmm0, xmm2
 24902  	andpd	xmm1, xmm2
 24903  	movupd	xmmword ptr [rcx + 8*rdi], xmm0
 24904  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm1
 24905  .LBB3_666:
 24906  	cmp	rsi, r9
 24907  	je	.LBB3_923
 24908  	jmp	.LBB3_667
 24909  .LBB3_671:
 24910  	xor	edi, edi
 24911  .LBB3_672:
 24912  	test	r8b, 1
 24913  	je	.LBB3_674
 24914  # %bb.673:
 24915  	movdqu	xmm0, xmmword ptr [rdx + rdi]
 24916  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
 24917  	pxor	xmm2, xmm2
 24918  	pxor	xmm3, xmm3
 24919  	psubb	xmm3, xmm0
 24920  	psubb	xmm2, xmm1
 24921  	movdqu	xmmword ptr [rcx + rdi], xmm3
 24922  	movdqu	xmmword ptr [rcx + rdi + 16], xmm2
 24923  .LBB3_674:
 24924  	cmp	rsi, r9
 24925  	je	.LBB3_923
 24926  	jmp	.LBB3_675
 24927  .LBB3_679:
 24928  	xor	edi, edi
 24929  .LBB3_680:
 24930  	test	r8b, 1
 24931  	je	.LBB3_682
 24932  # %bb.681:
 24933  	movdqu	xmm0, xmmword ptr [rdx + rdi]
 24934  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
 24935  	pxor	xmm2, xmm2
 24936  	pxor	xmm3, xmm3
 24937  	psubb	xmm3, xmm0
 24938  	psubb	xmm2, xmm1
 24939  	movdqu	xmmword ptr [rcx + rdi], xmm3
 24940  	movdqu	xmmword ptr [rcx + rdi + 16], xmm2
 24941  .LBB3_682:
 24942  	cmp	rsi, r9
 24943  	je	.LBB3_923
 24944  	jmp	.LBB3_683
 24945  .LBB3_687:
 24946  	xor	edi, edi
 24947  .LBB3_688:
 24948  	test	r8b, 1
 24949  	je	.LBB3_690
 24950  # %bb.689:
 24951  	movdqu	xmm1, xmmword ptr [rdx + rdi]
 24952  	movdqu	xmm2, xmmword ptr [rdx + rdi + 16]
 24953  	pxor	xmm3, xmm3
 24954  	movdqa	xmm4, xmmword ptr [rip + .LCPI3_6] # xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 24955  	movdqa	xmm0, xmm4
 24956  	pcmpgtb	xmm0, xmm1
 24957  	movdqa	xmm5, xmm1
 24958  	pcmpeqb	xmm5, xmm3
 24959  	pcmpeqd	xmm1, xmm1
 24960  	pxor	xmm5, xmm1
 24961  	pcmpeqb	xmm3, xmm2
 24962  	pxor	xmm3, xmm1
 24963  	movdqa	xmm1, xmm4
 24964  	pcmpgtb	xmm1, xmm2
 24965  	movdqa	xmm2, xmm4
 24966  	pblendvb	xmm2, xmm5, xmm0
 24967  	movdqa	xmm0, xmm1
 24968  	pblendvb	xmm4, xmm3, xmm0
 24969  	movdqu	xmmword ptr [rcx + rdi], xmm2
 24970  	movdqu	xmmword ptr [rcx + rdi + 16], xmm4
 24971  .LBB3_690:
 24972  	cmp	rsi, r9
 24973  	je	.LBB3_923
 24974  	jmp	.LBB3_691
 24975  .LBB3_696:
 24976  	xor	edi, edi
 24977  .LBB3_697:
 24978  	test	r8b, 1
 24979  	je	.LBB3_699
 24980  # %bb.698:
 24981  	pmovsxbd	xmm3, dword ptr [rdx + rdi + 12]
 24982  	pmovsxbd	xmm0, dword ptr [rdx + rdi + 8]
 24983  	pmovsxbd	xmm2, dword ptr [rdx + rdi + 4]
 24984  	pmovsxbd	xmm1, dword ptr [rdx + rdi]
 24985  	movdqa	xmm4, xmm1
 24986  	psrad	xmm4, 7
 24987  	movdqa	xmm5, xmm2
 24988  	psrad	xmm5, 7
 24989  	movdqa	xmm6, xmm0
 24990  	psrad	xmm6, 7
 24991  	movdqa	xmm7, xmm3
 24992  	psrad	xmm7, 7
 24993  	paddd	xmm3, xmm7
 24994  	paddd	xmm0, xmm6
 24995  	paddd	xmm2, xmm5
 24996  	paddd	xmm1, xmm4
 24997  	pxor	xmm1, xmm4
 24998  	pxor	xmm2, xmm5
 24999  	pxor	xmm0, xmm6
 25000  	pxor	xmm3, xmm7
 25001  	movdqa	xmm4, xmmword ptr [rip + .LCPI3_10] # xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 25002  	pand	xmm3, xmm4
 25003  	pand	xmm0, xmm4
 25004  	packusdw	xmm0, xmm3
 25005  	pand	xmm2, xmm4
 25006  	pand	xmm1, xmm4
 25007  	packusdw	xmm1, xmm2
 25008  	packuswb	xmm1, xmm0
 25009  	movdqu	xmmword ptr [rcx + rdi], xmm1
 25010  .LBB3_699:
 25011  	cmp	rsi, r9
 25012  	je	.LBB3_923
 25013  	jmp	.LBB3_700
 25014  .LBB3_704:
 25015  	xor	edi, edi
 25016  .LBB3_705:
 25017  	test	r8b, 1
 25018  	je	.LBB3_707
 25019  # %bb.706:
 25020  	pmovsxbd	xmm3, dword ptr [rdx + rdi + 12]
 25021  	pmovsxbd	xmm0, dword ptr [rdx + rdi + 8]
 25022  	pmovsxbd	xmm2, dword ptr [rdx + rdi + 4]
 25023  	pmovsxbd	xmm1, dword ptr [rdx + rdi]
 25024  	movdqa	xmm4, xmm1
 25025  	psrad	xmm4, 7
 25026  	movdqa	xmm5, xmm2
 25027  	psrad	xmm5, 7
 25028  	movdqa	xmm6, xmm0
 25029  	psrad	xmm6, 7
 25030  	movdqa	xmm7, xmm3
 25031  	psrad	xmm7, 7
 25032  	paddd	xmm3, xmm7
 25033  	paddd	xmm0, xmm6
 25034  	paddd	xmm2, xmm5
 25035  	paddd	xmm1, xmm4
 25036  	pxor	xmm1, xmm4
 25037  	pxor	xmm2, xmm5
 25038  	pxor	xmm0, xmm6
 25039  	pxor	xmm3, xmm7
 25040  	movdqa	xmm4, xmmword ptr [rip + .LCPI3_10] # xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 25041  	pand	xmm3, xmm4
 25042  	pand	xmm0, xmm4
 25043  	packusdw	xmm0, xmm3
 25044  	pand	xmm2, xmm4
 25045  	pand	xmm1, xmm4
 25046  	packusdw	xmm1, xmm2
 25047  	packuswb	xmm1, xmm0
 25048  	movdqu	xmmword ptr [rcx + rdi], xmm1
 25049  .LBB3_707:
 25050  	cmp	rsi, r9
 25051  	je	.LBB3_923
 25052  	jmp	.LBB3_708
 25053  .LBB3_712:
 25054  	xor	edi, edi
 25055  .LBB3_713:
 25056  	test	r8b, 1
 25057  	je	.LBB3_715
 25058  # %bb.714:
 25059  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
 25060  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
 25061  	pxor	xmm2, xmm2
 25062  	pxor	xmm3, xmm3
 25063  	psubq	xmm3, xmm0
 25064  	psubq	xmm2, xmm1
 25065  	movdqu	xmmword ptr [rcx + 8*rdi], xmm3
 25066  	movdqu	xmmword ptr [rcx + 8*rdi + 16], xmm2
 25067  .LBB3_715:
 25068  	cmp	rsi, r9
 25069  	je	.LBB3_923
 25070  	jmp	.LBB3_716
 25071  .LBB3_720:
 25072  	xor	edi, edi
 25073  .LBB3_721:
 25074  	test	r8b, 1
 25075  	je	.LBB3_723
 25076  # %bb.722:
 25077  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
 25078  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
 25079  	pxor	xmm2, xmm2
 25080  	pcmpeqq	xmm0, xmm2
 25081  	movdqa	xmm3, xmmword ptr [rip + .LCPI3_4] # xmm3 = [1,1]
 25082  	pandn	xmm0, xmm3
 25083  	pcmpeqq	xmm1, xmm2
 25084  	pandn	xmm1, xmm3
 25085  	movdqu	xmmword ptr [rcx + 8*rdi], xmm0
 25086  	movdqu	xmmword ptr [rcx + 8*rdi + 16], xmm1
 25087  .LBB3_723:
 25088  	cmp	rsi, r9
 25089  	je	.LBB3_923
 25090  	jmp	.LBB3_724
 25091  .LBB3_728:
 25092  	xor	edi, edi
 25093  .LBB3_729:
 25094  	test	r8b, 1
 25095  	je	.LBB3_731
 25096  # %bb.730:
 25097  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]
 25098  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
 25099  	pxor	xmm2, xmm2
 25100  	pxor	xmm3, xmm3
 25101  	psubw	xmm3, xmm0
 25102  	psubw	xmm2, xmm1
 25103  	movdqu	xmmword ptr [rcx + 2*rdi], xmm3
 25104  	movdqu	xmmword ptr [rcx + 2*rdi + 16], xmm2
 25105  .LBB3_731:
 25106  	cmp	rsi, r9
 25107  	je	.LBB3_923
 25108  	jmp	.LBB3_732
 25109  .LBB3_736:
 25110  	xor	edi, edi
 25111  .LBB3_737:
 25112  	test	r8b, 1
 25113  	je	.LBB3_739
 25114  # %bb.738:
 25115  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]
 25116  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
 25117  	pxor	xmm2, xmm2
 25118  	pxor	xmm3, xmm3
 25119  	psubw	xmm3, xmm0
 25120  	psubw	xmm2, xmm1
 25121  	movdqu	xmmword ptr [rcx + 2*rdi], xmm3
 25122  	movdqu	xmmword ptr [rcx + 2*rdi + 16], xmm2
 25123  .LBB3_739:
 25124  	cmp	rsi, r9
 25125  	je	.LBB3_923
 25126  	jmp	.LBB3_740
 25127  .LBB3_744:
 25128  	xor	edi, edi
 25129  .LBB3_745:
 25130  	test	r8b, 1
 25131  	je	.LBB3_747
 25132  # %bb.746:
 25133  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]
 25134  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
 25135  	pxor	xmm2, xmm2
 25136  	pxor	xmm3, xmm3
 25137  	psubw	xmm3, xmm0
 25138  	psubw	xmm2, xmm1
 25139  	movdqu	xmmword ptr [rcx + 2*rdi], xmm3
 25140  	movdqu	xmmword ptr [rcx + 2*rdi + 16], xmm2
 25141  .LBB3_747:
 25142  	cmp	rsi, r9
 25143  	je	.LBB3_923
 25144  	jmp	.LBB3_748
 25145  .LBB3_752:
 25146  	xor	edi, edi
 25147  .LBB3_753:
 25148  	test	r8b, 1
 25149  	je	.LBB3_755
 25150  # %bb.754:
 25151  	movdqu	xmm0, xmmword ptr [rdx + 2*rdi]
 25152  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi + 16]
 25153  	pxor	xmm2, xmm2
 25154  	pcmpeqw	xmm0, xmm2
 25155  	movdqa	xmm3, xmmword ptr [rip + .LCPI3_5] # xmm3 = [1,1,1,1,1,1,1,1]
 25156  	pandn	xmm0, xmm3
 25157  	pcmpeqw	xmm1, xmm2
 25158  	pandn	xmm1, xmm3
 25159  	movdqu	xmmword ptr [rcx + 2*rdi], xmm0
 25160  	movdqu	xmmword ptr [rcx + 2*rdi + 16], xmm1
 25161  .LBB3_755:
 25162  	cmp	rsi, r9
 25163  	je	.LBB3_923
 25164  	jmp	.LBB3_756
 25165  .LBB3_760:
 25166  	xor	edi, edi
 25167  .LBB3_761:
 25168  	test	r8b, 1
 25169  	je	.LBB3_763
 25170  # %bb.762:
 25171  	movdqu	xmm1, xmmword ptr [rdx + 2*rdi]
 25172  	movdqu	xmm2, xmmword ptr [rdx + 2*rdi + 16]
 25173  	pxor	xmm3, xmm3
 25174  	movdqa	xmm4, xmmword ptr [rip + .LCPI3_5] # xmm4 = [1,1,1,1,1,1,1,1]
 25175  	movdqa	xmm0, xmm4
 25176  	pcmpgtw	xmm0, xmm1
 25177  	movdqa	xmm5, xmm1
 25178  	pcmpeqw	xmm5, xmm3
 25179  	pcmpeqd	xmm1, xmm1
 25180  	pxor	xmm5, xmm1
 25181  	pcmpeqw	xmm3, xmm2
 25182  	pxor	xmm3, xmm1
 25183  	movdqa	xmm1, xmm4
 25184  	pcmpgtw	xmm1, xmm2
 25185  	movdqa	xmm2, xmm4
 25186  	pblendvb	xmm2, xmm5, xmm0
 25187  	movdqa	xmm0, xmm1
 25188  	pblendvb	xmm4, xmm3, xmm0
 25189  	movdqu	xmmword ptr [rcx + 2*rdi], xmm2
 25190  	movdqu	xmmword ptr [rcx + 2*rdi + 16], xmm4
 25191  .LBB3_763:
 25192  	cmp	rsi, r9
 25193  	je	.LBB3_923
 25194  	jmp	.LBB3_764
 25195  .LBB3_769:
 25196  	xor	edi, edi
 25197  .LBB3_770:
 25198  	test	r8b, 1
 25199  	je	.LBB3_772
 25200  # %bb.771:
 25201  	pmovsxwd	xmm0, qword ptr [rdx + 2*rdi + 8]
 25202  	pmovsxwd	xmm1, qword ptr [rdx + 2*rdi]
 25203  	movdqa	xmm2, xmm1
 25204  	psrad	xmm2, 15
 25205  	movdqa	xmm3, xmm0
 25206  	psrad	xmm3, 15
 25207  	paddd	xmm0, xmm3
 25208  	paddd	xmm1, xmm2
 25209  	pxor	xmm1, xmm2
 25210  	pxor	xmm0, xmm3
 25211  	pxor	xmm2, xmm2
 25212  	pblendw	xmm0, xmm2, 170                 # xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
 25213  	pblendw	xmm1, xmm2, 170                 # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
 25214  	packusdw	xmm1, xmm0
 25215  	movdqu	xmmword ptr [rcx + 2*rdi], xmm1
 25216  .LBB3_772:
 25217  	cmp	rsi, r9
 25218  	je	.LBB3_923
 25219  	jmp	.LBB3_773
 25220  .LBB3_777:
 25221  	xor	edi, edi
 25222  .LBB3_778:
 25223  	test	r8b, 1
 25224  	je	.LBB3_780
 25225  # %bb.779:
 25226  	pmovsxwd	xmm0, qword ptr [rdx + 2*rdi + 8]
 25227  	pmovsxwd	xmm1, qword ptr [rdx + 2*rdi]
 25228  	movdqa	xmm2, xmm1
 25229  	psrad	xmm2, 15
 25230  	movdqa	xmm3, xmm0
 25231  	psrad	xmm3, 15
 25232  	paddd	xmm0, xmm3
 25233  	paddd	xmm1, xmm2
 25234  	pxor	xmm1, xmm2
 25235  	pxor	xmm0, xmm3
 25236  	pxor	xmm2, xmm2
 25237  	pblendw	xmm0, xmm2, 170                 # xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
 25238  	pblendw	xmm1, xmm2, 170                 # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
 25239  	packusdw	xmm1, xmm0
 25240  	movdqu	xmmword ptr [rcx + 2*rdi], xmm1
 25241  .LBB3_780:
 25242  	cmp	rsi, r9
 25243  	je	.LBB3_923
 25244  	jmp	.LBB3_781
 25245  .LBB3_785:
 25246  	xor	edi, edi
 25247  .LBB3_786:
 25248  	test	r8b, 1
 25249  	je	.LBB3_788
 25250  # %bb.787:
 25251  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
 25252  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
 25253  	pxor	xmm2, xmm2
 25254  	pxor	xmm3, xmm3
 25255  	psubq	xmm3, xmm0
 25256  	psubq	xmm2, xmm1
 25257  	movdqu	xmmword ptr [rcx + 8*rdi], xmm3
 25258  	movdqu	xmmword ptr [rcx + 8*rdi + 16], xmm2
 25259  .LBB3_788:
 25260  	cmp	rsi, r9
 25261  	je	.LBB3_923
 25262  	jmp	.LBB3_789
 25263  .LBB3_793:
 25264  	xor	edi, edi
 25265  .LBB3_794:
 25266  	test	r8b, 1
 25267  	je	.LBB3_796
 25268  # %bb.795:
 25269  	movupd	xmm0, xmmword ptr [rdx + 4*rdi]
 25270  	movupd	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 25271  	movapd	xmm2, xmmword ptr [rip + .LCPI3_7] # xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 25272  	xorpd	xmm0, xmm2
 25273  	xorpd	xmm1, xmm2
 25274  	movupd	xmmword ptr [rcx + 4*rdi], xmm0
 25275  	movupd	xmmword ptr [rcx + 4*rdi + 16], xmm1
 25276  .LBB3_796:
 25277  	cmp	rsi, r9
 25278  	je	.LBB3_923
 25279  	jmp	.LBB3_797
 25280  .LBB3_803:
 25281  	xor	edi, edi
 25282  .LBB3_804:
 25283  	test	r8b, 1
 25284  	je	.LBB3_806
 25285  # %bb.805:
 25286  	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
 25287  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
 25288  	pxor	xmm2, xmm2
 25289  	pxor	xmm3, xmm3
 25290  	psubq	xmm3, xmm0
 25291  	psubq	xmm2, xmm1
 25292  	movdqu	xmmword ptr [rcx + 8*rdi], xmm3
 25293  	movdqu	xmmword ptr [rcx + 8*rdi + 16], xmm2
 25294  .LBB3_806:
 25295  	cmp	rsi, r9
 25296  	je	.LBB3_923
 25297  	jmp	.LBB3_807
 25298  .LBB3_811:
 25299  	xor	edi, edi
 25300  .LBB3_812:
 25301  	test	r8b, 1
 25302  	je	.LBB3_814
 25303  # %bb.813:
 25304  	movupd	xmm0, xmmword ptr [rdx + 4*rdi]
 25305  	movupd	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 25306  	movapd	xmm2, xmmword ptr [rip + .LCPI3_7] # xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 25307  	xorpd	xmm0, xmm2
 25308  	xorpd	xmm1, xmm2
 25309  	movupd	xmmword ptr [rcx + 4*rdi], xmm0
 25310  	movupd	xmmword ptr [rcx + 4*rdi + 16], xmm1
 25311  .LBB3_814:
 25312  	cmp	rsi, r9
 25313  	je	.LBB3_923
 25314  	jmp	.LBB3_815
 25315  .LBB3_821:
 25316  	xor	edi, edi
 25317  .LBB3_822:
 25318  	test	r8b, 1
 25319  	je	.LBB3_824
 25320  # %bb.823:
 25321  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 25322  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 25323  	pxor	xmm3, xmm3
 25324  	movdqa	xmm4, xmmword ptr [rip + .LCPI3_4] # xmm4 = [1,1]
 25325  	movdqa	xmm0, xmm4
 25326  	pcmpgtq	xmm0, xmm1
 25327  	movdqa	xmm5, xmm1
 25328  	pcmpeqq	xmm5, xmm3
 25329  	pcmpeqd	xmm1, xmm1
 25330  	pxor	xmm5, xmm1
 25331  	pcmpeqq	xmm3, xmm2
 25332  	pxor	xmm3, xmm1
 25333  	movdqa	xmm1, xmm4
 25334  	pcmpgtq	xmm1, xmm2
 25335  	movdqa	xmm2, xmm4
 25336  	blendvpd	xmm2, xmm5, xmm0
 25337  	movdqa	xmm0, xmm1
 25338  	blendvpd	xmm4, xmm3, xmm0
 25339  	movupd	xmmword ptr [rcx + 8*rdi], xmm2
 25340  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm4
 25341  .LBB3_824:
 25342  	cmp	rsi, r9
 25343  	je	.LBB3_923
 25344  	jmp	.LBB3_825
 25345  .LBB3_830:
 25346  	xor	edi, edi
 25347  .LBB3_831:
 25348  	test	r8b, 1
 25349  	je	.LBB3_833
 25350  # %bb.832:
 25351  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 25352  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 25353  	pxor	xmm3, xmm3
 25354  	pxor	xmm4, xmm4
 25355  	psubq	xmm4, xmm1
 25356  	movdqa	xmm0, xmm1
 25357  	blendvpd	xmm1, xmm4, xmm0
 25358  	psubq	xmm3, xmm2
 25359  	movdqa	xmm0, xmm2
 25360  	blendvpd	xmm2, xmm3, xmm0
 25361  	movupd	xmmword ptr [rcx + 8*rdi], xmm1
 25362  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm2
 25363  .LBB3_833:
 25364  	cmp	rsi, r9
 25365  	je	.LBB3_923
 25366  	jmp	.LBB3_834
 25367  .LBB3_838:
 25368  	xor	edi, edi
 25369  .LBB3_839:
 25370  	test	r8b, 1
 25371  	je	.LBB3_841
 25372  # %bb.840:
 25373  	movupd	xmm0, xmmword ptr [rdx + 4*rdi]
 25374  	movupd	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 25375  	movapd	xmm2, xmmword ptr [rip + .LCPI3_9] # xmm2 = [2147483647,2147483647,2147483647,2147483647]
 25376  	andpd	xmm0, xmm2
 25377  	andpd	xmm1, xmm2
 25378  	movupd	xmmword ptr [rcx + 4*rdi], xmm0
 25379  	movupd	xmmword ptr [rcx + 4*rdi + 16], xmm1
 25380  .LBB3_841:
 25381  	cmp	rsi, r9
 25382  	je	.LBB3_923
 25383  	jmp	.LBB3_842
 25384  .LBB3_848:
 25385  	xor	edi, edi
 25386  .LBB3_849:
 25387  	test	r8b, 1
 25388  	je	.LBB3_851
 25389  # %bb.850:
 25390  	movdqu	xmm1, xmmword ptr [rdx + 8*rdi]
 25391  	movdqu	xmm2, xmmword ptr [rdx + 8*rdi + 16]
 25392  	pxor	xmm3, xmm3
 25393  	pxor	xmm4, xmm4
 25394  	psubq	xmm4, xmm1
 25395  	movdqa	xmm0, xmm1
 25396  	blendvpd	xmm1, xmm4, xmm0
 25397  	psubq	xmm3, xmm2
 25398  	movdqa	xmm0, xmm2
 25399  	blendvpd	xmm2, xmm3, xmm0
 25400  	movupd	xmmword ptr [rcx + 8*rdi], xmm1
 25401  	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm2
 25402  .LBB3_851:
 25403  	cmp	rsi, r9
 25404  	je	.LBB3_923
 25405  	jmp	.LBB3_852
 25406  .LBB3_856:
 25407  	xor	edi, edi
 25408  .LBB3_857:
 25409  	test	r8b, 1
 25410  	je	.LBB3_859
 25411  # %bb.858:
 25412  	movupd	xmm0, xmmword ptr [rdx + 4*rdi]
 25413  	movupd	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 25414  	movapd	xmm2, xmmword ptr [rip + .LCPI3_9] # xmm2 = [2147483647,2147483647,2147483647,2147483647]
 25415  	andpd	xmm0, xmm2
 25416  	andpd	xmm1, xmm2
 25417  	movupd	xmmword ptr [rcx + 4*rdi], xmm0
 25418  	movupd	xmmword ptr [rcx + 4*rdi + 16], xmm1
 25419  .LBB3_859:
 25420  	cmp	rsi, r9
 25421  	je	.LBB3_923
 25422  	jmp	.LBB3_860
 25423  .LBB3_866:
 25424  	xor	edi, edi
 25425  .LBB3_867:
 25426  	test	r8b, 1
 25427  	je	.LBB3_869
 25428  # %bb.868:
 25429  	movdqu	xmm0, xmmword ptr [rdx + rdi]
 25430  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
 25431  	pxor	xmm2, xmm2
 25432  	pxor	xmm3, xmm3
 25433  	psubb	xmm3, xmm0
 25434  	psubb	xmm2, xmm1
 25435  	movdqu	xmmword ptr [rcx + rdi], xmm3
 25436  	movdqu	xmmword ptr [rcx + rdi + 16], xmm2
 25437  .LBB3_869:
 25438  	cmp	rsi, r9
 25439  	je	.LBB3_923
 25440  	jmp	.LBB3_870
 25441  .LBB3_874:
 25442  	xor	edi, edi
 25443  .LBB3_875:
 25444  	test	r8b, 1
 25445  	je	.LBB3_877
 25446  # %bb.876:
 25447  	movdqu	xmm0, xmmword ptr [rdx + rdi]
 25448  	movdqu	xmm1, xmmword ptr [rdx + rdi + 16]
 25449  	pxor	xmm2, xmm2
 25450  	pcmpeqb	xmm0, xmm2
 25451  	movdqa	xmm3, xmmword ptr [rip + .LCPI3_6] # xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 25452  	pandn	xmm0, xmm3
 25453  	pcmpeqb	xmm1, xmm2
 25454  	pandn	xmm1, xmm3
 25455  	movdqu	xmmword ptr [rcx + rdi], xmm0
 25456  	movdqu	xmmword ptr [rcx + rdi + 16], xmm1
 25457  .LBB3_877:
 25458  	cmp	rsi, r9
 25459  	je	.LBB3_923
 25460  	jmp	.LBB3_878
 25461  .LBB3_882:
 25462  	xor	edi, edi
 25463  .LBB3_883:
 25464  	test	r8b, 1
 25465  	je	.LBB3_885
 25466  # %bb.884:
 25467  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
 25468  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 25469  	pxor	xmm2, xmm2
 25470  	pxor	xmm3, xmm3
 25471  	psubd	xmm3, xmm0
 25472  	psubd	xmm2, xmm1
 25473  	movdqu	xmmword ptr [rcx + 4*rdi], xmm3
 25474  	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm2
 25475  .LBB3_885:
 25476  	cmp	rsi, r9
 25477  	je	.LBB3_923
 25478  	jmp	.LBB3_886
 25479  .LBB3_890:
 25480  	xor	edi, edi
 25481  .LBB3_891:
 25482  	test	r8b, 1
 25483  	je	.LBB3_893
 25484  # %bb.892:
 25485  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
 25486  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 25487  	pxor	xmm2, xmm2
 25488  	pxor	xmm3, xmm3
 25489  	psubd	xmm3, xmm0
 25490  	psubd	xmm2, xmm1
 25491  	movdqu	xmmword ptr [rcx + 4*rdi], xmm3
 25492  	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm2
 25493  .LBB3_893:
 25494  	cmp	rsi, r9
 25495  	je	.LBB3_923
 25496  	jmp	.LBB3_894
 25497  .LBB3_898:
 25498  	xor	edi, edi
 25499  .LBB3_899:
 25500  	test	r8b, 1
 25501  	je	.LBB3_901
 25502  # %bb.900:
 25503  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi]
 25504  	movdqu	xmm2, xmmword ptr [rdx + 4*rdi + 16]
 25505  	pxor	xmm3, xmm3
 25506  	movdqa	xmm4, xmmword ptr [rip + .LCPI3_3] # xmm4 = [1,1,1,1]
 25507  	movdqa	xmm0, xmm4
 25508  	pcmpgtd	xmm0, xmm1
 25509  	movdqa	xmm5, xmm1
 25510  	pcmpeqd	xmm5, xmm3
 25511  	pcmpeqd	xmm1, xmm1
 25512  	pxor	xmm5, xmm1
 25513  	pcmpeqd	xmm3, xmm2
 25514  	pxor	xmm3, xmm1
 25515  	movdqa	xmm1, xmm4
 25516  	pcmpgtd	xmm1, xmm2
 25517  	movdqa	xmm2, xmm4
 25518  	blendvps	xmm2, xmm5, xmm0
 25519  	movdqa	xmm0, xmm1
 25520  	blendvps	xmm4, xmm3, xmm0
 25521  	movups	xmmword ptr [rcx + 4*rdi], xmm2
 25522  	movups	xmmword ptr [rcx + 4*rdi + 16], xmm4
 25523  .LBB3_901:
 25524  	cmp	rsi, r9
 25525  	je	.LBB3_923
 25526  	jmp	.LBB3_902
 25527  .LBB3_907:
 25528  	xor	edi, edi
 25529  .LBB3_908:
 25530  	test	r8b, 1
 25531  	je	.LBB3_910
 25532  # %bb.909:
 25533  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
 25534  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 25535  	pabsd	xmm0, xmm0
 25536  	pabsd	xmm1, xmm1
 25537  	movdqu	xmmword ptr [rcx + 4*rdi], xmm0
 25538  	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm1
 25539  .LBB3_910:
 25540  	cmp	rsi, r9
 25541  	je	.LBB3_923
 25542  	jmp	.LBB3_911
 25543  .LBB3_915:
 25544  	xor	edi, edi
 25545  .LBB3_916:
 25546  	test	r8b, 1
 25547  	je	.LBB3_918
 25548  # %bb.917:
 25549  	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
 25550  	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
 25551  	pabsd	xmm0, xmm0
 25552  	pabsd	xmm1, xmm1
 25553  	movdqu	xmmword ptr [rcx + 4*rdi], xmm0
 25554  	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm1
 25555  .LBB3_918:
 25556  	cmp	rsi, r9
 25557  	je	.LBB3_923
 25558  	jmp	.LBB3_919
 25559  .Lfunc_end3:
 25560  	.size	arithmetic_unary_same_types_sse4, .Lfunc_end3-arithmetic_unary_same_types_sse4
 25561                                          # -- End function
 25562  	.section	.rodata.cst16,"aM",@progbits,16
        # 16-byte constant pool emitted ahead of arithmetic_unary_diff_type_sse4.
        # Every .LCPI4_* entry in this section is exactly 16 bytes (one XMM
        # register wide) and the section is 16-byte aligned via .p2align 4.
        # Label numbers absent here (2, 6, 13 and 5, 9, 14) are defined in the
        # .rodata.cst8 and .rodata.cst4 pools that follow this section.
        # NOTE(review): the instructions that load these constants are not in
        # this part of the listing, so remarks about intended use are hedged.
 25563  	.p2align	4                               # -- Begin function arithmetic_unary_diff_type_sse4
        # Two 64-bit lanes with only bit 63 set (the IEEE-754 double sign bit).
 25564  .LCPI4_0:
 25565  	.quad	0x8000000000000000              # double -0
 25566  	.quad	0x8000000000000000              # double -0
        # Two double lanes holding 1.0.
 25567  .LCPI4_1:
 25568  	.quad	0x3ff0000000000000              # double 1
 25569  	.quad	0x3ff0000000000000              # double 1
        # Four 32-bit lanes of 0x7fffffff: every bit set except bit 31.
        # The "float NaN" annotation is just the float reading of that bit
        # pattern. NOTE(review): presumably a mask that clears the float sign
        # bit -- confirm at the use site.
 25570  .LCPI4_3:
 25571  	.long	0x7fffffff                      # float NaN
 25572  	.long	0x7fffffff                      # float NaN
 25573  	.long	0x7fffffff                      # float NaN
 25574  	.long	0x7fffffff                      # float NaN
        # Four 32-bit lanes with only bit 31 set (the IEEE-754 float sign bit).
 25575  .LCPI4_4:
 25576  	.long	0x80000000                      # float -0
 25577  	.long	0x80000000                      # float -0
 25578  	.long	0x80000000                      # float -0
 25579  	.long	0x80000000                      # float -0
        # Byte vector: values 0 and 4 in the first two bytes, remaining 14
        # bytes zero-filled. NOTE(review): looks like a byte-shuffle control
        # selecting source bytes 0 and 4 -- confirm at the use site.
 25580  .LCPI4_7:
 25581  	.byte	0                               # 0x0
 25582  	.byte	4                               # 0x4
 25583  	.zero	1
 25584  	.zero	1
 25585  	.zero	1
 25586  	.zero	1
 25587  	.zero	1
 25588  	.zero	1
 25589  	.zero	1
 25590  	.zero	1
 25591  	.zero	1
 25592  	.zero	1
 25593  	.zero	1
 25594  	.zero	1
 25595  	.zero	1
 25596  	.zero	1
        # Four 32-bit integer lanes holding 1.
 25597  .LCPI4_8:
 25598  	.long	1                               # 0x1
 25599  	.long	1                               # 0x1
 25600  	.long	1                               # 0x1
 25601  	.long	1                               # 0x1
        # Four float lanes of 0x4f000000 = 2^31 (biased exponent 158, zero
        # mantissa), i.e. one past the largest signed 32-bit integer.
 25602  .LCPI4_10:
 25603  	.long	0x4f000000                      # float 2.14748365E+9
 25604  	.long	0x4f000000                      # float 2.14748365E+9
 25605  	.long	0x4f000000                      # float 2.14748365E+9
 25606  	.long	0x4f000000                      # float 2.14748365E+9
        # Four 16-bit lanes holding 1 in the low 8 bytes; upper 8 bytes are
        # zero-filled padding up to the 16-byte entry size.
 25607  .LCPI4_11:
 25608  	.short	1                               # 0x1
 25609  	.short	1                               # 0x1
 25610  	.short	1                               # 0x1
 25611  	.short	1                               # 0x1
 25612  	.zero	2
 25613  	.zero	2
 25614  	.zero	2
 25615  	.zero	2
        # Four byte lanes holding 1 in the low 4 bytes; the other 12 bytes are
        # zero-filled.
 25616  .LCPI4_12:
 25617  	.byte	1                               # 0x1
 25618  	.byte	1                               # 0x1
 25619  	.byte	1                               # 0x1
 25620  	.byte	1                               # 0x1
 25621  	.zero	1
 25622  	.zero	1
 25623  	.zero	1
 25624  	.zero	1
 25625  	.zero	1
 25626  	.zero	1
 25627  	.zero	1
 25628  	.zero	1
 25629  	.zero	1
 25630  	.zero	1
 25631  	.zero	1
 25632  	.zero	1
        # Two 64-bit integer lanes holding 1.
 25633  .LCPI4_15:
 25634  	.quad	1                               # 0x1
 25635  	.quad	1                               # 0x1
        # Two 32-bit lanes holding 1 in the low 8 bytes; upper 8 bytes zero.
 25636  .LCPI4_16:
 25637  	.long	1                               # 0x1
 25638  	.long	1                               # 0x1
 25639  	.zero	4
 25640  	.zero	4
        # Two 16-bit lanes holding 1 in the low 4 bytes; the other 12 bytes
        # are zero-filled.
 25641  .LCPI4_17:
 25642  	.short	1                               # 0x1
 25643  	.short	1                               # 0x1
 25644  	.zero	2
 25645  	.zero	2
 25646  	.zero	2
 25647  	.zero	2
 25648  	.zero	2
 25649  	.zero	2
        # Two byte lanes holding 1 in the low 2 bytes; the other 14 bytes are
        # zero-filled.
 25650  .LCPI4_18:
 25651  	.byte	1                               # 0x1
 25652  	.byte	1                               # 0x1
 25653  	.zero	1
 25654  	.zero	1
 25655  	.zero	1
 25656  	.zero	1
 25657  	.zero	1
 25658  	.zero	1
 25659  	.zero	1
 25660  	.zero	1
 25661  	.zero	1
 25662  	.zero	1
 25663  	.zero	1
 25664  	.zero	1
 25665  	.zero	1
 25666  	.zero	1
        # Four float lanes holding 1.0.
 25667  .LCPI4_19:
 25668  	.long	0x3f800000                      # float 1
 25669  	.long	0x3f800000                      # float 1
 25670  	.long	0x3f800000                      # float 1
 25671  	.long	0x3f800000                      # float 1
        # Eight 16-bit integer lanes holding 1 (full 16 bytes).
 25672  .LCPI4_20:
 25673  	.short	1                               # 0x1
 25674  	.short	1                               # 0x1
 25675  	.short	1                               # 0x1
 25676  	.short	1                               # 0x1
 25677  	.short	1                               # 0x1
 25678  	.short	1                               # 0x1
 25679  	.short	1                               # 0x1
 25680  	.short	1                               # 0x1
        # Eight byte lanes holding 1 in the low 8 bytes; upper 8 bytes zero.
 25681  .LCPI4_21:
 25682  	.byte	1                               # 0x1
 25683  	.byte	1                               # 0x1
 25684  	.byte	1                               # 0x1
 25685  	.byte	1                               # 0x1
 25686  	.byte	1                               # 0x1
 25687  	.byte	1                               # 0x1
 25688  	.byte	1                               # 0x1
 25689  	.byte	1                               # 0x1
 25690  	.zero	1
 25691  	.zero	1
 25692  	.zero	1
 25693  	.zero	1
 25694  	.zero	1
 25695  	.zero	1
 25696  	.zero	1
 25697  	.zero	1
        # Sixteen byte lanes holding 1: ".zero 16,1" emits 16 bytes, each
        # filled with the value 1 (second operand is the fill byte).
 25698  .LCPI4_22:
 25699  	.zero	16,1
 25700  	.section	.rodata.cst8,"aM",@progbits,8
        # 8-byte constant pool: one 64-bit (double-width) scalar per label,
        # 8-byte aligned via .p2align 3.
        # NOTE(review): the loads that reference these labels are not in this
        # part of the listing; remarks below describe the bit patterns only.
 25701  	.p2align	3
        # Scalar double 1.0.
 25702  .LCPI4_2:
 25703  	.quad	0x3ff0000000000000              # double 1
        # Scalar double 2^63 (biased exponent 0x43e = 1086, zero mantissa),
        # i.e. one past the largest signed 64-bit integer.
 25704  .LCPI4_6:
 25705  	.quad	0x43e0000000000000              # double 9.2233720368547758E+18
        # Scalar double -1.0.
 25706  .LCPI4_13:
 25707  	.quad	0xbff0000000000000              # double -1
 25708  	.section	.rodata.cst4,"aM",@progbits,4
        # 4-byte constant pool: one 32-bit (float-width) scalar per label,
        # 4-byte aligned via .p2align 2.
        # NOTE(review): the loads that reference these labels are not in this
        # part of the listing; remarks below describe the bit patterns only.
 25709  	.p2align	2
        # Scalar float 1.0.
 25710  .LCPI4_5:
 25711  	.long	0x3f800000                      # float 1
        # Scalar float 2^63 (biased exponent 0xbe = 190, zero mantissa),
        # i.e. one past the largest signed 64-bit integer.
 25712  .LCPI4_9:
 25713  	.long	0x5f000000                      # float 9.22337203E+18
        # Scalar float -1.0.
 25714  .LCPI4_14:
 25715  	.long	0xbf800000                      # float -1
 25716  	.text
 25717  	.globl	arithmetic_unary_diff_type_sse4
 25718  	.p2align	4, 0x90
 25719  	.type	arithmetic_unary_diff_type_sse4,@function
 25720  arithmetic_unary_diff_type_sse4:        # @arithmetic_unary_diff_type_sse4
 25721  # %bb.0:
 25722  	push	rbp
 25723  	mov	rbp, rsp
 25724  	push	r14
 25725  	push	rbx
 25726  	and	rsp, -8
 25727  	cmp	dl, 20
 25728  	jne	.LBB4_1655
 25729  # %bb.1:
 25730  	cmp	edi, 6
 25731  	jg	.LBB4_14
 25732  # %bb.2:
 25733  	cmp	edi, 3
 25734  	jle	.LBB4_26
 25735  # %bb.3:
 25736  	cmp	edi, 4
 25737  	je	.LBB4_46
 25738  # %bb.4:
 25739  	cmp	edi, 5
 25740  	je	.LBB4_54
 25741  # %bb.5:
 25742  	cmp	edi, 6
 25743  	jne	.LBB4_1655
 25744  # %bb.6:
 25745  	cmp	esi, 6
 25746  	jg	.LBB4_94
 25747  # %bb.7:
 25748  	cmp	esi, 3
 25749  	jle	.LBB4_200
 25750  # %bb.8:
 25751  	cmp	esi, 4
 25752  	je	.LBB4_303
 25753  # %bb.9:
 25754  	cmp	esi, 5
 25755  	je	.LBB4_306
 25756  # %bb.10:
 25757  	cmp	esi, 6
 25758  	jne	.LBB4_1655
 25759  # %bb.11:
 25760  	test	r9d, r9d
 25761  	jle	.LBB4_1655
 25762  # %bb.12:
 25763  	mov	r10d, r9d
 25764  	cmp	r9d, 8
 25765  	jb	.LBB4_13
 25766  # %bb.494:
 25767  	lea	rdx, [rcx + 4*r10]
 25768  	cmp	rdx, r8
 25769  	jbe	.LBB4_496
 25770  # %bb.495:
 25771  	lea	rdx, [r8 + 4*r10]
 25772  	cmp	rdx, rcx
 25773  	jbe	.LBB4_496
 25774  .LBB4_13:
 25775  	xor	edx, edx
 25776  .LBB4_1232:
 25777  	mov	rsi, rdx
 25778  	not	rsi
 25779  	add	rsi, r10
 25780  	mov	rdi, r10
 25781  	and	rdi, 3
 25782  	je	.LBB4_1234
 25783  .LBB4_1233:                             # =>This Inner Loop Header: Depth=1
 25784  	xor	eax, eax
 25785  	cmp	dword ptr [rcx + 4*rdx], 0
 25786  	setne	al
 25787  	mov	dword ptr [r8 + 4*rdx], eax
 25788  	add	rdx, 1
 25789  	add	rdi, -1
 25790  	jne	.LBB4_1233
 25791  .LBB4_1234:
 25792  	cmp	rsi, 3
 25793  	jb	.LBB4_1655
 25794  .LBB4_1235:                             # =>This Inner Loop Header: Depth=1
 25795  	xor	eax, eax
 25796  	cmp	dword ptr [rcx + 4*rdx], 0
 25797  	setne	al
 25798  	mov	dword ptr [r8 + 4*rdx], eax
 25799  	xor	eax, eax
 25800  	cmp	dword ptr [rcx + 4*rdx + 4], 0
 25801  	setne	al
 25802  	mov	dword ptr [r8 + 4*rdx + 4], eax
 25803  	xor	eax, eax
 25804  	cmp	dword ptr [rcx + 4*rdx + 8], 0
 25805  	setne	al
 25806  	mov	dword ptr [r8 + 4*rdx + 8], eax
 25807  	xor	eax, eax
 25808  	cmp	dword ptr [rcx + 4*rdx + 12], 0
 25809  	setne	al
 25810  	mov	dword ptr [r8 + 4*rdx + 12], eax
 25811  	add	rdx, 4
 25812  	cmp	r10, rdx
 25813  	jne	.LBB4_1235
 25814  	jmp	.LBB4_1655
 25815  .LBB4_14:
 25816  	cmp	edi, 8
 25817  	jle	.LBB4_36
 25818  # %bb.15:
 25819  	cmp	edi, 9
 25820  	je	.LBB4_62
 25821  # %bb.16:
 25822  	cmp	edi, 11
 25823  	je	.LBB4_70
 25824  # %bb.17:
 25825  	cmp	edi, 12
 25826  	jne	.LBB4_1655
 25827  # %bb.18:
 25828  	cmp	esi, 6
 25829  	jg	.LBB4_106
 25830  # %bb.19:
 25831  	cmp	esi, 3
 25832  	jle	.LBB4_205
 25833  # %bb.20:
 25834  	cmp	esi, 4
 25835  	je	.LBB4_309
 25836  # %bb.21:
 25837  	cmp	esi, 5
 25838  	je	.LBB4_312
 25839  # %bb.22:
 25840  	cmp	esi, 6
 25841  	jne	.LBB4_1655
 25842  # %bb.23:
 25843  	test	r9d, r9d
 25844  	jle	.LBB4_1655
 25845  # %bb.24:
 25846  	mov	r11d, r9d
 25847  	xor	r10d, r10d
 25848  	cmp	r9d, 4
 25849  	jae	.LBB4_499
 25850  # %bb.25:
 25851  	xor	esi, esi
 25852  	jmp	.LBB4_1110
 25853  .LBB4_26:
 25854  	cmp	edi, 2
 25855  	je	.LBB4_78
 25856  # %bb.27:
 25857  	cmp	edi, 3
 25858  	jne	.LBB4_1655
 25859  # %bb.28:
 25860  	cmp	esi, 6
 25861  	jg	.LBB4_113
 25862  # %bb.29:
 25863  	cmp	esi, 3
 25864  	jle	.LBB4_210
 25865  # %bb.30:
 25866  	cmp	esi, 4
 25867  	je	.LBB4_315
 25868  # %bb.31:
 25869  	cmp	esi, 5
 25870  	je	.LBB4_318
 25871  # %bb.32:
 25872  	cmp	esi, 6
 25873  	jne	.LBB4_1655
 25874  # %bb.33:
 25875  	test	r9d, r9d
 25876  	jle	.LBB4_1655
 25877  # %bb.34:
 25878  	mov	r10d, r9d
 25879  	cmp	r9d, 8
 25880  	jb	.LBB4_35
 25881  # %bb.502:
 25882  	lea	rdx, [rcx + r10]
 25883  	cmp	rdx, r8
 25884  	jbe	.LBB4_504
 25885  # %bb.503:
 25886  	lea	rdx, [r8 + 4*r10]
 25887  	cmp	rdx, rcx
 25888  	jbe	.LBB4_504
 25889  .LBB4_35:
 25890  	xor	edx, edx
 25891  .LBB4_1240:
 25892  	mov	rsi, rdx
 25893  	not	rsi
 25894  	test	r10b, 1
 25895  	je	.LBB4_1242
 25896  # %bb.1241:
 25897  	mov	r9b, byte ptr [rcx + rdx]
 25898  	xor	edi, edi
 25899  	test	r9b, r9b
 25900  	setne	dil
 25901  	neg	edi
 25902  	test	r9b, r9b
 25903  	mov	eax, 1
 25904  	cmovle	eax, edi
 25905  	mov	dword ptr [r8 + 4*rdx], eax
 25906  	or	rdx, 1
 25907  .LBB4_1242:
 25908  	add	rsi, r10
 25909  	je	.LBB4_1655
 25910  # %bb.1243:
 25911  	mov	esi, 1
 25912  .LBB4_1244:                             # =>This Inner Loop Header: Depth=1
 25913  	movzx	eax, byte ptr [rcx + rdx]
 25914  	xor	edi, edi
 25915  	test	al, al
 25916  	setne	dil
 25917  	neg	edi
 25918  	test	al, al
 25919  	cmovg	edi, esi
 25920  	mov	dword ptr [r8 + 4*rdx], edi
 25921  	movzx	eax, byte ptr [rcx + rdx + 1]
 25922  	xor	edi, edi
 25923  	test	al, al
 25924  	setne	dil
 25925  	neg	edi
 25926  	test	al, al
 25927  	cmovg	edi, esi
 25928  	mov	dword ptr [r8 + 4*rdx + 4], edi
 25929  	add	rdx, 2
 25930  	cmp	r10, rdx
 25931  	jne	.LBB4_1244
 25932  	jmp	.LBB4_1655
 25933  .LBB4_36:
 25934  	cmp	edi, 7
 25935  	je	.LBB4_86
 25936  # %bb.37:
 25937  	cmp	edi, 8
 25938  	jne	.LBB4_1655
 25939  # %bb.38:
 25940  	cmp	esi, 6
 25941  	jg	.LBB4_123
 25942  # %bb.39:
 25943  	cmp	esi, 3
 25944  	jle	.LBB4_215
 25945  # %bb.40:
 25946  	cmp	esi, 4
 25947  	je	.LBB4_321
 25948  # %bb.41:
 25949  	cmp	esi, 5
 25950  	je	.LBB4_324
 25951  # %bb.42:
 25952  	cmp	esi, 6
 25953  	jne	.LBB4_1655
 25954  # %bb.43:
 25955  	test	r9d, r9d
 25956  	jle	.LBB4_1655
 25957  # %bb.44:
 25958  	mov	eax, r9d
 25959  	cmp	r9d, 4
 25960  	jae	.LBB4_507
 25961  # %bb.45:
 25962  	xor	edx, edx
 25963  	jmp	.LBB4_998
 25964  .LBB4_46:
 25965  	cmp	esi, 6
 25966  	jg	.LBB4_135
 25967  # %bb.47:
 25968  	cmp	esi, 3
 25969  	jle	.LBB4_220
 25970  # %bb.48:
 25971  	cmp	esi, 4
 25972  	je	.LBB4_327
 25973  # %bb.49:
 25974  	cmp	esi, 5
 25975  	je	.LBB4_330
 25976  # %bb.50:
 25977  	cmp	esi, 6
 25978  	jne	.LBB4_1655
 25979  # %bb.51:
 25980  	test	r9d, r9d
 25981  	jle	.LBB4_1655
 25982  # %bb.52:
 25983  	mov	eax, r9d
 25984  	cmp	r9d, 8
 25985  	jae	.LBB4_510
 25986  # %bb.53:
 25987  	xor	edx, edx
 25988  	jmp	.LBB4_1116
 25989  .LBB4_54:
 25990  	cmp	esi, 6
 25991  	jg	.LBB4_147
 25992  # %bb.55:
 25993  	cmp	esi, 3
 25994  	jle	.LBB4_225
 25995  # %bb.56:
 25996  	cmp	esi, 4
 25997  	je	.LBB4_333
 25998  # %bb.57:
 25999  	cmp	esi, 5
 26000  	je	.LBB4_336
 26001  # %bb.58:
 26002  	cmp	esi, 6
 26003  	jne	.LBB4_1655
 26004  # %bb.59:
 26005  	test	r9d, r9d
 26006  	jle	.LBB4_1655
 26007  # %bb.60:
 26008  	mov	r10d, r9d
 26009  	cmp	r9d, 8
 26010  	jae	.LBB4_513
 26011  # %bb.61:
 26012  	xor	edx, edx
 26013  	jmp	.LBB4_1121
 26014  .LBB4_62:
 26015  	cmp	esi, 6
 26016  	jg	.LBB4_157
 26017  # %bb.63:
 26018  	cmp	esi, 3
 26019  	jle	.LBB4_230
 26020  # %bb.64:
 26021  	cmp	esi, 4
 26022  	je	.LBB4_339
 26023  # %bb.65:
 26024  	cmp	esi, 5
 26025  	je	.LBB4_342
 26026  # %bb.66:
 26027  	cmp	esi, 6
 26028  	jne	.LBB4_1655
 26029  # %bb.67:
 26030  	test	r9d, r9d
 26031  	jle	.LBB4_1655
 26032  # %bb.68:
 26033  	mov	r10d, r9d
 26034  	cmp	r9d, 4
 26035  	jae	.LBB4_516
 26036  # %bb.69:
 26037  	xor	edx, edx
 26038  	jmp	.LBB4_1127
 26039  .LBB4_70:
 26040  	cmp	esi, 6
 26041  	jg	.LBB4_167
 26042  # %bb.71:
 26043  	cmp	esi, 3
 26044  	jle	.LBB4_235
 26045  # %bb.72:
 26046  	cmp	esi, 4
 26047  	je	.LBB4_345
 26048  # %bb.73:
 26049  	cmp	esi, 5
 26050  	je	.LBB4_348
 26051  # %bb.74:
 26052  	cmp	esi, 6
 26053  	jne	.LBB4_1655
 26054  # %bb.75:
 26055  	test	r9d, r9d
 26056  	jle	.LBB4_1655
 26057  # %bb.76:
 26058  	mov	eax, r9d
 26059  	cmp	r9d, 4
 26060  	jae	.LBB4_519
 26061  # %bb.77:
 26062  	xor	edx, edx
 26063  	jmp	.LBB4_1133
 26064  .LBB4_78:
 26065  	cmp	esi, 6
 26066  	jg	.LBB4_178
 26067  # %bb.79:
 26068  	cmp	esi, 3
 26069  	jle	.LBB4_240
 26070  # %bb.80:
 26071  	cmp	esi, 4
 26072  	je	.LBB4_351
 26073  # %bb.81:
 26074  	cmp	esi, 5
 26075  	je	.LBB4_354
 26076  # %bb.82:
 26077  	cmp	esi, 6
 26078  	jne	.LBB4_1655
 26079  # %bb.83:
 26080  	test	r9d, r9d
 26081  	jle	.LBB4_1655
 26082  # %bb.84:
 26083  	mov	r10d, r9d
 26084  	cmp	r9d, 8
 26085  	jb	.LBB4_85
 26086  # %bb.522:
 26087  	lea	rdx, [rcx + r10]
 26088  	cmp	rdx, r8
 26089  	jbe	.LBB4_524
 26090  # %bb.523:
 26091  	lea	rdx, [r8 + 4*r10]
 26092  	cmp	rdx, rcx
 26093  	jbe	.LBB4_524
 26094  .LBB4_85:
 26095  	xor	edx, edx
 26096  .LBB4_1249:
 26097  	mov	rsi, rdx
 26098  	not	rsi
 26099  	add	rsi, r10
 26100  	mov	rdi, r10
 26101  	and	rdi, 3
 26102  	je	.LBB4_1251
 26103  .LBB4_1250:                             # =>This Inner Loop Header: Depth=1
 26104  	xor	eax, eax
 26105  	cmp	byte ptr [rcx + rdx], 0
 26106  	setne	al
 26107  	mov	dword ptr [r8 + 4*rdx], eax
 26108  	add	rdx, 1
 26109  	add	rdi, -1
 26110  	jne	.LBB4_1250
 26111  .LBB4_1251:
 26112  	cmp	rsi, 3
 26113  	jb	.LBB4_1655
 26114  .LBB4_1252:                             # =>This Inner Loop Header: Depth=1
 26115  	xor	eax, eax
 26116  	cmp	byte ptr [rcx + rdx], 0
 26117  	setne	al
 26118  	mov	dword ptr [r8 + 4*rdx], eax
 26119  	xor	eax, eax
 26120  	cmp	byte ptr [rcx + rdx + 1], 0
 26121  	setne	al
 26122  	mov	dword ptr [r8 + 4*rdx + 4], eax
 26123  	xor	eax, eax
 26124  	cmp	byte ptr [rcx + rdx + 2], 0
 26125  	setne	al
 26126  	mov	dword ptr [r8 + 4*rdx + 8], eax
 26127  	xor	eax, eax
 26128  	cmp	byte ptr [rcx + rdx + 3], 0
 26129  	setne	al
 26130  	mov	dword ptr [r8 + 4*rdx + 12], eax
 26131  	add	rdx, 4
 26132  	cmp	r10, rdx
 26133  	jne	.LBB4_1252
 26134  	jmp	.LBB4_1655
 26135  .LBB4_86:
 26136  	cmp	esi, 6
 26137  	jg	.LBB4_190
 26138  # %bb.87:
 26139  	cmp	esi, 3
 26140  	jle	.LBB4_245
 26141  # %bb.88:
 26142  	cmp	esi, 4
 26143  	je	.LBB4_357
 26144  # %bb.89:
 26145  	cmp	esi, 5
 26146  	je	.LBB4_360
 26147  # %bb.90:
 26148  	cmp	esi, 6
 26149  	jne	.LBB4_1655
 26150  # %bb.91:
 26151  	test	r9d, r9d
 26152  	jle	.LBB4_1655
 26153  # %bb.92:
 26154  	mov	r11d, r9d
 26155  	cmp	r9d, 8
 26156  	jb	.LBB4_93
 26157  # %bb.527:
 26158  	lea	rdx, [rcx + 4*r11]
 26159  	cmp	rdx, r8
 26160  	jbe	.LBB4_529
 26161  # %bb.528:
 26162  	lea	rdx, [r8 + 4*r11]
 26163  	cmp	rdx, rcx
 26164  	jbe	.LBB4_529
 26165  .LBB4_93:
 26166  	xor	edx, edx
 26167  .LBB4_1257:
 26168  	mov	rsi, rdx
 26169  	not	rsi
 26170  	test	r11b, 1
 26171  	je	.LBB4_1259
 26172  # %bb.1258:
 26173  	mov	r9d, dword ptr [rcx + 4*rdx]
 26174  	xor	r10d, r10d
 26175  	test	r9d, r9d
 26176  	setne	r10b
 26177  	neg	r10d
 26178  	test	r9d, r9d
 26179  	mov	edi, 1
 26180  	cmovle	edi, r10d
 26181  	mov	dword ptr [r8 + 4*rdx], edi
 26182  	or	rdx, 1
 26183  .LBB4_1259:
 26184  	add	rsi, r11
 26185  	je	.LBB4_1655
 26186  # %bb.1260:
 26187  	mov	esi, 1
 26188  .LBB4_1261:                             # =>This Inner Loop Header: Depth=1
 26189  	mov	edi, dword ptr [rcx + 4*rdx]
 26190  	xor	eax, eax
 26191  	test	edi, edi
 26192  	setne	al
 26193  	neg	eax
 26194  	test	edi, edi
 26195  	cmovg	eax, esi
 26196  	mov	dword ptr [r8 + 4*rdx], eax
 26197  	mov	eax, dword ptr [rcx + 4*rdx + 4]
 26198  	xor	edi, edi
 26199  	test	eax, eax
 26200  	setne	dil
 26201  	neg	edi
 26202  	test	eax, eax
 26203  	cmovg	edi, esi
 26204  	mov	dword ptr [r8 + 4*rdx + 4], edi
 26205  	add	rdx, 2
 26206  	cmp	r11, rdx
 26207  	jne	.LBB4_1261
 26208  	jmp	.LBB4_1655
 26209  .LBB4_94:
 26210  	cmp	esi, 8
 26211  	jle	.LBB4_250
 26212  # %bb.95:
 26213  	cmp	esi, 9
 26214  	je	.LBB4_363
 26215  # %bb.96:
 26216  	cmp	esi, 11
 26217  	je	.LBB4_366
 26218  # %bb.97:
 26219  	cmp	esi, 12
 26220  	jne	.LBB4_1655
 26221  # %bb.98:
 26222  	test	r9d, r9d
 26223  	jle	.LBB4_1655
 26224  # %bb.99:
 26225  	mov	edx, r9d
 26226  	lea	rsi, [rdx - 1]
 26227  	mov	eax, edx
 26228  	and	eax, 3
 26229  	cmp	rsi, 3
 26230  	jae	.LBB4_532
 26231  # %bb.100:
 26232  	xor	esi, esi
 26233  .LBB4_101:
 26234  	test	rax, rax
 26235  	je	.LBB4_1655
 26236  # %bb.102:
 26237  	lea	rdx, [r8 + 8*rsi]
 26238  	lea	rcx, [rcx + 4*rsi]
 26239  	xor	esi, esi
 26240  	movsd	xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero
 26241  	jmp	.LBB4_104
 26242  .LBB4_103:                              #   in Loop: Header=BB4_104 Depth=1
 26243  	movsd	qword ptr [rdx + 8*rsi], xmm1
 26244  	add	rsi, 1
 26245  	cmp	rax, rsi
 26246  	je	.LBB4_1655
 26247  .LBB4_104:                              # =>This Inner Loop Header: Depth=1
 26248  	cmp	dword ptr [rcx + 4*rsi], 0
 26249  	movapd	xmm1, xmm0
 26250  	jne	.LBB4_103
 26251  # %bb.105:                              #   in Loop: Header=BB4_104 Depth=1
 26252  	xorpd	xmm1, xmm1
 26253  	jmp	.LBB4_103
 26254  .LBB4_106:
 26255  	cmp	esi, 8
 26256  	jle	.LBB4_255
 26257  # %bb.107:
 26258  	cmp	esi, 9
 26259  	je	.LBB4_369
 26260  # %bb.108:
 26261  	cmp	esi, 11
 26262  	je	.LBB4_372
 26263  # %bb.109:
 26264  	cmp	esi, 12
 26265  	jne	.LBB4_1655
 26266  # %bb.110:
 26267  	test	r9d, r9d
 26268  	jle	.LBB4_1655
 26269  # %bb.111:
 26270  	mov	eax, r9d
 26271  	cmp	r9d, 4
 26272  	jb	.LBB4_112
 26273  # %bb.542:
 26274  	lea	rdx, [rcx + 8*rax]
 26275  	cmp	rdx, r8
 26276  	jbe	.LBB4_544
 26277  # %bb.543:
 26278  	lea	rdx, [r8 + 8*rax]
 26279  	cmp	rdx, rcx
 26280  	jbe	.LBB4_544
 26281  .LBB4_112:
 26282  	xor	edx, edx
 26283  .LBB4_1266:
 26284  	mov	rsi, rdx
 26285  	not	rsi
 26286  	test	al, 1
 26287  	je	.LBB4_1268
 26288  # %bb.1267:
 26289  	movsd	xmm0, qword ptr [rcx + 8*rdx]   # xmm0 = mem[0],zero
 26290  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 26291  	andpd	xmm1, xmm0
 26292  	movsd	xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero
 26293  	orpd	xmm2, xmm1
 26294  	xorpd	xmm1, xmm1
 26295  	cmpeqsd	xmm1, xmm0
 26296  	andnpd	xmm1, xmm2
 26297  	movlpd	qword ptr [r8 + 8*rdx], xmm1
 26298  	or	rdx, 1
 26299  .LBB4_1268:
 26300  	add	rsi, rax
 26301  	je	.LBB4_1655
 26302  # %bb.1269:
 26303  	movapd	xmm0, xmmword ptr [rip + .LCPI4_0] # xmm0 = [-0.0E+0,-0.0E+0]
 26304  	movsd	xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero
 26305  	xorpd	xmm2, xmm2
 26306  .LBB4_1270:                             # =>This Inner Loop Header: Depth=1
 26307  	movsd	xmm3, qword ptr [rcx + 8*rdx]   # xmm3 = mem[0],zero
 26308  	movapd	xmm4, xmm3
 26309  	andpd	xmm4, xmm0
 26310  	orpd	xmm4, xmm1
 26311  	cmpeqsd	xmm3, xmm2
 26312  	andnpd	xmm3, xmm4
 26313  	movlpd	qword ptr [r8 + 8*rdx], xmm3
 26314  	movsd	xmm3, qword ptr [rcx + 8*rdx + 8] # xmm3 = mem[0],zero
 26315  	movapd	xmm4, xmm3
 26316  	andpd	xmm4, xmm0
 26317  	orpd	xmm4, xmm1
 26318  	cmpeqsd	xmm3, xmm2
 26319  	andnpd	xmm3, xmm4
 26320  	movlpd	qword ptr [r8 + 8*rdx + 8], xmm3
 26321  	add	rdx, 2
 26322  	cmp	rax, rdx
 26323  	jne	.LBB4_1270
 26324  	jmp	.LBB4_1655
 26325  .LBB4_113:
 26326  	cmp	esi, 8
 26327  	jle	.LBB4_260
 26328  # %bb.114:
 26329  	cmp	esi, 9
 26330  	je	.LBB4_375
 26331  # %bb.115:
 26332  	cmp	esi, 11
 26333  	je	.LBB4_378
 26334  # %bb.116:
 26335  	cmp	esi, 12
 26336  	jne	.LBB4_1655
 26337  # %bb.117:
 26338  	test	r9d, r9d
 26339  	jle	.LBB4_1655
 26340  # %bb.118:
 26341  	mov	edx, r9d
 26342  	cmp	r9d, 1
 26343  	jne	.LBB4_547
 26344  # %bb.119:
 26345  	xor	eax, eax
 26346  .LBB4_120:
 26347  	test	dl, 1
 26348  	je	.LBB4_1655
 26349  # %bb.121:
 26350  	cmp	byte ptr [rcx + rax], 0
 26351  	jne	.LBB4_982
 26352  .LBB4_122:
 26353  	xorpd	xmm0, xmm0
 26354  	jmp	.LBB4_983
 26355  .LBB4_123:
 26356  	cmp	esi, 8
 26357  	jle	.LBB4_265
 26358  # %bb.124:
 26359  	cmp	esi, 9
 26360  	je	.LBB4_381
 26361  # %bb.125:
 26362  	cmp	esi, 11
 26363  	je	.LBB4_384
 26364  # %bb.126:
 26365  	cmp	esi, 12
 26366  	jne	.LBB4_1655
 26367  # %bb.127:
 26368  	test	r9d, r9d
 26369  	jle	.LBB4_1655
 26370  # %bb.128:
 26371  	mov	edx, r9d
 26372  	lea	rsi, [rdx - 1]
 26373  	mov	eax, edx
 26374  	and	eax, 3
 26375  	cmp	rsi, 3
 26376  	jae	.LBB4_557
 26377  # %bb.129:
 26378  	xor	esi, esi
 26379  .LBB4_130:
 26380  	test	rax, rax
 26381  	je	.LBB4_1655
 26382  # %bb.131:
 26383  	lea	rdx, [r8 + 8*rsi]
 26384  	lea	rcx, [rcx + 8*rsi]
 26385  	xor	esi, esi
 26386  	movsd	xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero
 26387  	jmp	.LBB4_133
 26388  .LBB4_132:                              #   in Loop: Header=BB4_133 Depth=1
 26389  	movsd	qword ptr [rdx + 8*rsi], xmm1
 26390  	add	rsi, 1
 26391  	cmp	rax, rsi
 26392  	je	.LBB4_1655
 26393  .LBB4_133:                              # =>This Inner Loop Header: Depth=1
 26394  	cmp	qword ptr [rcx + 8*rsi], 0
 26395  	movapd	xmm1, xmm0
 26396  	jne	.LBB4_132
 26397  # %bb.134:                              #   in Loop: Header=BB4_133 Depth=1
 26398  	xorpd	xmm1, xmm1
 26399  	jmp	.LBB4_132
 26400  .LBB4_135:
 26401  	cmp	esi, 8
 26402  	jle	.LBB4_270
 26403  # %bb.136:
 26404  	cmp	esi, 9
 26405  	je	.LBB4_392
 26406  # %bb.137:
 26407  	cmp	esi, 11
 26408  	je	.LBB4_395
 26409  # %bb.138:
 26410  	cmp	esi, 12
 26411  	jne	.LBB4_1655
 26412  # %bb.139:
 26413  	test	r9d, r9d
 26414  	jle	.LBB4_1655
 26415  # %bb.140:
 26416  	mov	edx, r9d
 26417  	lea	rsi, [rdx - 1]
 26418  	mov	eax, edx
 26419  	and	eax, 3
 26420  	cmp	rsi, 3
 26421  	jae	.LBB4_567
 26422  # %bb.141:
 26423  	xor	esi, esi
 26424  .LBB4_142:
 26425  	test	rax, rax
 26426  	je	.LBB4_1655
 26427  # %bb.143:
 26428  	lea	rdx, [r8 + 8*rsi]
 26429  	lea	rcx, [rcx + 2*rsi]
 26430  	xor	esi, esi
 26431  	movsd	xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero
 26432  	jmp	.LBB4_145
 26433  .LBB4_144:                              #   in Loop: Header=BB4_145 Depth=1
 26434  	movsd	qword ptr [rdx + 8*rsi], xmm1
 26435  	add	rsi, 1
 26436  	cmp	rax, rsi
 26437  	je	.LBB4_1655
 26438  .LBB4_145:                              # =>This Inner Loop Header: Depth=1
 26439  	cmp	word ptr [rcx + 2*rsi], 0
 26440  	movapd	xmm1, xmm0
 26441  	jne	.LBB4_144
 26442  # %bb.146:                              #   in Loop: Header=BB4_145 Depth=1
 26443  	xorpd	xmm1, xmm1
 26444  	jmp	.LBB4_144
 26445  .LBB4_147:
 26446  	cmp	esi, 8
 26447  	jle	.LBB4_275
 26448  # %bb.148:
 26449  	cmp	esi, 9
 26450  	je	.LBB4_398
 26451  # %bb.149:
 26452  	cmp	esi, 11
 26453  	je	.LBB4_401
 26454  # %bb.150:
 26455  	cmp	esi, 12
 26456  	jne	.LBB4_1655
 26457  # %bb.151:
 26458  	test	r9d, r9d
 26459  	jle	.LBB4_1655
 26460  # %bb.152:
 26461  	mov	edx, r9d
 26462  	cmp	r9d, 1
 26463  	jne	.LBB4_577
 26464  # %bb.153:
 26465  	xor	eax, eax
 26466  .LBB4_154:
 26467  	test	dl, 1
 26468  	je	.LBB4_1655
 26469  # %bb.155:
 26470  	cmp	word ptr [rcx + 2*rax], 0
 26471  	je	.LBB4_122
 26472  .LBB4_982:
 26473  	movsd	xmm0, qword ptr [rip + .LCPI4_13] # xmm0 = mem[0],zero
 26474  .LBB4_983:
 26475  	jle	.LBB4_985
 26476  # %bb.984:
 26477  	movsd	xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero
 26478  .LBB4_985:
 26479  	movsd	qword ptr [r8 + 8*rax], xmm0
 26480  	jmp	.LBB4_1655
 26481  .LBB4_157:
 26482  	cmp	esi, 8
 26483  	jle	.LBB4_280
 26484  # %bb.158:
 26485  	cmp	esi, 9
 26486  	je	.LBB4_404
 26487  # %bb.159:
 26488  	cmp	esi, 11
 26489  	je	.LBB4_407
 26490  # %bb.160:
 26491  	cmp	esi, 12
 26492  	jne	.LBB4_1655
 26493  # %bb.161:
 26494  	test	r9d, r9d
 26495  	jle	.LBB4_1655
 26496  # %bb.162:
 26497  	mov	edx, r9d
 26498  	cmp	r9d, 1
 26499  	jne	.LBB4_587
 26500  # %bb.163:
 26501  	xor	eax, eax
 26502  .LBB4_164:
 26503  	test	dl, 1
 26504  	je	.LBB4_1655
 26505  # %bb.165:
 26506  	cmp	qword ptr [rcx + 8*rax], 0
 26507  	je	.LBB4_122
 26508  	jmp	.LBB4_982
 26509  .LBB4_167:
 26510  	cmp	esi, 8
 26511  	jle	.LBB4_285
 26512  # %bb.168:
 26513  	cmp	esi, 9
 26514  	je	.LBB4_413
 26515  # %bb.169:
 26516  	cmp	esi, 11
 26517  	je	.LBB4_419
 26518  # %bb.170:
 26519  	cmp	esi, 12
 26520  	jne	.LBB4_1655
 26521  # %bb.171:
 26522  	test	r9d, r9d
 26523  	jle	.LBB4_1655
 26524  # %bb.172:
 26525  	mov	edx, r9d
 26526  	cmp	r9d, 1
 26527  	jne	.LBB4_597
 26528  # %bb.173:
 26529  	xor	eax, eax
 26530  .LBB4_174:
 26531  	test	dl, 1
 26532  	je	.LBB4_1655
 26533  # %bb.175:
 26534  	movss	xmm1, dword ptr [rcx + 4*rax]   # xmm1 = mem[0],zero,zero,zero
 26535  	xorps	xmm0, xmm0
 26536  	xorps	xmm2, xmm2
 26537  	ucomiss	xmm2, xmm1
 26538  	je	.LBB4_177
 26539  # %bb.176:
 26540  	movmskps	ecx, xmm1
 26541  	and	ecx, 1
 26542  	neg	ecx
 26543  	or	ecx, 1
 26544  	xorps	xmm0, xmm0
 26545  	cvtsi2ss	xmm0, ecx
 26546  	cvtss2sd	xmm0, xmm0
 26547  .LBB4_177:
 26548  	movsd	qword ptr [r8 + 8*rax], xmm0
 26549  	jmp	.LBB4_1655
 26550  .LBB4_178:
 26551  	cmp	esi, 8
 26552  	jle	.LBB4_293
 26553  # %bb.179:
 26554  	cmp	esi, 9
 26555  	je	.LBB4_422
 26556  # %bb.180:
 26557  	cmp	esi, 11
 26558  	je	.LBB4_425
 26559  # %bb.181:
 26560  	cmp	esi, 12
 26561  	jne	.LBB4_1655
 26562  # %bb.182:
 26563  	test	r9d, r9d
 26564  	jle	.LBB4_1655
 26565  # %bb.183:
 26566  	mov	edx, r9d
 26567  	lea	rsi, [rdx - 1]
 26568  	mov	eax, edx
 26569  	and	eax, 3
 26570  	cmp	rsi, 3
 26571  	jae	.LBB4_603
 26572  # %bb.184:
 26573  	xor	esi, esi
 26574  .LBB4_185:
 26575  	test	rax, rax
 26576  	je	.LBB4_1655
 26577  # %bb.186:
 26578  	lea	rdx, [r8 + 8*rsi]
 26579  	add	rcx, rsi
 26580  	xor	esi, esi
 26581  	movsd	xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero
 26582  	jmp	.LBB4_188
 26583  .LBB4_187:                              #   in Loop: Header=BB4_188 Depth=1
 26584  	movsd	qword ptr [rdx + 8*rsi], xmm1
 26585  	add	rsi, 1
 26586  	cmp	rax, rsi
 26587  	je	.LBB4_1655
 26588  .LBB4_188:                              # =>This Inner Loop Header: Depth=1
 26589  	cmp	byte ptr [rcx + rsi], 0
 26590  	movapd	xmm1, xmm0
 26591  	jne	.LBB4_187
 26592  # %bb.189:                              #   in Loop: Header=BB4_188 Depth=1
 26593  	xorpd	xmm1, xmm1
 26594  	jmp	.LBB4_187
 26595  .LBB4_190:
 26596  	cmp	esi, 8
 26597  	jle	.LBB4_298
 26598  # %bb.191:
 26599  	cmp	esi, 9
 26600  	je	.LBB4_428
 26601  # %bb.192:
 26602  	cmp	esi, 11
 26603  	je	.LBB4_431
 26604  # %bb.193:
 26605  	cmp	esi, 12
 26606  	jne	.LBB4_1655
 26607  # %bb.194:
 26608  	test	r9d, r9d
 26609  	jle	.LBB4_1655
 26610  # %bb.195:
 26611  	mov	edx, r9d
 26612  	cmp	r9d, 1
 26613  	jne	.LBB4_613
 26614  # %bb.196:
 26615  	xor	eax, eax
 26616  .LBB4_197:
 26617  	test	dl, 1
 26618  	je	.LBB4_1655
 26619  # %bb.198:
 26620  	cmp	dword ptr [rcx + 4*rax], 0
 26621  	je	.LBB4_122
 26622  	jmp	.LBB4_982
 26623  .LBB4_200:
 26624  	cmp	esi, 2
 26625  	je	.LBB4_434
 26626  # %bb.201:
 26627  	cmp	esi, 3
 26628  	jne	.LBB4_1655
 26629  # %bb.202:
 26630  	test	r9d, r9d
 26631  	jle	.LBB4_1655
 26632  # %bb.203:
 26633  	mov	eax, r9d
 26634  	cmp	r9d, 8
 26635  	jb	.LBB4_204
 26636  # %bb.623:
 26637  	lea	rdx, [rcx + 4*rax]
 26638  	cmp	rdx, r8
 26639  	jbe	.LBB4_625
 26640  # %bb.624:
 26641  	lea	rdx, [r8 + rax]
 26642  	cmp	rdx, rcx
 26643  	jbe	.LBB4_625
 26644  .LBB4_204:
 26645  	xor	edx, edx
 26646  .LBB4_1275:
 26647  	mov	rsi, rdx
 26648  	not	rsi
 26649  	add	rsi, rax
 26650  	mov	rdi, rax
 26651  	and	rdi, 3
 26652  	je	.LBB4_1277
 26653  .LBB4_1276:                             # =>This Inner Loop Header: Depth=1
 26654  	cmp	dword ptr [rcx + 4*rdx], 0
 26655  	setne	byte ptr [r8 + rdx]
 26656  	add	rdx, 1
 26657  	add	rdi, -1
 26658  	jne	.LBB4_1276
 26659  .LBB4_1277:
 26660  	cmp	rsi, 3
 26661  	jb	.LBB4_1655
 26662  .LBB4_1278:                             # =>This Inner Loop Header: Depth=1
 26663  	cmp	dword ptr [rcx + 4*rdx], 0
 26664  	setne	byte ptr [r8 + rdx]
 26665  	cmp	dword ptr [rcx + 4*rdx + 4], 0
 26666  	setne	byte ptr [r8 + rdx + 1]
 26667  	cmp	dword ptr [rcx + 4*rdx + 8], 0
 26668  	setne	byte ptr [r8 + rdx + 2]
 26669  	cmp	dword ptr [rcx + 4*rdx + 12], 0
 26670  	setne	byte ptr [r8 + rdx + 3]
 26671  	add	rdx, 4
 26672  	cmp	rax, rdx
 26673  	jne	.LBB4_1278
 26674  	jmp	.LBB4_1655
 26675  .LBB4_205:
 26676  	cmp	esi, 2
 26677  	je	.LBB4_437
 26678  # %bb.206:
 26679  	cmp	esi, 3
 26680  	jne	.LBB4_1655
 26681  # %bb.207:
 26682  	test	r9d, r9d
 26683  	jle	.LBB4_1655
 26684  # %bb.208:
 26685  	mov	eax, r9d
 26686  	cmp	r9d, 4
 26687  	jb	.LBB4_209
 26688  # %bb.628:
 26689  	lea	rdx, [rcx + 8*rax]
 26690  	cmp	rdx, r8
 26691  	jbe	.LBB4_630
 26692  # %bb.629:
 26693  	lea	rdx, [r8 + rax]
 26694  	cmp	rdx, rcx
 26695  	jbe	.LBB4_630
 26696  .LBB4_209:
 26697  	xor	edx, edx
 26698  .LBB4_1283:
 26699  	mov	rsi, rdx
 26700  	not	rsi
 26701  	test	al, 1
 26702  	je	.LBB4_1285
 26703  # %bb.1284:
 26704  	movsd	xmm0, qword ptr [rcx + 8*rdx]   # xmm0 = mem[0],zero
 26705  	xor	r9d, r9d
 26706  	pxor	xmm1, xmm1
 26707  	ucomisd	xmm1, xmm0
 26708  	andpd	xmm0, xmmword ptr [rip + .LCPI4_0]
 26709  	movsd	xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero
 26710  	orpd	xmm1, xmm0
 26711  	cvttsd2si	edi, xmm1
 26712  	cmove	edi, r9d
 26713  	mov	byte ptr [r8 + rdx], dil
 26714  	or	rdx, 1
 26715  .LBB4_1285:
 26716  	add	rsi, rax
 26717  	je	.LBB4_1655
 26718  # %bb.1286:
 26719  	xor	esi, esi
 26720  	xorpd	xmm0, xmm0
 26721  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 26722  	movsd	xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero
 26723  .LBB4_1287:                             # =>This Inner Loop Header: Depth=1
 26724  	movsd	xmm3, qword ptr [rcx + 8*rdx]   # xmm3 = mem[0],zero
 26725  	ucomisd	xmm0, xmm3
 26726  	andpd	xmm3, xmm1
 26727  	orpd	xmm3, xmm2
 26728  	cvttsd2si	edi, xmm3
 26729  	cmove	edi, esi
 26730  	mov	byte ptr [r8 + rdx], dil
 26731  	movsd	xmm3, qword ptr [rcx + 8*rdx + 8] # xmm3 = mem[0],zero
 26732  	ucomisd	xmm0, xmm3
 26733  	andpd	xmm3, xmm1
 26734  	orpd	xmm3, xmm2
 26735  	cvttsd2si	edi, xmm3
 26736  	cmove	edi, esi
 26737  	mov	byte ptr [r8 + rdx + 1], dil
 26738  	add	rdx, 2
 26739  	cmp	rax, rdx
 26740  	jne	.LBB4_1287
 26741  	jmp	.LBB4_1655
 26742  .LBB4_210:
 26743  	cmp	esi, 2
 26744  	je	.LBB4_440
 26745  # %bb.211:
 26746  	cmp	esi, 3
 26747  	jne	.LBB4_1655
 26748  # %bb.212:
 26749  	test	r9d, r9d
 26750  	jle	.LBB4_1655
 26751  # %bb.213:
 26752  	mov	r10d, r9d
 26753  	cmp	r9d, 32
 26754  	jb	.LBB4_214
 26755  # %bb.633:
 26756  	lea	rdx, [rcx + r10]
 26757  	cmp	rdx, r8
 26758  	jbe	.LBB4_635
 26759  # %bb.634:
 26760  	lea	rdx, [r8 + r10]
 26761  	cmp	rdx, rcx
 26762  	jbe	.LBB4_635
 26763  .LBB4_214:
 26764  	xor	esi, esi
 26765  .LBB4_1292:
 26766  	mov	rax, rsi
 26767  	not	rax
 26768  	test	r10b, 1
 26769  	je	.LBB4_1294
 26770  # %bb.1293:
 26771  	mov	dil, byte ptr [rcx + rsi]
 26772  	test	dil, dil
 26773  	setne	r9b
 26774  	neg	r9b
 26775  	test	dil, dil
 26776  	movzx	r9d, r9b
 26777  	mov	edi, 1
 26778  	cmovle	edi, r9d
 26779  	mov	byte ptr [r8 + rsi], dil
 26780  	or	rsi, 1
 26781  .LBB4_1294:
 26782  	add	rax, r10
 26783  	je	.LBB4_1655
 26784  # %bb.1295:
 26785  	mov	edi, 1
 26786  .LBB4_1296:                             # =>This Inner Loop Header: Depth=1
 26787  	movzx	eax, byte ptr [rcx + rsi]
 26788  	test	al, al
 26789  	setne	dl
 26790  	neg	dl
 26791  	test	al, al
 26792  	movzx	eax, dl
 26793  	cmovg	eax, edi
 26794  	mov	byte ptr [r8 + rsi], al
 26795  	movzx	eax, byte ptr [rcx + rsi + 1]
 26796  	test	al, al
 26797  	setne	dl
 26798  	neg	dl
 26799  	test	al, al
 26800  	movzx	eax, dl
 26801  	cmovg	eax, edi
 26802  	mov	byte ptr [r8 + rsi + 1], al
 26803  	add	rsi, 2
 26804  	cmp	r10, rsi
 26805  	jne	.LBB4_1296
 26806  	jmp	.LBB4_1655
 26807  .LBB4_215:
 26808  	cmp	esi, 2
 26809  	je	.LBB4_443
 26810  # %bb.216:
 26811  	cmp	esi, 3
 26812  	jne	.LBB4_1655
 26813  # %bb.217:
 26814  	test	r9d, r9d
 26815  	jle	.LBB4_1655
 26816  # %bb.218:
 26817  	mov	eax, r9d
 26818  	cmp	r9d, 4
 26819  	jb	.LBB4_219
 26820  # %bb.638:
 26821  	lea	rdx, [rcx + 8*rax]
 26822  	cmp	rdx, r8
 26823  	jbe	.LBB4_640
 26824  # %bb.639:
 26825  	lea	rdx, [r8 + rax]
 26826  	cmp	rdx, rcx
 26827  	jbe	.LBB4_640
 26828  .LBB4_219:
 26829  	xor	edx, edx
 26830  .LBB4_1301:
 26831  	mov	rsi, rdx
 26832  	not	rsi
 26833  	add	rsi, rax
 26834  	mov	rdi, rax
 26835  	and	rdi, 3
 26836  	je	.LBB4_1303
 26837  .LBB4_1302:                             # =>This Inner Loop Header: Depth=1
 26838  	cmp	qword ptr [rcx + 8*rdx], 0
 26839  	setne	byte ptr [r8 + rdx]
 26840  	add	rdx, 1
 26841  	add	rdi, -1
 26842  	jne	.LBB4_1302
 26843  .LBB4_1303:
 26844  	cmp	rsi, 3
 26845  	jb	.LBB4_1655
 26846  .LBB4_1304:                             # =>This Inner Loop Header: Depth=1
 26847  	cmp	qword ptr [rcx + 8*rdx], 0
 26848  	setne	byte ptr [r8 + rdx]
 26849  	cmp	qword ptr [rcx + 8*rdx + 8], 0
 26850  	setne	byte ptr [r8 + rdx + 1]
 26851  	cmp	qword ptr [rcx + 8*rdx + 16], 0
 26852  	setne	byte ptr [r8 + rdx + 2]
 26853  	cmp	qword ptr [rcx + 8*rdx + 24], 0
 26854  	setne	byte ptr [r8 + rdx + 3]
 26855  	add	rdx, 4
 26856  	cmp	rax, rdx
 26857  	jne	.LBB4_1304
 26858  	jmp	.LBB4_1655
 26859  .LBB4_220:
 26860  	cmp	esi, 2
 26861  	je	.LBB4_446
 26862  # %bb.221:
 26863  	cmp	esi, 3
 26864  	jne	.LBB4_1655
 26865  # %bb.222:
 26866  	test	r9d, r9d
 26867  	jle	.LBB4_1655
 26868  # %bb.223:
 26869  	mov	eax, r9d
 26870  	cmp	r9d, 16
 26871  	jb	.LBB4_224
 26872  # %bb.643:
 26873  	lea	rdx, [rcx + 2*rax]
 26874  	cmp	rdx, r8
 26875  	jbe	.LBB4_645
 26876  # %bb.644:
 26877  	lea	rdx, [r8 + rax]
 26878  	cmp	rdx, rcx
 26879  	jbe	.LBB4_645
 26880  .LBB4_224:
 26881  	xor	edx, edx
 26882  .LBB4_1309:
 26883  	mov	rsi, rdx
 26884  	not	rsi
 26885  	add	rsi, rax
 26886  	mov	rdi, rax
 26887  	and	rdi, 3
 26888  	je	.LBB4_1311
 26889  .LBB4_1310:                             # =>This Inner Loop Header: Depth=1
 26890  	cmp	word ptr [rcx + 2*rdx], 0
 26891  	setne	byte ptr [r8 + rdx]
 26892  	add	rdx, 1
 26893  	add	rdi, -1
 26894  	jne	.LBB4_1310
 26895  .LBB4_1311:
 26896  	cmp	rsi, 3
 26897  	jb	.LBB4_1655
 26898  .LBB4_1312:                             # =>This Inner Loop Header: Depth=1
 26899  	cmp	word ptr [rcx + 2*rdx], 0
 26900  	setne	byte ptr [r8 + rdx]
 26901  	cmp	word ptr [rcx + 2*rdx + 2], 0
 26902  	setne	byte ptr [r8 + rdx + 1]
 26903  	cmp	word ptr [rcx + 2*rdx + 4], 0
 26904  	setne	byte ptr [r8 + rdx + 2]
 26905  	cmp	word ptr [rcx + 2*rdx + 6], 0
 26906  	setne	byte ptr [r8 + rdx + 3]
 26907  	add	rdx, 4
 26908  	cmp	rax, rdx
 26909  	jne	.LBB4_1312
 26910  	jmp	.LBB4_1655
 26911  .LBB4_225:
 26912  	cmp	esi, 2
 26913  	je	.LBB4_449
 26914  # %bb.226:
 26915  	cmp	esi, 3
 26916  	jne	.LBB4_1655
 26917  # %bb.227:
 26918  	test	r9d, r9d
 26919  	jle	.LBB4_1655
 26920  # %bb.228:
 26921  	mov	r10d, r9d
 26922  	cmp	r9d, 16
 26923  	jb	.LBB4_229
 26924  # %bb.648:
 26925  	lea	rdx, [rcx + 2*r10]
 26926  	cmp	rdx, r8
 26927  	jbe	.LBB4_650
 26928  # %bb.649:
 26929  	lea	rdx, [r8 + r10]
 26930  	cmp	rdx, rcx
 26931  	jbe	.LBB4_650
 26932  .LBB4_229:
 26933  	xor	esi, esi
 26934  .LBB4_1317:
 26935  	mov	rax, rsi
 26936  	not	rax
 26937  	test	r10b, 1
 26938  	je	.LBB4_1319
 26939  # %bb.1318:
 26940  	movzx	edi, word ptr [rcx + 2*rsi]
 26941  	test	di, di
 26942  	setne	r9b
 26943  	neg	r9b
 26944  	test	di, di
 26945  	movzx	r9d, r9b
 26946  	mov	edi, 1
 26947  	cmovle	edi, r9d
 26948  	mov	byte ptr [r8 + rsi], dil
 26949  	or	rsi, 1
 26950  .LBB4_1319:
 26951  	add	rax, r10
 26952  	je	.LBB4_1655
 26953  # %bb.1320:
 26954  	mov	r9d, 1
 26955  .LBB4_1321:                             # =>This Inner Loop Header: Depth=1
 26956  	movzx	edi, word ptr [rcx + 2*rsi]
 26957  	test	di, di
 26958  	setne	al
 26959  	neg	al
 26960  	test	di, di
 26961  	movzx	eax, al
 26962  	cmovg	eax, r9d
 26963  	mov	byte ptr [r8 + rsi], al
 26964  	movzx	eax, word ptr [rcx + 2*rsi + 2]
 26965  	test	ax, ax
 26966  	setne	dl
 26967  	neg	dl
 26968  	test	ax, ax
 26969  	movzx	eax, dl
 26970  	cmovg	eax, r9d
 26971  	mov	byte ptr [r8 + rsi + 1], al
 26972  	add	rsi, 2
 26973  	cmp	r10, rsi
 26974  	jne	.LBB4_1321
 26975  	jmp	.LBB4_1655
 26976  .LBB4_230:	# Handler entered by a jump from outside this chunk: for esi == 3, reads 64-bit signed values at rcx and writes one sign byte (-1/0/1) per element at r8
 26977  	cmp	esi, 2
 26978  	je	.LBB4_452	# esi == 2 is handled at a label outside this chunk
 26979  # %bb.231:
 26980  	cmp	esi, 3
 26981  	jne	.LBB4_1655	# any other esi value: no work in this handler (.LBB4_1655 is outside this chunk; presumably the common exit)
 26982  # %bb.232:
 26983  	test	r9d, r9d
 26984  	jle	.LBB4_1655	# element count in r9d is <= 0: nothing to process
 26985  # %bb.233:
 26986  	mov	r10d, r9d	# r10 = element count, zero-extended to 64 bits
 26987  	cmp	r9d, 4
 26988  	jb	.LBB4_234	# fewer than 4 elements: use the scalar code below
 26989  # %bb.653:
 26990  	lea	rdx, [rcx + 8*r10]	# rdx = one past the end of the input (8 bytes per element)
 26991  	cmp	rdx, r8
 26992  	jbe	.LBB4_655	# input ends at or before the output start, so they cannot overlap (.LBB4_655 is outside this chunk; presumably the SIMD path)
 26993  # %bb.654:
 26994  	lea	rdx, [r8 + r10]	# rdx = one past the end of the output (1 byte per element)
 26995  	cmp	rdx, rcx
 26996  	jbe	.LBB4_655	# output ends at or before the input start: no overlap either
 26997  .LBB4_234:
 26998  	xor	esi, esi	# start index = 0
 26999  .LBB4_1326:	# scalar code; rsi = first unprocessed index (possibly also entered from outside this chunk)
 27000  	mov	rdx, rsi
 27001  	not	rdx	# rdx = -rsi - 1
 27002  	test	r10b, 1
 27003  	je	.LBB4_1328	# even count: no single element to peel off
 27004  # %bb.1327:
 27005  	mov	rdi, qword ptr [rcx + 8*rsi]
 27006  	test	rdi, rdi
 27007  	setne	al
 27008  	neg	al	# al = 0xFF if the value is nonzero, else 0
 27009  	test	rdi, rdi
 27010  	movzx	eax, al
 27011  	mov	edi, 1
 27012  	cmovle	edi, eax	# value <= 0 (signed): keep 0 or 0xFF; otherwise the result stays 1
 27013  	mov	byte ptr [r8 + rsi], dil
 27014  	or	rsi, 1	# sets bit 0 of the index; equals index + 1 when rsi was even (e.g. the 0 set above)
 27015  .LBB4_1328:
 27016  	add	rdx, r10	# rdx = count - start index - 1
 27017  	je	.LBB4_1655	# zero: exactly one element was left and it has been handled
 27018  # %bb.1329:
 27019  	mov	edi, 1	# constant +1 selected by cmovg in the loop
 27020  .LBB4_1330:                             # =>This Inner Loop Header: Depth=1
 27021  	mov	rax, qword ptr [rcx + 8*rsi]
 27022  	test	rax, rax
 27023  	setne	dl
 27024  	neg	dl	# dl = 0xFF if nonzero, else 0
 27025  	test	rax, rax
 27026  	movzx	eax, dl
 27027  	cmovg	eax, edi	# value > 0 (signed): result = 1
 27028  	mov	byte ptr [r8 + rsi], al
 27029  	mov	rax, qword ptr [rcx + 8*rsi + 8]	# second element of this iteration
 27030  	test	rax, rax
 27031  	setne	dl
 27032  	neg	dl
 27033  	test	rax, rax
 27034  	movzx	eax, dl
 27035  	cmovg	eax, edi
 27036  	mov	byte ptr [r8 + rsi + 1], al
 27037  	add	rsi, 2	# two elements per iteration
 27038  	cmp	r10, rsi
 27039  	jne	.LBB4_1330
 27040  	jmp	.LBB4_1655
 27041  .LBB4_235:	# Handler entered by a jump from outside this chunk: for esi == 3, reads 32-bit floats at rcx and writes one byte per element at r8 (+1 or -1 from the sign bit, 0 when the value compares equal to 0.0)
 27042  	cmp	esi, 2
 27043  	je	.LBB4_455	# esi == 2 is handled at a label outside this chunk
 27044  # %bb.236:
 27045  	cmp	esi, 3
 27046  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27047  # %bb.237:
 27048  	test	r9d, r9d
 27049  	jle	.LBB4_1655	# element count in r9d is <= 0
 27050  # %bb.238:
 27051  	mov	r10d, r9d	# r10 = element count, zero-extended
 27052  	cmp	r9d, 8
 27053  	jb	.LBB4_239	# fewer than 8 elements: scalar code only
 27054  # %bb.658:
 27055  	lea	rdx, [rcx + 4*r10]	# rdx = one past the end of the input (4 bytes per element)
 27056  	cmp	rdx, r8
 27057  	jbe	.LBB4_660	# input ends at or before the output start: no overlap (.LBB4_660 is outside this chunk; presumably the SIMD path)
 27058  # %bb.659:
 27059  	lea	rdx, [r8 + r10]	# rdx = one past the end of the output (1 byte per element)
 27060  	cmp	rdx, rcx
 27061  	jbe	.LBB4_660	# output ends at or before the input start: no overlap either
 27062  .LBB4_239:
 27063  	xor	edx, edx	# start index = 0
 27064  .LBB4_1335:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 27065  	mov	rsi, rdx
 27066  	not	rsi	# rsi = -rdx - 1
 27067  	test	r10b, 1
 27068  	je	.LBB4_1337	# even count: no single element to peel off
 27069  # %bb.1336:
 27070  	movd	xmm0, dword ptr [rcx + 4*rdx]   # xmm0 = mem[0],zero,zero,zero
 27071  	movd	edi, xmm0	# raw bits of the float, for the sign test
 27072  	test	edi, edi
 27073  	setns	al	# al = 1 when the sign bit is clear
 27074  	add	al, al
 27075  	add	al, -1	# al = 2*al - 1: +1 for sign bit clear, -1 for sign bit set
 27076  	xor	edi, edi
 27077  	pxor	xmm1, xmm1
 27078  	ucomiss	xmm1, xmm0	# compare 0.0 with the value
 27079  	movzx	eax, al
 27080  	cmove	eax, edi	# ZF set (equal to 0.0; ucomiss also sets ZF for unordered/NaN): result = 0
 27081  	mov	byte ptr [r8 + rdx], al
 27082  	or	rdx, 1	# sets bit 0 of the index; equals index + 1 when rdx was even
 27083  .LBB4_1337:
 27084  	add	rsi, r10	# rsi = count - start index - 1
 27085  	je	.LBB4_1655	# zero: the single element handled above was the last
 27086  # %bb.1338:
 27087  	xor	esi, esi	# esi = 0, the value selected by cmove in the loop
 27088  	xorps	xmm0, xmm0	# xmm0 = 0.0 for the comparisons
 27089  .LBB4_1339:                             # =>This Inner Loop Header: Depth=1
 27090  	movd	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 27091  	movd	eax, xmm1
 27092  	test	eax, eax
 27093  	setns	al
 27094  	add	al, al
 27095  	add	al, -1	# al = +1 or -1 from the sign bit
 27096  	ucomiss	xmm0, xmm1
 27097  	movzx	eax, al
 27098  	cmove	eax, esi	# ZF set by the compare: result = 0
 27099  	mov	byte ptr [r8 + rdx], al
 27100  	movd	xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero
 27101  	movd	eax, xmm1
 27102  	test	eax, eax
 27103  	setns	al
 27104  	add	al, al
 27105  	add	al, -1
 27106  	ucomiss	xmm0, xmm1
 27107  	movzx	eax, al
 27108  	cmove	eax, esi
 27109  	mov	byte ptr [r8 + rdx + 1], al
 27110  	add	rdx, 2	# two elements per iteration
 27111  	cmp	r10, rdx
 27112  	jne	.LBB4_1339
 27113  	jmp	.LBB4_1655
 27114  .LBB4_240:	# Handler entered by a jump from outside this chunk: for esi == 3, writes to each byte at r8 a 1 if the matching byte at rcx is nonzero, else 0
 27115  	cmp	esi, 2
 27116  	je	.LBB4_458	# esi == 2 is handled at a label outside this chunk
 27117  # %bb.241:
 27118  	cmp	esi, 3
 27119  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27120  # %bb.242:
 27121  	test	r9d, r9d
 27122  	jle	.LBB4_1655	# element count in r9d is <= 0
 27123  # %bb.243:
 27124  	mov	eax, r9d	# rax = element count, zero-extended
 27125  	cmp	r9d, 32
 27126  	jb	.LBB4_244	# fewer than 32 elements: scalar code only
 27127  # %bb.663:
 27128  	lea	rdx, [rcx + rax]	# rdx = one past the end of the input (1 byte per element)
 27129  	cmp	rdx, r8
 27130  	jbe	.LBB4_665	# input ends at or before the output start: no overlap (.LBB4_665 is outside this chunk; presumably the SIMD path)
 27131  # %bb.664:
 27132  	lea	rdx, [r8 + rax]	# rdx = one past the end of the output
 27133  	cmp	rdx, rcx
 27134  	jbe	.LBB4_665	# output ends at or before the input start: no overlap either
 27135  .LBB4_244:
 27136  	xor	edx, edx	# start index = 0
 27137  .LBB4_1344:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 27138  	mov	rsi, rdx
 27139  	not	rsi
 27140  	add	rsi, rax	# rsi = count - start index - 1
 27141  	mov	rdi, rax
 27142  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 27143  	je	.LBB4_1346
 27144  .LBB4_1345:                             # =>This Inner Loop Header: Depth=1
 27145  	cmp	byte ptr [rcx + rdx], 0
 27146  	setne	byte ptr [r8 + rdx]	# output byte = 1 if the input byte is nonzero, else 0
 27147  	add	rdx, 1
 27148  	add	rdi, -1
 27149  	jne	.LBB4_1345
 27150  .LBB4_1346:
 27151  	cmp	rsi, 3
 27152  	jb	.LBB4_1655	# fewer than 4 elements were left at entry: nothing remains for the 4-way loop
 27153  .LBB4_1347:                             # =>This Inner Loop Header: Depth=1
 27154  	cmp	byte ptr [rcx + rdx], 0
 27155  	setne	byte ptr [r8 + rdx]
 27156  	cmp	byte ptr [rcx + rdx + 1], 0
 27157  	setne	byte ptr [r8 + rdx + 1]
 27158  	cmp	byte ptr [rcx + rdx + 2], 0
 27159  	setne	byte ptr [r8 + rdx + 2]
 27160  	cmp	byte ptr [rcx + rdx + 3], 0
 27161  	setne	byte ptr [r8 + rdx + 3]
 27162  	add	rdx, 4	# four elements per iteration
 27163  	cmp	rax, rdx
 27164  	jne	.LBB4_1347
 27165  	jmp	.LBB4_1655
 27166  .LBB4_245:	# Handler entered by a jump from outside this chunk: for esi == 3, reads 32-bit signed values at rcx and writes one sign byte (-1/0/1) per element at r8
 27167  	cmp	esi, 2
 27168  	je	.LBB4_461	# esi == 2 is handled at a label outside this chunk
 27169  # %bb.246:
 27170  	cmp	esi, 3
 27171  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27172  # %bb.247:
 27173  	test	r9d, r9d
 27174  	jle	.LBB4_1655	# element count in r9d is <= 0
 27175  # %bb.248:
 27176  	mov	r10d, r9d	# r10 = element count, zero-extended
 27177  	cmp	r9d, 8
 27178  	jb	.LBB4_249	# fewer than 8 elements: scalar code only
 27179  # %bb.668:
 27180  	lea	rdx, [rcx + 4*r10]	# rdx = one past the end of the input (4 bytes per element)
 27181  	cmp	rdx, r8
 27182  	jbe	.LBB4_670	# input ends at or before the output start: no overlap (.LBB4_670 is outside this chunk; presumably the SIMD path)
 27183  # %bb.669:
 27184  	lea	rdx, [r8 + r10]	# rdx = one past the end of the output (1 byte per element)
 27185  	cmp	rdx, rcx
 27186  	jbe	.LBB4_670	# output ends at or before the input start: no overlap either
 27187  .LBB4_249:
 27188  	xor	esi, esi	# start index = 0
 27189  .LBB4_1352:	# scalar code; rsi = first unprocessed index (possibly also entered from outside this chunk)
 27190  	mov	rax, rsi
 27191  	not	rax	# rax = -rsi - 1
 27192  	test	r10b, 1
 27193  	je	.LBB4_1354	# even count: no single element to peel off
 27194  # %bb.1353:
 27195  	mov	edi, dword ptr [rcx + 4*rsi]
 27196  	test	edi, edi
 27197  	setne	r9b
 27198  	neg	r9b	# r9b = 0xFF if the value is nonzero, else 0 (r9 is free: the count now lives in r10)
 27199  	test	edi, edi
 27200  	movzx	r9d, r9b
 27201  	mov	edi, 1
 27202  	cmovle	edi, r9d	# value <= 0 (signed): keep 0 or 0xFF; otherwise the result stays 1
 27203  	mov	byte ptr [r8 + rsi], dil
 27204  	or	rsi, 1	# sets bit 0 of the index; equals index + 1 when rsi was even
 27205  .LBB4_1354:
 27206  	add	rax, r10	# rax = count - start index - 1
 27207  	je	.LBB4_1655	# zero: the single element handled above was the last
 27208  # %bb.1355:
 27209  	mov	r9d, 1	# constant +1 selected by cmovg in the loop
 27210  .LBB4_1356:                             # =>This Inner Loop Header: Depth=1
 27211  	mov	edi, dword ptr [rcx + 4*rsi]
 27212  	test	edi, edi
 27213  	setne	al
 27214  	neg	al	# al = 0xFF if nonzero, else 0
 27215  	test	edi, edi
 27216  	movzx	eax, al
 27217  	cmovg	eax, r9d	# value > 0 (signed): result = 1
 27218  	mov	byte ptr [r8 + rsi], al
 27219  	mov	eax, dword ptr [rcx + 4*rsi + 4]	# second element of this iteration
 27220  	test	eax, eax
 27221  	setne	dl
 27222  	neg	dl
 27223  	test	eax, eax
 27224  	movzx	eax, dl
 27225  	cmovg	eax, r9d
 27226  	mov	byte ptr [r8 + rsi + 1], al
 27227  	add	rsi, 2	# two elements per iteration
 27228  	cmp	r10, rsi
 27229  	jne	.LBB4_1356
 27230  	jmp	.LBB4_1655
 27231  .LBB4_250:	# Handler entered by a jump from outside this chunk; only its dispatch and setup are here, the loops live at labels outside this chunk
 27232  	cmp	esi, 7
 27233  	je	.LBB4_464	# esi == 7 is handled at a label outside this chunk
 27234  # %bb.251:
 27235  	cmp	esi, 8
 27236  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27237  # %bb.252:
 27238  	test	r9d, r9d
 27239  	jle	.LBB4_1655	# element count in r9d is <= 0
 27240  # %bb.253:
 27241  	mov	eax, r9d	# rax = element count, zero-extended
 27242  	cmp	r9d, 4
 27243  	jae	.LBB4_673	# 4 or more elements: continue at .LBB4_673 (outside this chunk)
 27244  # %bb.254:
 27245  	xor	edx, edx	# start index = 0
 27246  	jmp	.LBB4_1003	# fewer than 4 elements: continue at .LBB4_1003 (outside this chunk; presumably the scalar loop)
 27247  .LBB4_255:	# Handler entered by a jump from outside this chunk; only its dispatch and setup are here, the loops live at labels outside this chunk
 27248  	cmp	esi, 7
 27249  	je	.LBB4_467	# esi == 7 is handled at a label outside this chunk
 27250  # %bb.256:
 27251  	cmp	esi, 8
 27252  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27253  # %bb.257:
 27254  	test	r9d, r9d
 27255  	jle	.LBB4_1655	# element count in r9d is <= 0
 27256  # %bb.258:
 27257  	mov	r10d, r9d	# r10 = element count, zero-extended
 27258  	movabs	r11, -9223372036854775808	# r11 = 0x8000000000000000 (only bit 63 set); its use is outside this chunk, presumably a sign-bit mask
 27259  	cmp	r9d, 1
 27260  	jne	.LBB4_676	# more than one element: continue at .LBB4_676 (outside this chunk)
 27261  # %bb.259:
 27262  	xor	esi, esi	# start index = 0
 27263  	jmp	.LBB4_1008	# exactly one element: continue at .LBB4_1008 (outside this chunk)
 27264  .LBB4_260:	# Handler entered by a jump from outside this chunk: for esi == 8, reads signed bytes at rcx and writes a 64-bit sign value (-1/0/1) per element at r8
 27265  	cmp	esi, 7
 27266  	je	.LBB4_470	# esi == 7 is handled at a label outside this chunk
 27267  # %bb.261:
 27268  	cmp	esi, 8
 27269  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27270  # %bb.262:
 27271  	test	r9d, r9d
 27272  	jle	.LBB4_1655	# element count in r9d is <= 0
 27273  # %bb.263:
 27274  	mov	r10d, r9d	# r10 = element count, zero-extended
 27275  	cmp	r9d, 4
 27276  	jb	.LBB4_264	# fewer than 4 elements: scalar code only
 27277  # %bb.679:
 27278  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input (1 byte per element)
 27279  	cmp	rdx, r8
 27280  	jbe	.LBB4_681	# input ends at or before the output start: no overlap (.LBB4_681 is outside this chunk; presumably the SIMD path)
 27281  # %bb.680:
 27282  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output (8 bytes per element)
 27283  	cmp	rdx, rcx
 27284  	jbe	.LBB4_681	# output ends at or before the input start: no overlap either
 27285  .LBB4_264:
 27286  	xor	edx, edx	# start index = 0
 27287  .LBB4_1361:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 27288  	mov	rsi, rdx
 27289  	not	rsi	# rsi = -rdx - 1
 27290  	test	r10b, 1
 27291  	je	.LBB4_1363	# even count: no single element to peel off
 27292  # %bb.1362:
 27293  	mov	al, byte ptr [rcx + rdx]
 27294  	xor	edi, edi
 27295  	test	al, al
 27296  	setne	dil
 27297  	neg	rdi	# rdi = -1 (all ones) if the byte is nonzero, else 0
 27298  	test	al, al
 27299  	mov	eax, 1
 27300  	cmovle	rax, rdi	# byte <= 0 (signed): keep 0 or -1; otherwise the result stays 1
 27301  	mov	qword ptr [r8 + 8*rdx], rax
 27302  	or	rdx, 1	# sets bit 0 of the index; equals index + 1 when rdx was even
 27303  .LBB4_1363:
 27304  	add	rsi, r10	# rsi = count - start index - 1
 27305  	je	.LBB4_1655	# zero: the single element handled above was the last
 27306  # %bb.1364:
 27307  	mov	esi, 1	# constant +1 selected by cmovg in the loop
 27308  .LBB4_1365:                             # =>This Inner Loop Header: Depth=1
 27309  	movzx	eax, byte ptr [rcx + rdx]
 27310  	xor	edi, edi
 27311  	test	al, al
 27312  	setne	dil
 27313  	neg	rdi	# rdi = -1 if nonzero, else 0
 27314  	test	al, al
 27315  	cmovg	rdi, rsi	# byte > 0 (signed 8-bit test): result = 1
 27316  	mov	qword ptr [r8 + 8*rdx], rdi
 27317  	movzx	eax, byte ptr [rcx + rdx + 1]	# second element of this iteration
 27318  	xor	edi, edi
 27319  	test	al, al
 27320  	setne	dil
 27321  	neg	rdi
 27322  	test	al, al
 27323  	cmovg	rdi, rsi
 27324  	mov	qword ptr [r8 + 8*rdx + 8], rdi
 27325  	add	rdx, 2	# two elements per iteration
 27326  	cmp	r10, rdx
 27327  	jne	.LBB4_1365
 27328  	jmp	.LBB4_1655
 27329  .LBB4_265:	# Handler entered by a jump from outside this chunk: for esi == 8, writes to each 64-bit slot at r8 a 1 if the matching 64-bit value at rcx is nonzero, else 0
 27330  	cmp	esi, 7
 27331  	je	.LBB4_473	# esi == 7 is handled at a label outside this chunk
 27332  # %bb.266:
 27333  	cmp	esi, 8
 27334  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27335  # %bb.267:
 27336  	test	r9d, r9d
 27337  	jle	.LBB4_1655	# element count in r9d is <= 0
 27338  # %bb.268:
 27339  	mov	r10d, r9d	# r10 = element count, zero-extended
 27340  	cmp	r9d, 4
 27341  	jb	.LBB4_269	# fewer than 4 elements: scalar code only
 27342  # %bb.684:
 27343  	lea	rdx, [rcx + 8*r10]	# rdx = one past the end of the input (8 bytes per element)
 27344  	cmp	rdx, r8
 27345  	jbe	.LBB4_686	# input ends at or before the output start: no overlap (.LBB4_686 is outside this chunk; presumably the SIMD path)
 27346  # %bb.685:
 27347  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output (8 bytes per element)
 27348  	cmp	rdx, rcx
 27349  	jbe	.LBB4_686	# output ends at or before the input start: no overlap either
 27350  .LBB4_269:
 27351  	xor	edx, edx	# start index = 0
 27352  .LBB4_1370:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 27353  	mov	rsi, rdx
 27354  	not	rsi
 27355  	add	rsi, r10	# rsi = count - start index - 1
 27356  	mov	rdi, r10
 27357  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 27358  	je	.LBB4_1372
 27359  .LBB4_1371:                             # =>This Inner Loop Header: Depth=1
 27360  	xor	eax, eax	# clear rax so setne yields a clean 0/1 in 64 bits
 27361  	cmp	qword ptr [rcx + 8*rdx], 0
 27362  	setne	al
 27363  	mov	qword ptr [r8 + 8*rdx], rax
 27364  	add	rdx, 1
 27365  	add	rdi, -1
 27366  	jne	.LBB4_1371
 27367  .LBB4_1372:
 27368  	cmp	rsi, 3
 27369  	jb	.LBB4_1655	# fewer than 4 elements were left at entry: nothing remains for the 4-way loop
 27370  .LBB4_1373:                             # =>This Inner Loop Header: Depth=1
 27371  	xor	eax, eax
 27372  	cmp	qword ptr [rcx + 8*rdx], 0
 27373  	setne	al
 27374  	mov	qword ptr [r8 + 8*rdx], rax
 27375  	xor	eax, eax
 27376  	cmp	qword ptr [rcx + 8*rdx + 8], 0
 27377  	setne	al
 27378  	mov	qword ptr [r8 + 8*rdx + 8], rax
 27379  	xor	eax, eax
 27380  	cmp	qword ptr [rcx + 8*rdx + 16], 0
 27381  	setne	al
 27382  	mov	qword ptr [r8 + 8*rdx + 16], rax
 27383  	xor	eax, eax
 27384  	cmp	qword ptr [rcx + 8*rdx + 24], 0
 27385  	setne	al
 27386  	mov	qword ptr [r8 + 8*rdx + 24], rax
 27387  	add	rdx, 4	# four elements per iteration
 27388  	cmp	r10, rdx
 27389  	jne	.LBB4_1373
 27390  	jmp	.LBB4_1655
 27391  .LBB4_270:	# Handler entered by a jump from outside this chunk; only its dispatch and setup are here, the loops live at labels outside this chunk
 27392  	cmp	esi, 7
 27393  	je	.LBB4_476	# esi == 7 is handled at a label outside this chunk
 27394  # %bb.271:
 27395  	cmp	esi, 8
 27396  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27397  # %bb.272:
 27398  	test	r9d, r9d
 27399  	jle	.LBB4_1655	# element count in r9d is <= 0
 27400  # %bb.273:
 27401  	mov	eax, r9d	# rax = element count, zero-extended
 27402  	cmp	r9d, 4
 27403  	jae	.LBB4_689	# 4 or more elements: continue at .LBB4_689 (outside this chunk)
 27404  # %bb.274:
 27405  	xor	edx, edx	# start index = 0
 27406  	jmp	.LBB4_1014	# fewer than 4 elements: continue at .LBB4_1014 (outside this chunk; presumably the scalar loop)
 27407  .LBB4_275:	# Handler entered by a jump from outside this chunk; only its dispatch and setup are here, the loops live at labels outside this chunk
 27408  	cmp	esi, 7
 27409  	je	.LBB4_479	# esi == 7 is handled at a label outside this chunk
 27410  # %bb.276:
 27411  	cmp	esi, 8
 27412  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27413  # %bb.277:
 27414  	test	r9d, r9d
 27415  	jle	.LBB4_1655	# element count in r9d is <= 0
 27416  # %bb.278:
 27417  	mov	r10d, r9d	# r10 = element count, zero-extended
 27418  	cmp	r9d, 4
 27419  	jae	.LBB4_692	# 4 or more elements: continue at .LBB4_692 (outside this chunk)
 27420  # %bb.279:
 27421  	xor	edx, edx	# start index = 0
 27422  	jmp	.LBB4_1019	# fewer than 4 elements: continue at .LBB4_1019 (outside this chunk; presumably the scalar loop)
 27423  .LBB4_280:	# Handler entered by a jump from outside this chunk: for esi == 8, reads 64-bit signed values at rcx and writes a 64-bit sign value (-1/0/1) per element at r8
 27424  	cmp	esi, 7
 27425  	je	.LBB4_482	# esi == 7 is handled at a label outside this chunk
 27426  # %bb.281:
 27427  	cmp	esi, 8
 27428  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27429  # %bb.282:
 27430  	test	r9d, r9d
 27431  	jle	.LBB4_1655	# element count in r9d is <= 0
 27432  # %bb.283:
 27433  	mov	r11d, r9d	# r11 = element count, zero-extended
 27434  	cmp	r9d, 4
 27435  	jb	.LBB4_284	# fewer than 4 elements: scalar code only
 27436  # %bb.695:
 27437  	lea	rdx, [rcx + 8*r11]	# rdx = one past the end of the input (8 bytes per element)
 27438  	cmp	rdx, r8
 27439  	jbe	.LBB4_697	# input ends at or before the output start: no overlap (.LBB4_697 is outside this chunk; presumably the SIMD path)
 27440  # %bb.696:
 27441  	lea	rdx, [r8 + 8*r11]	# rdx = one past the end of the output (8 bytes per element)
 27442  	cmp	rdx, rcx
 27443  	jbe	.LBB4_697	# output ends at or before the input start: no overlap either
 27444  .LBB4_284:
 27445  	xor	edx, edx	# start index = 0
 27446  .LBB4_1378:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 27447  	mov	rsi, rdx
 27448  	not	rsi	# rsi = -rdx - 1
 27449  	test	r11b, 1
 27450  	je	.LBB4_1380	# even count: no single element to peel off
 27451  # %bb.1379:
 27452  	mov	r9, qword ptr [rcx + 8*rdx]
 27453  	xor	r10d, r10d
 27454  	test	r9, r9
 27455  	setne	r10b
 27456  	neg	r10	# r10 = -1 (all ones) if the value is nonzero, else 0
 27457  	test	r9, r9
 27458  	mov	edi, 1
 27459  	cmovle	rdi, r10	# value <= 0 (signed): keep 0 or -1; otherwise the result stays 1
 27460  	mov	qword ptr [r8 + 8*rdx], rdi
 27461  	or	rdx, 1	# sets bit 0 of the index; equals index + 1 when rdx was even
 27462  .LBB4_1380:
 27463  	add	rsi, r11	# rsi = count - start index - 1
 27464  	je	.LBB4_1655	# zero: the single element handled above was the last
 27465  # %bb.1381:
 27466  	mov	esi, 1	# constant +1 selected by cmovg in the loop
 27467  .LBB4_1382:                             # =>This Inner Loop Header: Depth=1
 27468  	mov	rdi, qword ptr [rcx + 8*rdx]
 27469  	xor	eax, eax
 27470  	test	rdi, rdi
 27471  	setne	al
 27472  	neg	rax	# rax = -1 if nonzero, else 0
 27473  	test	rdi, rdi
 27474  	cmovg	rax, rsi	# value > 0 (signed): result = 1
 27475  	mov	qword ptr [r8 + 8*rdx], rax
 27476  	mov	rax, qword ptr [rcx + 8*rdx + 8]	# second element of this iteration
 27477  	xor	edi, edi
 27478  	test	rax, rax
 27479  	setne	dil
 27480  	neg	rdi
 27481  	test	rax, rax
 27482  	cmovg	rdi, rsi
 27483  	mov	qword ptr [r8 + 8*rdx + 8], rdi
 27484  	add	rdx, 2	# two elements per iteration
 27485  	cmp	r11, rdx
 27486  	jne	.LBB4_1382
 27487  	jmp	.LBB4_1655
 27488  .LBB4_285:	# Handler entered by a jump from outside this chunk; only its dispatch and setup are here, the loops live at labels outside this chunk
 27489  	cmp	esi, 7
 27490  	je	.LBB4_485	# esi == 7 is handled at a label outside this chunk
 27491  # %bb.286:
 27492  	cmp	esi, 8
 27493  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27494  # %bb.287:
 27495  	test	r9d, r9d
 27496  	jle	.LBB4_1655	# element count in r9d is <= 0
 27497  # %bb.288:
 27498  	mov	r10d, r9d	# r10 = element count, zero-extended
 27499  	cmp	r9d, 1
 27500  	jne	.LBB4_700	# more than one element: continue at .LBB4_700 (outside this chunk)
 27501  # %bb.289:
 27502  	xor	eax, eax	# eax = 0 before continuing; presumably the start index
 27503  	jmp	.LBB4_290	# exactly one element: continue at .LBB4_290 (outside this chunk)
 27504  .LBB4_293:	# Handler entered by a jump from outside this chunk: for esi == 8, writes to each 64-bit slot at r8 a 1 if the matching byte at rcx is nonzero, else 0
 27505  	cmp	esi, 7
 27506  	je	.LBB4_488	# esi == 7 is handled at a label outside this chunk
 27507  # %bb.294:
 27508  	cmp	esi, 8
 27509  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27510  # %bb.295:
 27511  	test	r9d, r9d
 27512  	jle	.LBB4_1655	# element count in r9d is <= 0
 27513  # %bb.296:
 27514  	mov	r10d, r9d	# r10 = element count, zero-extended
 27515  	cmp	r9d, 4
 27516  	jb	.LBB4_297	# fewer than 4 elements: scalar code only
 27517  # %bb.708:
 27518  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input (1 byte per element)
 27519  	cmp	rdx, r8
 27520  	jbe	.LBB4_710	# input ends at or before the output start: no overlap (.LBB4_710 is outside this chunk; presumably the SIMD path)
 27521  # %bb.709:
 27522  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output (8 bytes per element)
 27523  	cmp	rdx, rcx
 27524  	jbe	.LBB4_710	# output ends at or before the input start: no overlap either
 27525  .LBB4_297:
 27526  	xor	edx, edx	# start index = 0
 27527  .LBB4_1387:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 27528  	mov	rsi, rdx
 27529  	not	rsi
 27530  	add	rsi, r10	# rsi = count - start index - 1
 27531  	mov	rdi, r10
 27532  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 27533  	je	.LBB4_1389
 27534  .LBB4_1388:                             # =>This Inner Loop Header: Depth=1
 27535  	xor	eax, eax	# clear rax so setne yields a clean 0/1 in 64 bits
 27536  	cmp	byte ptr [rcx + rdx], 0
 27537  	setne	al
 27538  	mov	qword ptr [r8 + 8*rdx], rax
 27539  	add	rdx, 1
 27540  	add	rdi, -1
 27541  	jne	.LBB4_1388
 27542  .LBB4_1389:
 27543  	cmp	rsi, 3
 27544  	jb	.LBB4_1655	# fewer than 4 elements were left at entry: nothing remains for the 4-way loop
 27545  .LBB4_1390:                             # =>This Inner Loop Header: Depth=1
 27546  	xor	eax, eax
 27547  	cmp	byte ptr [rcx + rdx], 0
 27548  	setne	al
 27549  	mov	qword ptr [r8 + 8*rdx], rax
 27550  	xor	eax, eax
 27551  	cmp	byte ptr [rcx + rdx + 1], 0
 27552  	setne	al
 27553  	mov	qword ptr [r8 + 8*rdx + 8], rax
 27554  	xor	eax, eax
 27555  	cmp	byte ptr [rcx + rdx + 2], 0
 27556  	setne	al
 27557  	mov	qword ptr [r8 + 8*rdx + 16], rax
 27558  	xor	eax, eax
 27559  	cmp	byte ptr [rcx + rdx + 3], 0
 27560  	setne	al
 27561  	mov	qword ptr [r8 + 8*rdx + 24], rax
 27562  	add	rdx, 4	# four elements per iteration
 27563  	cmp	r10, rdx
 27564  	jne	.LBB4_1390
 27565  	jmp	.LBB4_1655
 27566  .LBB4_298:	# Handler entered by a jump from outside this chunk; only its dispatch and setup are here, the loops live at labels outside this chunk
 27567  	cmp	esi, 7
 27568  	je	.LBB4_491	# esi == 7 is handled at a label outside this chunk
 27569  # %bb.299:
 27570  	cmp	esi, 8
 27571  	jne	.LBB4_1655	# any other esi value: no work here (.LBB4_1655 is outside this chunk; presumably the common exit)
 27572  # %bb.300:
 27573  	test	r9d, r9d
 27574  	jle	.LBB4_1655	# element count in r9d is <= 0
 27575  # %bb.301:
 27576  	mov	r10d, r9d	# r10 = element count, zero-extended
 27577  	cmp	r9d, 4
 27578  	jae	.LBB4_713	# 4 or more elements: continue at .LBB4_713 (outside this chunk)
 27579  # %bb.302:
 27580  	xor	edx, edx	# start index = 0
 27581  	jmp	.LBB4_1025	# fewer than 4 elements: continue at .LBB4_1025 (outside this chunk; presumably the scalar loop)
 27582  .LBB4_303:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 27583  	test	r9d, r9d
 27584  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27585  # %bb.304:
 27586  	mov	eax, r9d	# rax = element count, zero-extended
 27587  	cmp	r9d, 8
 27588  	jae	.LBB4_716	# 8 or more elements: continue at .LBB4_716 (outside this chunk)
 27589  # %bb.305:
 27590  	xor	edx, edx	# start index = 0
 27591  	jmp	.LBB4_1141	# fewer than 8 elements: continue at .LBB4_1141 (outside this chunk; presumably the scalar loop)
 27592  .LBB4_306:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 27593  	test	r9d, r9d
 27594  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27595  # %bb.307:
 27596  	mov	eax, r9d	# rax = element count, zero-extended
 27597  	cmp	r9d, 8
 27598  	jae	.LBB4_719	# 8 or more elements: continue at .LBB4_719 (outside this chunk)
 27599  # %bb.308:
 27600  	xor	edx, edx	# start index = 0
 27601  	jmp	.LBB4_1146	# fewer than 8 elements: continue at .LBB4_1146 (outside this chunk; presumably the scalar loop)
 27602  .LBB4_309:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 27603  	test	r9d, r9d
 27604  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27605  # %bb.310:
 27606  	mov	eax, r9d	# rax = element count, zero-extended
 27607  	xor	r10d, r10d	# r10 = 0; consumed by code outside this chunk
 27608  	cmp	r9d, 4
 27609  	jae	.LBB4_722	# 4 or more elements: continue at .LBB4_722 (outside this chunk)
 27610  # %bb.311:
 27611  	xor	esi, esi	# start index = 0
 27612  	jmp	.LBB4_1151	# fewer than 4 elements: continue at .LBB4_1151 (outside this chunk; presumably the scalar loop)
 27613  .LBB4_312:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 27614  	test	r9d, r9d
 27615  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27616  # %bb.313:
 27617  	mov	eax, r9d	# rax = element count, zero-extended
 27618  	xor	r10d, r10d	# r10 = 0; consumed by code outside this chunk
 27619  	cmp	r9d, 4
 27620  	jae	.LBB4_725	# 4 or more elements: continue at .LBB4_725 (outside this chunk)
 27621  # %bb.314:
 27622  	xor	esi, esi	# start index = 0
 27623  	jmp	.LBB4_1157	# fewer than 4 elements: continue at .LBB4_1157 (outside this chunk; presumably the scalar loop)
 27624  .LBB4_315:	# Handler entered by a jump from outside this chunk: reads signed bytes at rcx and writes a 16-bit sign value (-1/0/1) per element at r8
 27625  	test	r9d, r9d
 27626  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27627  # %bb.316:
 27628  	mov	r10d, r9d	# r10 = element count, zero-extended
 27629  	cmp	r9d, 16
 27630  	jb	.LBB4_317	# fewer than 16 elements: scalar code only
 27631  # %bb.728:
 27632  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input (1 byte per element)
 27633  	cmp	rdx, r8
 27634  	jbe	.LBB4_730	# input ends at or before the output start: no overlap (.LBB4_730 is outside this chunk; presumably the SIMD path)
 27635  # %bb.729:
 27636  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output (2 bytes per element)
 27637  	cmp	rdx, rcx
 27638  	jbe	.LBB4_730	# output ends at or before the input start: no overlap either
 27639  .LBB4_317:
 27640  	xor	edx, edx	# start index = 0
 27641  .LBB4_1395:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 27642  	mov	rsi, rdx
 27643  	not	rsi	# rsi = -rdx - 1
 27644  	test	r10b, 1
 27645  	je	.LBB4_1397	# even count: no single element to peel off
 27646  # %bb.1396:
 27647  	mov	r9b, byte ptr [rcx + rdx]
 27648  	xor	edi, edi
 27649  	test	r9b, r9b
 27650  	setne	dil
 27651  	neg	edi	# edi = -1 (all ones) if the byte is nonzero, else 0
 27652  	test	r9b, r9b
 27653  	mov	eax, 1
 27654  	cmovle	eax, edi	# byte <= 0 (signed): keep 0 or -1; otherwise the result stays 1
 27655  	mov	word ptr [r8 + 2*rdx], ax
 27656  	or	rdx, 1	# sets bit 0 of the index; equals index + 1 when rdx was even
 27657  .LBB4_1397:
 27658  	add	rsi, r10	# rsi = count - start index - 1
 27659  	je	.LBB4_1655	# zero: the single element handled above was the last
 27660  # %bb.1398:
 27661  	mov	esi, 1	# constant +1 selected by cmovg in the loop
 27662  .LBB4_1399:                             # =>This Inner Loop Header: Depth=1
 27663  	movzx	eax, byte ptr [rcx + rdx]
 27664  	xor	edi, edi
 27665  	test	al, al
 27666  	setne	dil
 27667  	neg	edi	# edi = -1 if nonzero, else 0
 27668  	test	al, al
 27669  	cmovg	edi, esi	# byte > 0 (signed 8-bit test): result = 1
 27670  	mov	word ptr [r8 + 2*rdx], di
 27671  	movzx	eax, byte ptr [rcx + rdx + 1]	# second element of this iteration
 27672  	xor	edi, edi
 27673  	test	al, al
 27674  	setne	dil
 27675  	neg	edi
 27676  	test	al, al
 27677  	cmovg	edi, esi
 27678  	mov	word ptr [r8 + 2*rdx + 2], di
 27679  	add	rdx, 2	# two elements per iteration
 27680  	cmp	r10, rdx
 27681  	jne	.LBB4_1399
 27682  	jmp	.LBB4_1655
 27683  .LBB4_318:	# Handler entered by a jump from outside this chunk: reads signed bytes at rcx and writes a 16-bit sign value (-1/0/1) per element at r8
 27684  	test	r9d, r9d
 27685  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27686  # %bb.319:
 27687  	mov	r10d, r9d	# r10 = element count, zero-extended
 27688  	cmp	r9d, 16
 27689  	jb	.LBB4_320	# fewer than 16 elements: scalar code only
 27690  # %bb.733:
 27691  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input (1 byte per element)
 27692  	cmp	rdx, r8
 27693  	jbe	.LBB4_735	# input ends at or before the output start: no overlap (.LBB4_735 is outside this chunk; presumably the SIMD path)
 27694  # %bb.734:
 27695  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output (2 bytes per element)
 27696  	cmp	rdx, rcx
 27697  	jbe	.LBB4_735	# output ends at or before the input start: no overlap either
 27698  .LBB4_320:
 27699  	xor	edx, edx	# start index = 0
 27700  .LBB4_1404:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 27701  	mov	rsi, rdx
 27702  	not	rsi	# rsi = -rdx - 1
 27703  	test	r10b, 1
 27704  	je	.LBB4_1406	# even count: no single element to peel off
 27705  # %bb.1405:
 27706  	mov	r9b, byte ptr [rcx + rdx]
 27707  	xor	edi, edi
 27708  	test	r9b, r9b
 27709  	setne	dil
 27710  	neg	edi	# edi = -1 (all ones) if the byte is nonzero, else 0
 27711  	test	r9b, r9b
 27712  	mov	eax, 1
 27713  	cmovle	eax, edi	# byte <= 0 (signed): keep 0 or -1; otherwise the result stays 1
 27714  	mov	word ptr [r8 + 2*rdx], ax
 27715  	or	rdx, 1	# sets bit 0 of the index; equals index + 1 when rdx was even
 27716  .LBB4_1406:
 27717  	add	rsi, r10	# rsi = count - start index - 1
 27718  	je	.LBB4_1655	# zero: the single element handled above was the last
 27719  # %bb.1407:
 27720  	mov	esi, 1	# constant +1 selected by cmovg in the loop
 27721  .LBB4_1408:                             # =>This Inner Loop Header: Depth=1
 27722  	movzx	eax, byte ptr [rcx + rdx]
 27723  	xor	edi, edi
 27724  	test	al, al
 27725  	setne	dil
 27726  	neg	edi	# edi = -1 if nonzero, else 0
 27727  	test	al, al
 27728  	cmovg	edi, esi	# byte > 0 (signed 8-bit test): result = 1
 27729  	mov	word ptr [r8 + 2*rdx], di
 27730  	movzx	eax, byte ptr [rcx + rdx + 1]	# second element of this iteration
 27731  	xor	edi, edi
 27732  	test	al, al
 27733  	setne	dil
 27734  	neg	edi
 27735  	test	al, al
 27736  	cmovg	edi, esi
 27737  	mov	word ptr [r8 + 2*rdx + 2], di
 27738  	add	rdx, 2	# two elements per iteration
 27739  	cmp	r10, rdx
 27740  	jne	.LBB4_1408
 27741  	jmp	.LBB4_1655
 27742  .LBB4_321:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 27743  	test	r9d, r9d
 27744  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27745  # %bb.322:
 27746  	mov	eax, r9d	# rax = element count, zero-extended
 27747  	cmp	r9d, 4
 27748  	jae	.LBB4_738	# 4 or more elements: continue at .LBB4_738 (outside this chunk)
 27749  # %bb.323:
 27750  	xor	edx, edx	# start index = 0
 27751  	jmp	.LBB4_1031	# fewer than 4 elements: continue at .LBB4_1031 (outside this chunk; presumably the scalar loop)
 27752  .LBB4_324:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 27753  	test	r9d, r9d
 27754  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27755  # %bb.325:
 27756  	mov	eax, r9d	# rax = element count, zero-extended
 27757  	cmp	r9d, 4
 27758  	jae	.LBB4_741	# 4 or more elements: continue at .LBB4_741 (outside this chunk)
 27759  # %bb.326:
 27760  	xor	edx, edx	# start index = 0
 27761  	jmp	.LBB4_1036	# fewer than 4 elements: continue at .LBB4_1036 (outside this chunk; presumably the scalar loop)
 27762  .LBB4_327:	# Handler entered by a jump from outside this chunk: writes to each 16-bit slot at r8 a 1 if the matching 16-bit value at rcx is nonzero, else 0
 27763  	test	r9d, r9d
 27764  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27765  # %bb.328:
 27766  	mov	r10d, r9d	# r10 = element count, zero-extended
 27767  	cmp	r9d, 16
 27768  	jb	.LBB4_329	# fewer than 16 elements: scalar code only
 27769  # %bb.744:
 27770  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input (2 bytes per element)
 27771  	cmp	rdx, r8
 27772  	jbe	.LBB4_746	# input ends at or before the output start: no overlap (.LBB4_746 is outside this chunk; presumably the SIMD path)
 27773  # %bb.745:
 27774  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output (2 bytes per element)
 27775  	cmp	rdx, rcx
 27776  	jbe	.LBB4_746	# output ends at or before the input start: no overlap either
 27777  .LBB4_329:
 27778  	xor	edx, edx	# start index = 0
 27779  .LBB4_1413:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 27780  	mov	rsi, rdx
 27781  	not	rsi
 27782  	add	rsi, r10	# rsi = count - start index - 1
 27783  	mov	rdi, r10
 27784  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 27785  	je	.LBB4_1415
 27786  .LBB4_1414:                             # =>This Inner Loop Header: Depth=1
 27787  	xor	eax, eax	# clear eax so setne yields a clean 0/1 in ax
 27788  	cmp	word ptr [rcx + 2*rdx], 0
 27789  	setne	al
 27790  	mov	word ptr [r8 + 2*rdx], ax
 27791  	add	rdx, 1
 27792  	add	rdi, -1
 27793  	jne	.LBB4_1414
 27794  .LBB4_1415:
 27795  	cmp	rsi, 3
 27796  	jb	.LBB4_1655	# fewer than 4 elements were left at entry: nothing remains for the 4-way loop
 27797  .LBB4_1416:                             # =>This Inner Loop Header: Depth=1
 27798  	xor	eax, eax
 27799  	cmp	word ptr [rcx + 2*rdx], 0
 27800  	setne	al
 27801  	mov	word ptr [r8 + 2*rdx], ax
 27802  	xor	eax, eax
 27803  	cmp	word ptr [rcx + 2*rdx + 2], 0
 27804  	setne	al
 27805  	mov	word ptr [r8 + 2*rdx + 2], ax
 27806  	xor	eax, eax
 27807  	cmp	word ptr [rcx + 2*rdx + 4], 0
 27808  	setne	al
 27809  	mov	word ptr [r8 + 2*rdx + 4], ax
 27810  	xor	eax, eax
 27811  	cmp	word ptr [rcx + 2*rdx + 6], 0
 27812  	setne	al
 27813  	mov	word ptr [r8 + 2*rdx + 6], ax
 27814  	add	rdx, 4	# four elements per iteration
 27815  	cmp	r10, rdx
 27816  	jne	.LBB4_1416
 27817  	jmp	.LBB4_1655
 27818  .LBB4_330:	# Handler entered by a jump from outside this chunk: writes to each 16-bit slot at r8 a 1 if the matching 16-bit value at rcx is nonzero, else 0
 27819  	test	r9d, r9d
 27820  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27821  # %bb.331:
 27822  	mov	r10d, r9d	# r10 = element count, zero-extended
 27823  	cmp	r9d, 16
 27824  	jb	.LBB4_332	# fewer than 16 elements: scalar code only
 27825  # %bb.749:
 27826  	lea	rdx, [rcx + 2*r10]	# rdx = one past the end of the input (2 bytes per element)
 27827  	cmp	rdx, r8
 27828  	jbe	.LBB4_751	# input ends at or before the output start: no overlap (.LBB4_751 is outside this chunk; presumably the SIMD path)
 27829  # %bb.750:
 27830  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output (2 bytes per element)
 27831  	cmp	rdx, rcx
 27832  	jbe	.LBB4_751	# output ends at or before the input start: no overlap either
 27833  .LBB4_332:
 27834  	xor	edx, edx	# start index = 0
 27835  .LBB4_1421:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 27836  	mov	rsi, rdx
 27837  	not	rsi
 27838  	add	rsi, r10	# rsi = count - start index - 1
 27839  	mov	rdi, r10
 27840  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 27841  	je	.LBB4_1423
 27842  .LBB4_1422:                             # =>This Inner Loop Header: Depth=1
 27843  	xor	eax, eax	# clear eax so setne yields a clean 0/1 in ax
 27844  	cmp	word ptr [rcx + 2*rdx], 0
 27845  	setne	al
 27846  	mov	word ptr [r8 + 2*rdx], ax
 27847  	add	rdx, 1
 27848  	add	rdi, -1
 27849  	jne	.LBB4_1422
 27850  .LBB4_1423:
 27851  	cmp	rsi, 3
 27852  	jb	.LBB4_1655	# fewer than 4 elements were left at entry: nothing remains for the 4-way loop
 27853  .LBB4_1424:                             # =>This Inner Loop Header: Depth=1
 27854  	xor	eax, eax
 27855  	cmp	word ptr [rcx + 2*rdx], 0
 27856  	setne	al
 27857  	mov	word ptr [r8 + 2*rdx], ax
 27858  	xor	eax, eax
 27859  	cmp	word ptr [rcx + 2*rdx + 2], 0
 27860  	setne	al
 27861  	mov	word ptr [r8 + 2*rdx + 2], ax
 27862  	xor	eax, eax
 27863  	cmp	word ptr [rcx + 2*rdx + 4], 0
 27864  	setne	al
 27865  	mov	word ptr [r8 + 2*rdx + 4], ax
 27866  	xor	eax, eax
 27867  	cmp	word ptr [rcx + 2*rdx + 6], 0
 27868  	setne	al
 27869  	mov	word ptr [r8 + 2*rdx + 6], ax
 27870  	add	rdx, 4	# four elements per iteration
 27871  	cmp	r10, rdx
 27872  	jne	.LBB4_1424
 27873  	jmp	.LBB4_1655
 27874  .LBB4_333:	# Handler entered by a jump from outside this chunk: reads 16-bit signed values at rcx and writes a 16-bit sign value (-1/0/1) per element at r8
 27875  	test	r9d, r9d
 27876  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27877  # %bb.334:
 27878  	mov	r11d, r9d	# r11 = element count, zero-extended
 27879  	cmp	r9d, 16
 27880  	jb	.LBB4_335	# fewer than 16 elements: scalar code only
 27881  # %bb.754:
 27882  	lea	rdx, [rcx + 2*r11]	# rdx = one past the end of the input (2 bytes per element)
 27883  	cmp	rdx, r8
 27884  	jbe	.LBB4_756	# input ends at or before the output start: no overlap (.LBB4_756 is outside this chunk; presumably the SIMD path)
 27885  # %bb.755:
 27886  	lea	rdx, [r8 + 2*r11]	# rdx = one past the end of the output (2 bytes per element)
 27887  	cmp	rdx, rcx
 27888  	jbe	.LBB4_756	# output ends at or before the input start: no overlap either
 27889  .LBB4_335:
 27890  	xor	edx, edx	# start index = 0
 27891  .LBB4_1429:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 27892  	mov	rsi, rdx
 27893  	not	rsi	# rsi = -rdx - 1
 27894  	test	r11b, 1
 27895  	je	.LBB4_1431	# even count: no single element to peel off
 27896  # %bb.1430:
 27897  	movzx	r9d, word ptr [rcx + 2*rdx]
 27898  	xor	r10d, r10d
 27899  	test	r9w, r9w
 27900  	setne	r10b
 27901  	neg	r10d	# r10d = -1 (all ones) if the value is nonzero, else 0
 27902  	test	r9w, r9w
 27903  	mov	edi, 1
 27904  	cmovle	edi, r10d	# value <= 0 (signed 16-bit test): keep 0 or -1; otherwise the result stays 1
 27905  	mov	word ptr [r8 + 2*rdx], di
 27906  	or	rdx, 1	# sets bit 0 of the index; equals index + 1 when rdx was even
 27907  .LBB4_1431:
 27908  	add	rsi, r11	# rsi = count - start index - 1
 27909  	je	.LBB4_1655	# zero: the single element handled above was the last
 27910  # %bb.1432:
 27911  	mov	esi, 1	# constant +1 selected by cmovg in the loop
 27912  .LBB4_1433:                             # =>This Inner Loop Header: Depth=1
 27913  	movzx	edi, word ptr [rcx + 2*rdx]
 27914  	xor	eax, eax
 27915  	test	di, di
 27916  	setne	al
 27917  	neg	eax	# eax = -1 if nonzero, else 0
 27918  	test	di, di
 27919  	cmovg	eax, esi	# value > 0 (signed 16-bit test): result = 1
 27920  	mov	word ptr [r8 + 2*rdx], ax
 27921  	movzx	eax, word ptr [rcx + 2*rdx + 2]	# second element of this iteration
 27922  	xor	edi, edi
 27923  	test	ax, ax
 27924  	setne	dil
 27925  	neg	edi
 27926  	test	ax, ax
 27927  	cmovg	edi, esi
 27928  	mov	word ptr [r8 + 2*rdx + 2], di
 27929  	add	rdx, 2	# two elements per iteration
 27930  	cmp	r11, rdx
 27931  	jne	.LBB4_1433
 27932  	jmp	.LBB4_1655
 27933  .LBB4_336:	# Handler entered by a jump from outside this chunk: reads 16-bit signed values at rcx and writes a 16-bit sign value (-1/0/1) per element at r8
 27934  	test	r9d, r9d
 27935  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27936  # %bb.337:
 27937  	mov	r11d, r9d	# r11 = element count, zero-extended
 27938  	cmp	r9d, 16
 27939  	jb	.LBB4_338	# fewer than 16 elements: scalar code only
 27940  # %bb.759:
 27941  	lea	rdx, [rcx + 2*r11]	# rdx = one past the end of the input (2 bytes per element)
 27942  	cmp	rdx, r8
 27943  	jbe	.LBB4_761	# input ends at or before the output start: no overlap (.LBB4_761 is outside this chunk; presumably the SIMD path)
 27944  # %bb.760:
 27945  	lea	rdx, [r8 + 2*r11]	# rdx = one past the end of the output (2 bytes per element)
 27946  	cmp	rdx, rcx
 27947  	jbe	.LBB4_761	# output ends at or before the input start: no overlap either
 27948  .LBB4_338:
 27949  	xor	edx, edx	# start index = 0
 27950  .LBB4_1438:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 27951  	mov	rsi, rdx
 27952  	not	rsi	# rsi = -rdx - 1
 27953  	test	r11b, 1
 27954  	je	.LBB4_1440	# even count: no single element to peel off
 27955  # %bb.1439:
 27956  	movzx	r9d, word ptr [rcx + 2*rdx]
 27957  	xor	r10d, r10d
 27958  	test	r9w, r9w
 27959  	setne	r10b
 27960  	neg	r10d	# r10d = -1 (all ones) if the value is nonzero, else 0
 27961  	test	r9w, r9w
 27962  	mov	edi, 1
 27963  	cmovle	edi, r10d	# value <= 0 (signed 16-bit test): keep 0 or -1; otherwise the result stays 1
 27964  	mov	word ptr [r8 + 2*rdx], di
 27965  	or	rdx, 1	# sets bit 0 of the index; equals index + 1 when rdx was even
 27966  .LBB4_1440:
 27967  	add	rsi, r11	# rsi = count - start index - 1
 27968  	je	.LBB4_1655	# zero: the single element handled above was the last
 27969  # %bb.1441:
 27970  	mov	esi, 1	# constant +1 selected by cmovg in the loop
 27971  .LBB4_1442:                             # =>This Inner Loop Header: Depth=1
 27972  	movzx	edi, word ptr [rcx + 2*rdx]
 27973  	xor	eax, eax
 27974  	test	di, di
 27975  	setne	al
 27976  	neg	eax	# eax = -1 if nonzero, else 0
 27977  	test	di, di
 27978  	cmovg	eax, esi	# value > 0 (signed 16-bit test): result = 1
 27979  	mov	word ptr [r8 + 2*rdx], ax
 27980  	movzx	eax, word ptr [rcx + 2*rdx + 2]	# second element of this iteration
 27981  	xor	edi, edi
 27982  	test	ax, ax
 27983  	setne	dil
 27984  	neg	edi
 27985  	test	ax, ax
 27986  	cmovg	edi, esi
 27987  	mov	word ptr [r8 + 2*rdx + 2], di
 27988  	add	rdx, 2	# two elements per iteration
 27989  	cmp	r11, rdx
 27990  	jne	.LBB4_1442
 27991  	jmp	.LBB4_1655
 27992  .LBB4_339:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 27993  	test	r9d, r9d
 27994  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 27995  # %bb.340:
 27996  	mov	r10d, r9d	# r10 = element count, zero-extended
 27997  	cmp	r9d, 4
 27998  	jae	.LBB4_764	# 4 or more elements: continue at .LBB4_764 (outside this chunk)
 27999  # %bb.341:
 28000  	xor	edx, edx	# start index = 0
 28001  	jmp	.LBB4_1041	# fewer than 4 elements: continue at .LBB4_1041 (outside this chunk; presumably the scalar loop)
 28002  .LBB4_342:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 28003  	test	r9d, r9d
 28004  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28005  # %bb.343:
 28006  	mov	r10d, r9d	# r10 = element count, zero-extended
 28007  	cmp	r9d, 4
 28008  	jae	.LBB4_767	# 4 or more elements: continue at .LBB4_767 (outside this chunk)
 28009  # %bb.344:
 28010  	xor	edx, edx	# start index = 0
 28011  	jmp	.LBB4_1163	# fewer than 4 elements: continue at .LBB4_1163 (outside this chunk; presumably the scalar loop)
 28012  .LBB4_345:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 28013  	test	r9d, r9d
 28014  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28015  # %bb.346:
 28016  	mov	eax, r9d	# rax = element count, zero-extended
 28017  	xor	r10d, r10d	# r10 = 0; consumed by code outside this chunk
 28018  	cmp	r9d, 8
 28019  	jae	.LBB4_770	# 8 or more elements: continue at .LBB4_770 (outside this chunk)
 28020  # %bb.347:
 28021  	xor	esi, esi	# start index = 0
 28022  	jmp	.LBB4_1169	# fewer than 8 elements: continue at .LBB4_1169 (outside this chunk; presumably the scalar loop)
 28023  .LBB4_348:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 28024  	test	r9d, r9d
 28025  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28026  # %bb.349:
 28027  	mov	eax, r9d	# rax = element count, zero-extended
 28028  	xor	r10d, r10d	# r10 = 0; consumed by code outside this chunk
 28029  	cmp	r9d, 8
 28030  	jae	.LBB4_773	# 8 or more elements: continue at .LBB4_773 (outside this chunk)
 28031  # %bb.350:
 28032  	xor	esi, esi	# start index = 0
 28033  	jmp	.LBB4_1175	# fewer than 8 elements: continue at .LBB4_1175 (outside this chunk; presumably the scalar loop)
 28034  .LBB4_351:	# Handler entered by a jump from outside this chunk: writes to each 16-bit slot at r8 a 1 if the matching byte at rcx is nonzero, else 0
 28035  	test	r9d, r9d
 28036  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28037  # %bb.352:
 28038  	mov	r10d, r9d	# r10 = element count, zero-extended
 28039  	cmp	r9d, 16
 28040  	jb	.LBB4_353	# fewer than 16 elements: scalar code only
 28041  # %bb.776:
 28042  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input (1 byte per element)
 28043  	cmp	rdx, r8
 28044  	jbe	.LBB4_778	# input ends at or before the output start: no overlap (.LBB4_778 is outside this chunk; presumably the SIMD path)
 28045  # %bb.777:
 28046  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output (2 bytes per element)
 28047  	cmp	rdx, rcx
 28048  	jbe	.LBB4_778	# output ends at or before the input start: no overlap either
 28049  .LBB4_353:
 28050  	xor	edx, edx	# start index = 0
 28051  .LBB4_1447:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 28052  	mov	rsi, rdx
 28053  	not	rsi
 28054  	add	rsi, r10	# rsi = count - start index - 1
 28055  	mov	rdi, r10
 28056  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 28057  	je	.LBB4_1449
 28058  .LBB4_1448:                             # =>This Inner Loop Header: Depth=1
 28059  	xor	eax, eax	# clear eax so setne yields a clean 0/1 in ax
 28060  	cmp	byte ptr [rcx + rdx], 0
 28061  	setne	al
 28062  	mov	word ptr [r8 + 2*rdx], ax
 28063  	add	rdx, 1
 28064  	add	rdi, -1
 28065  	jne	.LBB4_1448
 28066  .LBB4_1449:
 28067  	cmp	rsi, 3
 28068  	jb	.LBB4_1655	# fewer than 4 elements were left at entry: nothing remains for the 4-way loop
 28069  .LBB4_1450:                             # =>This Inner Loop Header: Depth=1
 28070  	xor	eax, eax
 28071  	cmp	byte ptr [rcx + rdx], 0
 28072  	setne	al
 28073  	mov	word ptr [r8 + 2*rdx], ax
 28074  	xor	eax, eax
 28075  	cmp	byte ptr [rcx + rdx + 1], 0
 28076  	setne	al
 28077  	mov	word ptr [r8 + 2*rdx + 2], ax
 28078  	xor	eax, eax
 28079  	cmp	byte ptr [rcx + rdx + 2], 0
 28080  	setne	al
 28081  	mov	word ptr [r8 + 2*rdx + 4], ax
 28082  	xor	eax, eax
 28083  	cmp	byte ptr [rcx + rdx + 3], 0
 28084  	setne	al
 28085  	mov	word ptr [r8 + 2*rdx + 6], ax
 28086  	add	rdx, 4	# four elements per iteration
 28087  	cmp	r10, rdx
 28088  	jne	.LBB4_1450
 28089  	jmp	.LBB4_1655
 28090  .LBB4_354:	# Handler entered by a jump from outside this chunk: writes to each 16-bit slot at r8 a 1 if the matching byte at rcx is nonzero, else 0
 28091  	test	r9d, r9d
 28092  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28093  # %bb.355:
 28094  	mov	r10d, r9d	# r10 = element count, zero-extended
 28095  	cmp	r9d, 16
 28096  	jb	.LBB4_356	# fewer than 16 elements: scalar code only
 28097  # %bb.781:
 28098  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input (1 byte per element)
 28099  	cmp	rdx, r8
 28100  	jbe	.LBB4_783	# input ends at or before the output start: no overlap (.LBB4_783 is outside this chunk; presumably the SIMD path)
 28101  # %bb.782:
 28102  	lea	rdx, [r8 + 2*r10]	# rdx = one past the end of the output (2 bytes per element)
 28103  	cmp	rdx, rcx
 28104  	jbe	.LBB4_783	# output ends at or before the input start: no overlap either
 28105  .LBB4_356:
 28106  	xor	edx, edx	# start index = 0
 28107  .LBB4_1455:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 28108  	mov	rsi, rdx
 28109  	not	rsi
 28110  	add	rsi, r10	# rsi = count - start index - 1
 28111  	mov	rdi, r10
 28112  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 28113  	je	.LBB4_1457
 28114  .LBB4_1456:                             # =>This Inner Loop Header: Depth=1
 28115  	xor	eax, eax	# clear eax so setne yields a clean 0/1 in ax
 28116  	cmp	byte ptr [rcx + rdx], 0
 28117  	setne	al
 28118  	mov	word ptr [r8 + 2*rdx], ax
 28119  	add	rdx, 1
 28120  	add	rdi, -1
 28121  	jne	.LBB4_1456
 28122  .LBB4_1457:
 28123  	cmp	rsi, 3
 28124  	jb	.LBB4_1655	# fewer than 4 elements were left at entry: nothing remains for the 4-way loop
 28125  .LBB4_1458:                             # =>This Inner Loop Header: Depth=1
 28126  	xor	eax, eax
 28127  	cmp	byte ptr [rcx + rdx], 0
 28128  	setne	al
 28129  	mov	word ptr [r8 + 2*rdx], ax
 28130  	xor	eax, eax
 28131  	cmp	byte ptr [rcx + rdx + 1], 0
 28132  	setne	al
 28133  	mov	word ptr [r8 + 2*rdx + 2], ax
 28134  	xor	eax, eax
 28135  	cmp	byte ptr [rcx + rdx + 2], 0
 28136  	setne	al
 28137  	mov	word ptr [r8 + 2*rdx + 4], ax
 28138  	xor	eax, eax
 28139  	cmp	byte ptr [rcx + rdx + 3], 0
 28140  	setne	al
 28141  	mov	word ptr [r8 + 2*rdx + 6], ax
 28142  	add	rdx, 4	# four elements per iteration
 28143  	cmp	r10, rdx
 28144  	jne	.LBB4_1458
 28145  	jmp	.LBB4_1655
 28146  .LBB4_357:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 28147  	test	r9d, r9d
 28148  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28149  # %bb.358:
 28150  	mov	r10d, r9d	# r10 = element count, zero-extended
 28151  	cmp	r9d, 8
 28152  	jae	.LBB4_786	# 8 or more elements: continue at .LBB4_786 (outside this chunk)
 28153  # %bb.359:
 28154  	xor	edx, edx	# start index = 0
 28155  	jmp	.LBB4_1047	# fewer than 8 elements: continue at .LBB4_1047 (outside this chunk; presumably the scalar loop)
 28156  .LBB4_360:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 28157  	test	r9d, r9d
 28158  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28159  # %bb.361:
 28160  	mov	r10d, r9d	# r10 = element count, zero-extended
 28161  	cmp	r9d, 8
 28162  	jae	.LBB4_789	# 8 or more elements: continue at .LBB4_789 (outside this chunk)
 28163  # %bb.362:
 28164  	xor	edx, edx	# start index = 0
 28165  	jmp	.LBB4_1053	# fewer than 8 elements: continue at .LBB4_1053 (outside this chunk; presumably the scalar loop)
 28166  .LBB4_363:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 28167  	test	r9d, r9d
 28168  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28169  # %bb.364:
 28170  	mov	eax, r9d	# rax = element count, zero-extended
 28171  	cmp	r9d, 4
 28172  	jae	.LBB4_792	# 4 or more elements: continue at .LBB4_792 (outside this chunk)
 28173  # %bb.365:
 28174  	xor	edx, edx	# start index = 0
 28175  	jmp	.LBB4_1181	# fewer than 4 elements: continue at .LBB4_1181 (outside this chunk; presumably the scalar loop)
 28176  .LBB4_366:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 28177  	test	r9d, r9d
 28178  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28179  # %bb.367:
 28180  	mov	eax, r9d	# rax = element count, zero-extended
 28181  	cmp	r9d, 8
 28182  	jae	.LBB4_795	# 8 or more elements: continue at .LBB4_795 (outside this chunk)
 28183  # %bb.368:
 28184  	xor	edx, edx	# start index = 0
 28185  	jmp	.LBB4_1186	# fewer than 8 elements: continue at .LBB4_1186 (outside this chunk; presumably the scalar loop)
 28186  .LBB4_369:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 28187  	test	r9d, r9d
 28188  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28189  # %bb.370:
 28190  	mov	eax, r9d	# rax = element count, zero-extended
 28191  	cmp	r9d, 4
 28192  	jae	.LBB4_798	# 4 or more elements: continue at .LBB4_798 (outside this chunk)
 28193  # %bb.371:
 28194  	xor	edx, edx	# start index = 0
 28195  	jmp	.LBB4_1194	# fewer than 4 elements: continue at .LBB4_1194 (outside this chunk; presumably the scalar loop)
 28196  .LBB4_372:	# Handler entered by a jump from outside this chunk; only the count check and setup are here, the loops live at labels outside this chunk
 28197  	test	r9d, r9d
 28198  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28199  # %bb.373:
 28200  	mov	eax, r9d	# rax = element count, zero-extended
 28201  	cmp	r9d, 4
 28202  	jae	.LBB4_801	# 4 or more elements: continue at .LBB4_801 (outside this chunk)
 28203  # %bb.374:
 28204  	xor	edx, edx	# start index = 0
 28205  	jmp	.LBB4_1200	# fewer than 4 elements: continue at .LBB4_1200 (outside this chunk; presumably the scalar loop)
 28206  .LBB4_375:	# Handler entered by a jump from outside this chunk: reads signed bytes at rcx and writes a 64-bit sign value (-1/0/1) per element at r8
 28207  	test	r9d, r9d
 28208  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28209  # %bb.376:
 28210  	mov	r10d, r9d	# r10 = element count, zero-extended
 28211  	cmp	r9d, 4
 28212  	jb	.LBB4_377	# fewer than 4 elements: scalar code only
 28213  # %bb.804:
 28214  	lea	rdx, [rcx + r10]	# rdx = one past the end of the input (1 byte per element)
 28215  	cmp	rdx, r8
 28216  	jbe	.LBB4_806	# input ends at or before the output start: no overlap (.LBB4_806 is outside this chunk; presumably the SIMD path)
 28217  # %bb.805:
 28218  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output (8 bytes per element)
 28219  	cmp	rdx, rcx
 28220  	jbe	.LBB4_806	# output ends at or before the input start: no overlap either
 28221  .LBB4_377:
 28222  	xor	edx, edx	# start index = 0
 28223  .LBB4_1463:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 28224  	mov	rsi, rdx
 28225  	not	rsi	# rsi = -rdx - 1
 28226  	test	r10b, 1
 28227  	je	.LBB4_1465	# even count: no single element to peel off
 28228  # %bb.1464:
 28229  	mov	al, byte ptr [rcx + rdx]
 28230  	xor	edi, edi
 28231  	test	al, al
 28232  	setne	dil
 28233  	neg	rdi	# rdi = -1 (all ones) if the byte is nonzero, else 0
 28234  	test	al, al
 28235  	mov	eax, 1
 28236  	cmovle	rax, rdi	# byte <= 0 (signed): keep 0 or -1; otherwise the result stays 1
 28237  	mov	qword ptr [r8 + 8*rdx], rax
 28238  	or	rdx, 1	# sets bit 0 of the index; equals index + 1 when rdx was even
 28239  .LBB4_1465:
 28240  	add	rsi, r10	# rsi = count - start index - 1
 28241  	je	.LBB4_1655	# zero: the single element handled above was the last
 28242  # %bb.1466:
 28243  	mov	esi, 1	# constant +1 selected by cmovg in the loop
 28244  .LBB4_1467:                             # =>This Inner Loop Header: Depth=1
 28245  	movzx	eax, byte ptr [rcx + rdx]
 28246  	xor	edi, edi
 28247  	test	al, al
 28248  	setne	dil
 28249  	neg	rdi	# rdi = -1 if nonzero, else 0
 28250  	test	al, al
 28251  	cmovg	rdi, rsi	# byte > 0 (signed 8-bit test): result = 1
 28252  	mov	qword ptr [r8 + 8*rdx], rdi
 28253  	movzx	eax, byte ptr [rcx + rdx + 1]	# second element of this iteration
 28254  	xor	edi, edi
 28255  	test	al, al
 28256  	setne	dil
 28257  	neg	rdi
 28258  	test	al, al
 28259  	cmovg	rdi, rsi
 28260  	mov	qword ptr [r8 + 8*rdx + 8], rdi
 28261  	add	rdx, 2	# two elements per iteration
 28262  	cmp	r10, rdx
 28263  	jne	.LBB4_1467
 28264  	jmp	.LBB4_1655
 28265  .LBB4_378:	# Handler entered by a jump from outside this chunk: byte inputs at rcx, 4-byte outputs at r8; only the setup and the first zero test are here, the rest continues at labels outside this chunk
 28266  	test	r9d, r9d
 28267  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28268  # %bb.379:
 28269  	mov	eax, r9d	# rax = element count, zero-extended
 28270  	cmp	r9d, 8
 28271  	jb	.LBB4_380	# fewer than 8 elements: scalar code only
 28272  # %bb.809:
 28273  	lea	rdx, [rcx + rax]	# rdx = one past the end of the input (1 byte per element)
 28274  	cmp	rdx, r8
 28275  	jbe	.LBB4_811	# input ends at or before the output start: no overlap (.LBB4_811 is outside this chunk; presumably the SIMD path)
 28276  # %bb.810:
 28277  	lea	rdx, [r8 + 4*rax]	# rdx = one past the end of the output (4 bytes per element)
 28278  	cmp	rdx, rcx
 28279  	jbe	.LBB4_811	# output ends at or before the input start: no overlap either
 28280  .LBB4_380:
 28281  	xor	edx, edx	# start index = 0
 28282  .LBB4_1472:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 28283  	mov	rsi, rdx
 28284  	not	rsi	# rsi = -rdx - 1
 28285  	test	al, 1
 28286  	je	.LBB4_1479	# even count: skip the single-element step (.LBB4_1479 is outside this chunk)
 28287  # %bb.1473:
 28288  	cmp	byte ptr [rcx + rdx], 0
 28289  	jne	.LBB4_1475	# nonzero byte: handled at .LBB4_1475 (outside this chunk)
 28290  # %bb.1474:
 28291  	pxor	xmm0, xmm0	# zero byte: xmm0 = all zero bits
 28292  	jmp	.LBB4_1476	# continues outside this chunk; presumably the store of xmm0
 28293  .LBB4_381:	# Handler entered by a jump from outside this chunk: writes to each 64-bit slot at r8 a 1 if the matching 64-bit value at rcx is nonzero, else 0
 28294  	test	r9d, r9d
 28295  	jle	.LBB4_1655	# element count in r9d is <= 0 (.LBB4_1655 is outside this chunk; presumably the common exit)
 28296  # %bb.382:
 28297  	mov	r10d, r9d	# r10 = element count, zero-extended
 28298  	cmp	r9d, 4
 28299  	jb	.LBB4_383	# fewer than 4 elements: scalar code only
 28300  # %bb.814:
 28301  	lea	rdx, [rcx + 8*r10]	# rdx = one past the end of the input (8 bytes per element)
 28302  	cmp	rdx, r8
 28303  	jbe	.LBB4_816	# input ends at or before the output start: no overlap (.LBB4_816 is outside this chunk; presumably the SIMD path)
 28304  # %bb.815:
 28305  	lea	rdx, [r8 + 8*r10]	# rdx = one past the end of the output (8 bytes per element)
 28306  	cmp	rdx, rcx
 28307  	jbe	.LBB4_816	# output ends at or before the input start: no overlap either
 28308  .LBB4_383:
 28309  	xor	edx, edx	# start index = 0
 28310  .LBB4_1494:	# scalar code; rdx = first unprocessed index (possibly also entered from outside this chunk)
 28311  	mov	rsi, rdx
 28312  	not	rsi
 28313  	add	rsi, r10	# rsi = count - start index - 1
 28314  	mov	rdi, r10
 28315  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 28316  	je	.LBB4_1496
 28317  .LBB4_1495:                             # =>This Inner Loop Header: Depth=1
 28318  	xor	eax, eax	# clear rax so setne yields a clean 0/1 in 64 bits
 28319  	cmp	qword ptr [rcx + 8*rdx], 0
 28320  	setne	al
 28321  	mov	qword ptr [r8 + 8*rdx], rax
 28322  	add	rdx, 1
 28323  	add	rdi, -1
 28324  	jne	.LBB4_1495
 28325  .LBB4_1496:
 28326  	cmp	rsi, 3
 28327  	jb	.LBB4_1655	# fewer than 4 elements were left at entry: nothing remains for the 4-way loop
 28328  .LBB4_1497:                             # =>This Inner Loop Header: Depth=1
 28329  	xor	eax, eax
 28330  	cmp	qword ptr [rcx + 8*rdx], 0
 28331  	setne	al
 28332  	mov	qword ptr [r8 + 8*rdx], rax
 28333  	xor	eax, eax
 28334  	cmp	qword ptr [rcx + 8*rdx + 8], 0
 28335  	setne	al
 28336  	mov	qword ptr [r8 + 8*rdx + 8], rax
 28337  	xor	eax, eax
 28338  	cmp	qword ptr [rcx + 8*rdx + 16], 0
 28339  	setne	al
 28340  	mov	qword ptr [r8 + 8*rdx + 16], rax
 28341  	xor	eax, eax
 28342  	cmp	qword ptr [rcx + 8*rdx + 24], 0
 28343  	setne	al
 28344  	mov	qword ptr [r8 + 8*rdx + 24], rax
 28345  	add	rdx, 4	# four elements per iteration
 28346  	cmp	r10, rdx
 28347  	jne	.LBB4_1497
 28348  	jmp	.LBB4_1655
 28349  .LBB4_384:	# Case entry: 8-byte input at [rcx], 4-byte output at [r8]; stores .LCPI4_5 for non-zero input, 0 otherwise
 28350  	test	r9d, r9d
 28351  	jle	.LBB4_1655	# count <= 0: nothing to do
 28352  # %bb.385:
 28353  	mov	edx, r9d	# rdx = zero-extended count
 28354  	lea	rsi, [rdx - 1]
 28355  	mov	eax, edx
 28356  	and	eax, 3	# rax = count mod 4 (elements handled by the loop below)
 28357  	cmp	rsi, 3
 28358  	jae	.LBB4_819	# count >= 4: continue at .LBB4_819 (outside this view)
 28359  # %bb.386:
 28360  	xor	esi, esi	# rsi = index of the first element still to process
 28361  .LBB4_387:
 28362  	test	rax, rax
 28363  	je	.LBB4_1655	# no leftover elements
 28364  # %bb.388:
 28365  	lea	rdx, [r8 + 4*rsi]	# rdx = output pointer rebased to the first leftover element
 28366  	lea	rcx, [rcx + 8*rsi]	# rcx = input pointer rebased likewise
 28367  	xor	esi, esi	# rsi restarts as an index relative to the rebased pointers
 28368  	movss	xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero; constant value is defined outside this view
 28369  	jmp	.LBB4_390
 28370  .LBB4_389:                              #   in Loop: Header=BB4_390 Depth=1
 28371  	movss	dword ptr [rdx + 4*rsi], xmm1	# store the selected value
 28372  	add	rsi, 1
 28373  	cmp	rax, rsi
 28374  	je	.LBB4_1655
 28375  .LBB4_390:                              # =>This Inner Loop Header: Depth=1
 28376  	cmp	qword ptr [rcx + 8*rsi], 0
 28377  	movapd	xmm1, xmm0	# default result = .LCPI4_5 (register move leaves the cmp flags intact)
 28378  	jne	.LBB4_389
 28379  # %bb.391:                              #   in Loop: Header=BB4_390 Depth=1
 28380  	xorpd	xmm1, xmm1	# input was zero: result = 0
 28381  	jmp	.LBB4_389
 28382  .LBB4_392:	# Case entry: r9d = element count; both continuations are outside this view
 28383  	test	r9d, r9d
 28384  	jle	.LBB4_1655	# count <= 0: nothing to do
 28385  # %bb.393:
 28386  	mov	eax, r9d	# rax = zero-extended count
 28387  	cmp	r9d, 4
 28388  	jae	.LBB4_829	# 4 or more elements: continue at .LBB4_829
 28389  # %bb.394:
 28390  	xor	edx, edx	# fewer than 4: rdx = 0, then continue at .LBB4_1059
 28391  	jmp	.LBB4_1059
 28392  .LBB4_395:	# Case entry: r9d = element count; both continuations are outside this view
 28393  	test	r9d, r9d
 28394  	jle	.LBB4_1655	# count <= 0: nothing to do
 28395  # %bb.396:
 28396  	mov	eax, r9d	# rax = zero-extended count
 28397  	cmp	r9d, 8
 28398  	jae	.LBB4_832	# 8 or more elements: continue at .LBB4_832
 28399  # %bb.397:
 28400  	xor	edx, edx	# fewer than 8: rdx = 0, then continue at .LBB4_1208
 28401  	jmp	.LBB4_1208
 28402  .LBB4_398:	# Case entry: r9d = element count; both continuations are outside this view
 28403  	test	r9d, r9d
 28404  	jle	.LBB4_1655	# count <= 0: nothing to do
 28405  # %bb.399:
 28406  	mov	r10d, r9d	# r10 = zero-extended count
 28407  	cmp	r9d, 4
 28408  	jae	.LBB4_835	# 4 or more elements: continue at .LBB4_835
 28409  # %bb.400:
 28410  	xor	edx, edx	# fewer than 4: rdx = 0, then continue at .LBB4_1216
 28411  	jmp	.LBB4_1216
 28412  .LBB4_401:	# Case entry: r9d = element count; both continuations are outside this view
 28413  	test	r9d, r9d
 28414  	jle	.LBB4_1655	# count <= 0: nothing to do
 28415  # %bb.402:
 28416  	mov	eax, r9d	# rax = zero-extended count
 28417  	cmp	r9d, 8
 28418  	jae	.LBB4_838	# 8 or more elements: continue at .LBB4_838
 28419  # %bb.403:
 28420  	xor	edx, edx	# fewer than 8: rdx = 0, then continue at .LBB4_1222
 28421  	jmp	.LBB4_1222
 28422  .LBB4_404:	# Case entry: signed 64-bit sign; qword [r8 + 8*i] = 1 if x > 0, -1 if x < 0, 0 if x == 0, with x = qword [rcx + 8*i]
 28423  	test	r9d, r9d
 28424  	jle	.LBB4_1655	# count <= 0: nothing to do
 28425  # %bb.405:
 28426  	mov	r11d, r9d	# r11 = zero-extended count
 28427  	cmp	r9d, 4
 28428  	jb	.LBB4_406	# fewer than 4 elements: skip the overlap checks
 28429  # %bb.841:
 28430  	lea	rdx, [rcx + 8*r11]	# rdx = one past the input range
 28431  	cmp	rdx, r8
 28432  	jbe	.LBB4_843	# input ends at/before output start: disjoint (.LBB4_843 is outside this view)
 28433  # %bb.842:
 28434  	lea	rdx, [r8 + 8*r11]	# rdx = one past the output range
 28435  	cmp	rdx, rcx
 28436  	jbe	.LBB4_843	# output ends at/before input start: disjoint
 28437  .LBB4_406:
 28438  	xor	edx, edx	# rdx = element index, starting at 0
 28439  .LBB4_1502:
 28440  	mov	rsi, rdx
 28441  	not	rsi	# rsi = -index - 1 (count is added at .LBB4_1504)
 28442  	test	r11b, 1
 28443  	je	.LBB4_1504	# even count: no single element to peel
 28444  # %bb.1503:
 28445  	mov	r9, qword ptr [rcx + 8*rdx]	# peel one element so the main loop can step by 2
 28446  	xor	r10d, r10d
 28447  	test	r9, r9
 28448  	setne	r10b
 28449  	neg	r10	# r10 = 0 if x == 0, else -1
 28450  	test	r9, r9	# re-test: neg overwrote the flags
 28451  	mov	edi, 1	# mov does not touch flags
 28452  	cmovle	rdi, r10	# x <= 0 (signed): take 0/-1; otherwise keep 1
 28453  	mov	qword ptr [r8 + 8*rdx], rdi
 28454  	or	rdx, 1	# advance index past the peeled element
 28455  .LBB4_1504:
 28456  	add	rsi, r11	# rsi = count - (index at .LBB4_1502) - 1
 28457  	je	.LBB4_1655	# zero: at most the peeled element was pending
 28458  # %bb.1505:
 28459  	mov	esi, 1	# rsi = constant 1 for the cmovg selects below
 28460  .LBB4_1506:                             # =>This Inner Loop Header: Depth=1
 28461  	mov	rdi, qword ptr [rcx + 8*rdx]	# 2x unrolled body
 28462  	xor	eax, eax
 28463  	test	rdi, rdi
 28464  	setne	al
 28465  	neg	rax	# rax = 0 if x == 0, else -1
 28466  	test	rdi, rdi
 28467  	cmovg	rax, rsi	# x > 0 (signed): result = 1
 28468  	mov	qword ptr [r8 + 8*rdx], rax
 28469  	mov	rax, qword ptr [rcx + 8*rdx + 8]
 28470  	xor	edi, edi
 28471  	test	rax, rax
 28472  	setne	dil
 28473  	neg	rdi
 28474  	test	rax, rax
 28475  	cmovg	rdi, rsi
 28476  	mov	qword ptr [r8 + 8*rdx + 8], rdi
 28477  	add	rdx, 2
 28478  	cmp	r11, rdx
 28479  	jne	.LBB4_1506	# loop until index == count
 28480  	jmp	.LBB4_1655
 28481  .LBB4_407:	# Case entry: r9d = element count, 8-byte input elements at [rcx]; most of this case lives outside this view
 28482  	test	r9d, r9d
 28483  	jle	.LBB4_1655	# count <= 0: nothing to do
 28484  # %bb.408:
 28485  	mov	edx, r9d	# rdx = zero-extended count
 28486  	cmp	r9d, 1
 28487  	jne	.LBB4_846	# more than one element: continue at .LBB4_846 (outside this view)
 28488  # %bb.409:
 28489  	xor	eax, eax	# rax = element index 0
 28490  .LBB4_410:
 28491  	test	dl, 1
 28492  	je	.LBB4_1655	# even count: no final single element
 28493  # %bb.411:
 28494  	cmp	qword ptr [rcx + 8*rax], 0
 28495  	jne	.LBB4_989	# non-zero input: handled at .LBB4_989 (outside this view)
 28496  # %bb.412:
 28497  	xorpd	xmm0, xmm0	# zero input: xmm0 = 0
 28498  	jmp	.LBB4_990
 28499  .LBB4_413:	# Case entry: r9d = element count; both continuations are outside this view
 28500  	test	r9d, r9d
 28501  	jle	.LBB4_1655	# count <= 0: nothing to do
 28502  # %bb.414:
 28503  	mov	edx, r9d	# rdx = zero-extended count
 28504  	cmp	r9d, 1
 28505  	jne	.LBB4_856	# more than one element: continue at .LBB4_856
 28506  # %bb.415:
 28507  	xor	eax, eax	# exactly one element: rax = 0, then continue at .LBB4_416
 28508  	jmp	.LBB4_416
 28509  .LBB4_419:	# Case entry: float sign; dword [r8 + 4*i] = 0.0 if x == 0.0, else -1.0 when x's sign bit is set, else 1.0
 28510  	test	r9d, r9d
 28511  	jle	.LBB4_1655	# count <= 0: nothing to do
 28512  # %bb.420:
 28513  	mov	eax, r9d	# rax = zero-extended count
 28514  	cmp	r9d, 8
 28515  	jb	.LBB4_421	# fewer than 8 elements: skip the overlap checks
 28516  # %bb.864:
 28517  	lea	rdx, [rcx + 4*rax]	# rdx = one past the input range
 28518  	cmp	rdx, r8
 28519  	jbe	.LBB4_866	# input ends at/before output start: disjoint (.LBB4_866 is outside this view)
 28520  # %bb.865:
 28521  	lea	rdx, [r8 + 4*rax]	# rdx = one past the output range
 28522  	cmp	rdx, rcx
 28523  	jbe	.LBB4_866	# output ends at/before input start: disjoint
 28524  .LBB4_421:
 28525  	xor	edx, edx	# rdx = element index, starting at 0
 28526  .LBB4_869:
 28527  	mov	rsi, rdx
 28528  	not	rsi	# rsi = -index - 1 (count is added at .LBB4_871)
 28529  	test	al, 1
 28530  	je	.LBB4_871	# even count: no single element to peel
 28531  # %bb.870:
 28532  	movss	xmm0, dword ptr [rcx + 4*rdx]   # xmm0 = mem[0],zero,zero,zero
 28533  	movmskps	edi, xmm0	# bit 0 of edi = sign bit of x
 28534  	and	edi, 1
 28535  	neg	edi	# edi = 0 or -1
 28536  	or	edi, 1	# edi = 1 or -1
 28537  	xorps	xmm1, xmm1	# zero xmm1 before the convert writes its low lane
 28538  	cvtsi2ss	xmm1, edi	# xmm1 = 1.0 or -1.0
 28539  	xorps	xmm2, xmm2
 28540  	cmpeqss	xmm2, xmm0	# xmm2 = all-ones mask if x == 0.0, else 0
 28541  	andnps	xmm2, xmm1	# xmm2 = ~mask & (+/-1.0): forces 0.0 when x == 0.0
 28542  	movss	dword ptr [r8 + 4*rdx], xmm2
 28543  	or	rdx, 1	# advance index past the peeled element
 28544  .LBB4_871:
 28545  	add	rsi, rax	# rsi = count - (index at .LBB4_869) - 1
 28546  	je	.LBB4_1655	# zero: at most the peeled element was pending
 28547  # %bb.872:
 28548  	xorps	xmm0, xmm0	# xmm0 = 0.0, the comparison operand for the loop
 28549  .LBB4_873:                              # =>This Inner Loop Header: Depth=1
 28550  	movss	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 28551  	movmskps	esi, xmm1	# 2x unrolled body: same sign computation as above
 28552  	and	esi, 1
 28553  	neg	esi
 28554  	or	esi, 1
 28555  	xorps	xmm2, xmm2
 28556  	cvtsi2ss	xmm2, esi
 28557  	cmpeqss	xmm1, xmm0	# xmm1 = mask of (x == 0.0)
 28558  	andnps	xmm1, xmm2	# xmm1 = ~mask & (+/-1.0)
 28559  	movss	dword ptr [r8 + 4*rdx], xmm1
 28560  	movss	xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero
 28561  	movmskps	esi, xmm1
 28562  	and	esi, 1
 28563  	neg	esi
 28564  	or	esi, 1
 28565  	xorps	xmm2, xmm2
 28566  	cvtsi2ss	xmm2, esi
 28567  	cmpeqss	xmm1, xmm0
 28568  	andnps	xmm1, xmm2
 28569  	movss	dword ptr [r8 + 4*rdx + 4], xmm1
 28570  	add	rdx, 2
 28571  	cmp	rax, rdx
 28572  	jne	.LBB4_873	# loop until index == count
 28573  	jmp	.LBB4_1655
 28574  .LBB4_422:	# Case entry: for r9d elements, qword [r8 + 8*i] = (byte [rcx + i] != 0) ? 1 : 0
 28575  	test	r9d, r9d
 28576  	jle	.LBB4_1655	# count <= 0: nothing to do
 28577  # %bb.423:
 28578  	mov	r10d, r9d	# r10 = zero-extended count
 28579  	cmp	r9d, 4
 28580  	jb	.LBB4_424	# fewer than 4 elements: skip the overlap checks
 28581  # %bb.874:
 28582  	lea	rdx, [rcx + r10]	# rdx = one past the 1-byte input range
 28583  	cmp	rdx, r8
 28584  	jbe	.LBB4_876	# input ends at/before output start: disjoint (.LBB4_876 is outside this view)
 28585  # %bb.875:
 28586  	lea	rdx, [r8 + 8*r10]	# rdx = one past the 8-byte output range
 28587  	cmp	rdx, rcx
 28588  	jbe	.LBB4_876	# output ends at/before input start: disjoint
 28589  .LBB4_424:
 28590  	xor	edx, edx	# rdx = element index, starting at 0
 28591  .LBB4_1511:
 28592  	mov	rsi, rdx
 28593  	not	rsi
 28594  	add	rsi, r10	# rsi = count - index - 1
 28595  	mov	rdi, r10
 28596  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 28597  	je	.LBB4_1513
 28598  .LBB4_1512:                             # =>This Inner Loop Header: Depth=1
 28599  	xor	eax, eax	# clear rax so setne yields a clean 0/1
 28600  	cmp	byte ptr [rcx + rdx], 0
 28601  	setne	al
 28602  	mov	qword ptr [r8 + 8*rdx], rax
 28603  	add	rdx, 1
 28604  	add	rdi, -1
 28605  	jne	.LBB4_1512
 28606  .LBB4_1513:
 28607  	cmp	rsi, 3
 28608  	jb	.LBB4_1655	# fewer than 4 elements were pending at .LBB4_1511: all done
 28609  .LBB4_1514:                             # =>This Inner Loop Header: Depth=1
 28610  	xor	eax, eax	# 4x unrolled body
 28611  	cmp	byte ptr [rcx + rdx], 0
 28612  	setne	al
 28613  	mov	qword ptr [r8 + 8*rdx], rax
 28614  	xor	eax, eax
 28615  	cmp	byte ptr [rcx + rdx + 1], 0
 28616  	setne	al
 28617  	mov	qword ptr [r8 + 8*rdx + 8], rax
 28618  	xor	eax, eax
 28619  	cmp	byte ptr [rcx + rdx + 2], 0
 28620  	setne	al
 28621  	mov	qword ptr [r8 + 8*rdx + 16], rax
 28622  	xor	eax, eax
 28623  	cmp	byte ptr [rcx + rdx + 3], 0
 28624  	setne	al
 28625  	mov	qword ptr [r8 + 8*rdx + 24], rax
 28626  	add	rdx, 4
 28627  	cmp	r10, rdx
 28628  	jne	.LBB4_1514	# loop until index == count
 28629  	jmp	.LBB4_1655
 28630  .LBB4_425:	# Case entry: 1-byte input at [rcx], 4-byte output at [r8]; stores .LCPI4_5 for non-zero input, 0 otherwise
 28631  	test	r9d, r9d
 28632  	jle	.LBB4_1655	# count <= 0: nothing to do
 28633  # %bb.426:
 28634  	mov	eax, r9d	# rax = zero-extended count
 28635  	cmp	r9d, 8
 28636  	jb	.LBB4_427	# fewer than 8 elements: skip the overlap checks
 28637  # %bb.879:
 28638  	lea	rdx, [rcx + rax]	# rdx = one past the 1-byte input range
 28639  	cmp	rdx, r8
 28640  	jbe	.LBB4_881	# input ends at/before output start: disjoint (.LBB4_881 is outside this view)
 28641  # %bb.880:
 28642  	lea	rdx, [r8 + 4*rax]	# rdx = one past the 4-byte output range
 28643  	cmp	rdx, rcx
 28644  	jbe	.LBB4_881	# output ends at/before input start: disjoint
 28645  .LBB4_427:
 28646  	xor	edx, edx	# rdx = element index, starting at 0
 28647  .LBB4_1519:
 28648  	mov	rsi, rdx
 28649  	not	rsi
 28650  	add	rsi, rax	# rsi = count - index - 1 (tested again at .LBB4_1524)
 28651  	mov	rdi, rax
 28652  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 28653  	je	.LBB4_1524
 28654  # %bb.1520:
 28655  	movd	xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero; constant value is defined outside this view
 28656  	jmp	.LBB4_1522
 28657  .LBB4_1521:                             #   in Loop: Header=BB4_1522 Depth=1
 28658  	movd	dword ptr [r8 + 4*rdx], xmm1	# store the selected value
 28659  	add	rdx, 1
 28660  	add	rdi, -1
 28661  	je	.LBB4_1524	# one-at-a-time loop finished: go to the 4x unrolled stage
 28662  .LBB4_1522:                             # =>This Inner Loop Header: Depth=1
 28663  	cmp	byte ptr [rcx + rdx], 0
 28664  	movdqa	xmm1, xmm0	# default result = .LCPI4_5 (register move leaves the cmp flags intact)
 28665  	jne	.LBB4_1521
 28666  # %bb.1523:                             #   in Loop: Header=BB4_1522 Depth=1
 28667  	pxor	xmm1, xmm1	# input was zero: result = 0
 28668  	jmp	.LBB4_1521
 28669  .LBB4_428:	# Case entry: r9d = element count; both continuations are outside this view
 28670  	test	r9d, r9d
 28671  	jle	.LBB4_1655	# count <= 0: nothing to do
 28672  # %bb.429:
 28673  	mov	r10d, r9d	# r10 = zero-extended count
 28674  	cmp	r9d, 4
 28675  	jae	.LBB4_884	# 4 or more elements: continue at .LBB4_884
 28676  # %bb.430:
 28677  	xor	edx, edx	# fewer than 4: rdx = 0, then continue at .LBB4_1064
 28678  	jmp	.LBB4_1064
 28679  .LBB4_431:	# Case entry: r9d = element count; both continuations are outside this view
 28680  	test	r9d, r9d
 28681  	jle	.LBB4_1655	# count <= 0: nothing to do
 28682  # %bb.432:
 28683  	mov	eax, r9d	# rax = zero-extended count
 28684  	cmp	r9d, 8
 28685  	jae	.LBB4_887	# 8 or more elements: continue at .LBB4_887
 28686  # %bb.433:
 28687  	xor	edx, edx	# fewer than 8: rdx = 0, then continue at .LBB4_1070
 28688  	jmp	.LBB4_1070
 28689  .LBB4_434:	# Case entry: for r9d elements, byte [r8 + i] = (dword [rcx + 4*i] != 0) ? 1 : 0
 28690  	test	r9d, r9d
 28691  	jle	.LBB4_1655	# count <= 0: nothing to do
 28692  # %bb.435:
 28693  	mov	eax, r9d	# rax = zero-extended count
 28694  	cmp	r9d, 8
 28695  	jb	.LBB4_436	# fewer than 8 elements: skip the overlap checks
 28696  # %bb.890:
 28697  	lea	rdx, [rcx + 4*rax]	# rdx = one past the 4-byte input range
 28698  	cmp	rdx, r8
 28699  	jbe	.LBB4_892	# input ends at/before output start: disjoint (.LBB4_892 is outside this view)
 28700  # %bb.891:
 28701  	lea	rdx, [r8 + rax]	# rdx = one past the 1-byte output range
 28702  	cmp	rdx, rcx
 28703  	jbe	.LBB4_892	# output ends at/before input start: disjoint
 28704  .LBB4_436:
 28705  	xor	edx, edx	# rdx = element index, starting at 0
 28706  .LBB4_1539:
 28707  	mov	rsi, rdx
 28708  	not	rsi
 28709  	add	rsi, rax	# rsi = count - index - 1
 28710  	mov	rdi, rax
 28711  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 28712  	je	.LBB4_1541
 28713  .LBB4_1540:                             # =>This Inner Loop Header: Depth=1
 28714  	cmp	dword ptr [rcx + 4*rdx], 0
 28715  	setne	byte ptr [r8 + rdx]	# write the 0/1 result straight to memory
 28716  	add	rdx, 1
 28717  	add	rdi, -1
 28718  	jne	.LBB4_1540
 28719  .LBB4_1541:
 28720  	cmp	rsi, 3
 28721  	jb	.LBB4_1655	# fewer than 4 elements were pending at .LBB4_1539: all done
 28722  .LBB4_1542:                             # =>This Inner Loop Header: Depth=1
 28723  	cmp	dword ptr [rcx + 4*rdx], 0	# 4x unrolled body
 28724  	setne	byte ptr [r8 + rdx]
 28725  	cmp	dword ptr [rcx + 4*rdx + 4], 0
 28726  	setne	byte ptr [r8 + rdx + 1]
 28727  	cmp	dword ptr [rcx + 4*rdx + 8], 0
 28728  	setne	byte ptr [r8 + rdx + 2]
 28729  	cmp	dword ptr [rcx + 4*rdx + 12], 0
 28730  	setne	byte ptr [r8 + rdx + 3]
 28731  	add	rdx, 4
 28732  	cmp	rax, rdx
 28733  	jne	.LBB4_1542	# loop until index == count
 28734  	jmp	.LBB4_1655
 28735  .LBB4_437:	# Case entry: double input at [rcx], 1-byte output at [r8]; byte = 0 when x compares equal/unordered to 0.0, else trunc(.LCPI4_2 with x's sign)
 28736  	test	r9d, r9d
 28737  	jle	.LBB4_1655	# count <= 0: nothing to do
 28738  # %bb.438:
 28739  	mov	eax, r9d	# rax = zero-extended count
 28740  	cmp	r9d, 4
 28741  	jb	.LBB4_439	# fewer than 4 elements: skip the overlap checks
 28742  # %bb.895:
 28743  	lea	rdx, [rcx + 8*rax]	# rdx = one past the 8-byte input range
 28744  	cmp	rdx, r8
 28745  	jbe	.LBB4_897	# input ends at/before output start: disjoint (.LBB4_897 is outside this view)
 28746  # %bb.896:
 28747  	lea	rdx, [r8 + rax]	# rdx = one past the 1-byte output range
 28748  	cmp	rdx, rcx
 28749  	jbe	.LBB4_897	# output ends at/before input start: disjoint
 28750  .LBB4_439:
 28751  	xor	edx, edx	# rdx = element index, starting at 0
 28752  .LBB4_1547:
 28753  	mov	rsi, rdx
 28754  	not	rsi	# rsi = -index - 1 (count is added at .LBB4_1549)
 28755  	test	al, 1
 28756  	je	.LBB4_1549	# even count: no single element to peel
 28757  # %bb.1548:
 28758  	movsd	xmm0, qword ptr [rcx + 8*rdx]   # xmm0 = mem[0],zero
 28759  	xor	r9d, r9d	# r9d = 0, the result selected by cmove below
 28760  	pxor	xmm1, xmm1
 28761  	ucomisd	xmm1, xmm0	# compare 0.0 with x; the SSE ops below leave these flags intact
 28762  	andpd	xmm0, xmmword ptr [rip + .LCPI4_0]	# keep only x's sign bit (.LCPI4_0 is annotated as -0.0 below)
 28763  	movsd	xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero; presumably 1.0 - value is defined outside this view, TODO confirm
 28764  	orpd	xmm1, xmm0	# xmm1 = .LCPI4_2 combined with x's sign bit
 28765  	cvttsd2si	edi, xmm1	# truncate to integer
 28766  	cmove	edi, r9d	# ZF from ucomisd (equal or unordered): result = 0
 28767  	mov	byte ptr [r8 + rdx], dil
 28768  	or	rdx, 1	# advance index past the peeled element
 28769  .LBB4_1549:
 28770  	add	rsi, rax	# rsi = count - (index at .LBB4_1547) - 1
 28771  	je	.LBB4_1655	# zero: at most the peeled element was pending
 28772  # %bb.1550:
 28773  	xor	esi, esi	# esi = 0, the result selected by cmove in the loop
 28774  	xorpd	xmm0, xmm0	# xmm0 = 0.0 comparison operand
 28775  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 28776  	movsd	xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero
 28777  .LBB4_1551:                             # =>This Inner Loop Header: Depth=1
 28778  	movsd	xmm3, qword ptr [rcx + 8*rdx]   # xmm3 = mem[0],zero
 28779  	ucomisd	xmm0, xmm3	# 2x unrolled body: same computation as the peeled element
 28780  	andpd	xmm3, xmm1	# isolate the sign bit
 28781  	orpd	xmm3, xmm2	# merge with .LCPI4_2
 28782  	cvttsd2si	edi, xmm3
 28783  	cmove	edi, esi	# equal or unordered to 0.0: result = 0
 28784  	mov	byte ptr [r8 + rdx], dil
 28785  	movsd	xmm3, qword ptr [rcx + 8*rdx + 8] # xmm3 = mem[0],zero
 28786  	ucomisd	xmm0, xmm3
 28787  	andpd	xmm3, xmm1
 28788  	orpd	xmm3, xmm2
 28789  	cvttsd2si	edi, xmm3
 28790  	cmove	edi, esi
 28791  	mov	byte ptr [r8 + rdx + 1], dil
 28792  	add	rdx, 2
 28793  	cmp	rax, rdx
 28794  	jne	.LBB4_1551	# loop until index == count
 28795  	jmp	.LBB4_1655
 28796  .LBB4_440:	# Case entry: signed 8-bit sign; byte [r8 + i] = 1 if x > 0, -1 if x < 0, 0 if x == 0, with x = byte [rcx + i]
 28797  	test	r9d, r9d
 28798  	jle	.LBB4_1655	# count <= 0: nothing to do
 28799  # %bb.441:
 28800  	mov	r10d, r9d	# r10 = zero-extended count
 28801  	cmp	r9d, 32
 28802  	jb	.LBB4_442	# fewer than 32 elements: skip the overlap checks
 28803  # %bb.900:
 28804  	lea	rdx, [rcx + r10]	# rdx = one past the input range
 28805  	cmp	rdx, r8
 28806  	jbe	.LBB4_902	# input ends at/before output start: disjoint (.LBB4_902 is outside this view)
 28807  # %bb.901:
 28808  	lea	rdx, [r8 + r10]	# rdx = one past the output range
 28809  	cmp	rdx, rcx
 28810  	jbe	.LBB4_902	# output ends at/before input start: disjoint
 28811  .LBB4_442:
 28812  	xor	esi, esi	# rsi = element index, starting at 0
 28813  .LBB4_1556:
 28814  	mov	rax, rsi
 28815  	not	rax	# rax = -index - 1 (count is added at .LBB4_1558)
 28816  	test	r10b, 1
 28817  	je	.LBB4_1558	# even count: no single element to peel
 28818  # %bb.1557:
 28819  	mov	dil, byte ptr [rcx + rsi]	# peel one element so the main loop can step by 2
 28820  	test	dil, dil
 28821  	setne	r9b
 28822  	neg	r9b	# r9b = 0 if x == 0, else 0xFF (-1)
 28823  	test	dil, dil	# re-test: neg overwrote the flags
 28824  	movzx	r9d, r9b	# movzx/mov do not touch flags
 28825  	mov	edi, 1
 28826  	cmovle	edi, r9d	# x <= 0 (signed): take 0/-1; otherwise keep 1
 28827  	mov	byte ptr [r8 + rsi], dil
 28828  	or	rsi, 1	# advance index past the peeled element
 28829  .LBB4_1558:
 28830  	add	rax, r10	# rax = count - (index at .LBB4_1556) - 1
 28831  	je	.LBB4_1655	# zero: at most the peeled element was pending
 28832  # %bb.1559:
 28833  	mov	edi, 1	# edi = constant 1 for the cmovg selects below
 28834  .LBB4_1560:                             # =>This Inner Loop Header: Depth=1
 28835  	movzx	eax, byte ptr [rcx + rsi]	# 2x unrolled body
 28836  	test	al, al
 28837  	setne	dl
 28838  	neg	dl	# dl = 0 if x == 0, else -1
 28839  	test	al, al
 28840  	movzx	eax, dl
 28841  	cmovg	eax, edi	# x > 0 (signed): result = 1
 28842  	mov	byte ptr [r8 + rsi], al
 28843  	movzx	eax, byte ptr [rcx + rsi + 1]
 28844  	test	al, al
 28845  	setne	dl
 28846  	neg	dl
 28847  	test	al, al
 28848  	movzx	eax, dl
 28849  	cmovg	eax, edi
 28850  	mov	byte ptr [r8 + rsi + 1], al
 28851  	add	rsi, 2
 28852  	cmp	r10, rsi
 28853  	jne	.LBB4_1560	# loop until index == count
 28854  	jmp	.LBB4_1655
 28855  .LBB4_443:	# Case entry: for r9d elements, byte [r8 + i] = (qword [rcx + 8*i] != 0) ? 1 : 0
 28856  	test	r9d, r9d
 28857  	jle	.LBB4_1655	# count <= 0: nothing to do
 28858  # %bb.444:
 28859  	mov	eax, r9d	# rax = zero-extended count
 28860  	cmp	r9d, 4
 28861  	jb	.LBB4_445	# fewer than 4 elements: skip the overlap checks
 28862  # %bb.905:
 28863  	lea	rdx, [rcx + 8*rax]	# rdx = one past the 8-byte input range
 28864  	cmp	rdx, r8
 28865  	jbe	.LBB4_907	# input ends at/before output start: disjoint (.LBB4_907 is outside this view)
 28866  # %bb.906:
 28867  	lea	rdx, [r8 + rax]	# rdx = one past the 1-byte output range
 28868  	cmp	rdx, rcx
 28869  	jbe	.LBB4_907	# output ends at/before input start: disjoint
 28870  .LBB4_445:
 28871  	xor	edx, edx	# rdx = element index, starting at 0
 28872  .LBB4_1565:
 28873  	mov	rsi, rdx
 28874  	not	rsi
 28875  	add	rsi, rax	# rsi = count - index - 1
 28876  	mov	rdi, rax
 28877  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 28878  	je	.LBB4_1567
 28879  .LBB4_1566:                             # =>This Inner Loop Header: Depth=1
 28880  	cmp	qword ptr [rcx + 8*rdx], 0
 28881  	setne	byte ptr [r8 + rdx]	# write the 0/1 result straight to memory
 28882  	add	rdx, 1
 28883  	add	rdi, -1
 28884  	jne	.LBB4_1566
 28885  .LBB4_1567:
 28886  	cmp	rsi, 3
 28887  	jb	.LBB4_1655	# fewer than 4 elements were pending at .LBB4_1565: all done
 28888  .LBB4_1568:                             # =>This Inner Loop Header: Depth=1
 28889  	cmp	qword ptr [rcx + 8*rdx], 0	# 4x unrolled body
 28890  	setne	byte ptr [r8 + rdx]
 28891  	cmp	qword ptr [rcx + 8*rdx + 8], 0
 28892  	setne	byte ptr [r8 + rdx + 1]
 28893  	cmp	qword ptr [rcx + 8*rdx + 16], 0
 28894  	setne	byte ptr [r8 + rdx + 2]
 28895  	cmp	qword ptr [rcx + 8*rdx + 24], 0
 28896  	setne	byte ptr [r8 + rdx + 3]
 28897  	add	rdx, 4
 28898  	cmp	rax, rdx
 28899  	jne	.LBB4_1568	# loop until index == count
 28900  	jmp	.LBB4_1655
 28901  .LBB4_446:	# Case entry: for r9d elements, byte [r8 + i] = (word [rcx + 2*i] != 0) ? 1 : 0
 28902  	test	r9d, r9d
 28903  	jle	.LBB4_1655	# count <= 0: nothing to do
 28904  # %bb.447:
 28905  	mov	eax, r9d	# rax = zero-extended count
 28906  	cmp	r9d, 16
 28907  	jb	.LBB4_448	# fewer than 16 elements: skip the overlap checks
 28908  # %bb.910:
 28909  	lea	rdx, [rcx + 2*rax]	# rdx = one past the 2-byte input range
 28910  	cmp	rdx, r8
 28911  	jbe	.LBB4_912	# input ends at/before output start: disjoint (.LBB4_912 is outside this view)
 28912  # %bb.911:
 28913  	lea	rdx, [r8 + rax]	# rdx = one past the 1-byte output range
 28914  	cmp	rdx, rcx
 28915  	jbe	.LBB4_912	# output ends at/before input start: disjoint
 28916  .LBB4_448:
 28917  	xor	edx, edx	# rdx = element index, starting at 0
 28918  .LBB4_1573:
 28919  	mov	rsi, rdx
 28920  	not	rsi
 28921  	add	rsi, rax	# rsi = count - index - 1
 28922  	mov	rdi, rax
 28923  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 28924  	je	.LBB4_1575
 28925  .LBB4_1574:                             # =>This Inner Loop Header: Depth=1
 28926  	cmp	word ptr [rcx + 2*rdx], 0
 28927  	setne	byte ptr [r8 + rdx]	# write the 0/1 result straight to memory
 28928  	add	rdx, 1
 28929  	add	rdi, -1
 28930  	jne	.LBB4_1574
 28931  .LBB4_1575:
 28932  	cmp	rsi, 3
 28933  	jb	.LBB4_1655	# fewer than 4 elements were pending at .LBB4_1573: all done
 28934  .LBB4_1576:                             # =>This Inner Loop Header: Depth=1
 28935  	cmp	word ptr [rcx + 2*rdx], 0	# 4x unrolled body
 28936  	setne	byte ptr [r8 + rdx]
 28937  	cmp	word ptr [rcx + 2*rdx + 2], 0
 28938  	setne	byte ptr [r8 + rdx + 1]
 28939  	cmp	word ptr [rcx + 2*rdx + 4], 0
 28940  	setne	byte ptr [r8 + rdx + 2]
 28941  	cmp	word ptr [rcx + 2*rdx + 6], 0
 28942  	setne	byte ptr [r8 + rdx + 3]
 28943  	add	rdx, 4
 28944  	cmp	rax, rdx
 28945  	jne	.LBB4_1576	# loop until index == count
 28946  	jmp	.LBB4_1655
 28947  .LBB4_449:	# Case entry: signed 16-bit input, 1-byte output; byte [r8 + i] = 1 if x > 0, -1 if x < 0, 0 if x == 0, with x = word [rcx + 2*i]
 28948  	test	r9d, r9d
 28949  	jle	.LBB4_1655	# count <= 0: nothing to do
 28950  # %bb.450:
 28951  	mov	r10d, r9d	# r10 = zero-extended count
 28952  	cmp	r9d, 16
 28953  	jb	.LBB4_451	# fewer than 16 elements: skip the overlap checks
 28954  # %bb.915:
 28955  	lea	rdx, [rcx + 2*r10]	# rdx = one past the 2-byte input range
 28956  	cmp	rdx, r8
 28957  	jbe	.LBB4_917	# input ends at/before output start: disjoint (.LBB4_917 is outside this view)
 28958  # %bb.916:
 28959  	lea	rdx, [r8 + r10]	# rdx = one past the 1-byte output range
 28960  	cmp	rdx, rcx
 28961  	jbe	.LBB4_917	# output ends at/before input start: disjoint
 28962  .LBB4_451:
 28963  	xor	esi, esi	# rsi = element index, starting at 0
 28964  .LBB4_1581:
 28965  	mov	rax, rsi
 28966  	not	rax	# rax = -index - 1 (count is added at .LBB4_1583)
 28967  	test	r10b, 1
 28968  	je	.LBB4_1583	# even count: no single element to peel
 28969  # %bb.1582:
 28970  	movzx	edi, word ptr [rcx + 2*rsi]	# peel one element so the main loop can step by 2
 28971  	test	di, di
 28972  	setne	r9b
 28973  	neg	r9b	# r9b = 0 if x == 0, else 0xFF (-1)
 28974  	test	di, di	# re-test: neg overwrote the flags
 28975  	movzx	r9d, r9b
 28976  	mov	edi, 1
 28977  	cmovle	edi, r9d	# x <= 0 (signed): take 0/-1; otherwise keep 1
 28978  	mov	byte ptr [r8 + rsi], dil
 28979  	or	rsi, 1	# advance index past the peeled element
 28980  .LBB4_1583:
 28981  	add	rax, r10	# rax = count - (index at .LBB4_1581) - 1
 28982  	je	.LBB4_1655	# zero: at most the peeled element was pending
 28983  # %bb.1584:
 28984  	mov	r9d, 1	# r9d = constant 1 for the cmovg selects below
 28985  .LBB4_1585:                             # =>This Inner Loop Header: Depth=1
 28986  	movzx	edi, word ptr [rcx + 2*rsi]	# 2x unrolled body
 28987  	test	di, di
 28988  	setne	al
 28989  	neg	al	# al = 0 if x == 0, else -1
 28990  	test	di, di
 28991  	movzx	eax, al
 28992  	cmovg	eax, r9d	# x > 0 (signed): result = 1
 28993  	mov	byte ptr [r8 + rsi], al
 28994  	movzx	eax, word ptr [rcx + 2*rsi + 2]
 28995  	test	ax, ax
 28996  	setne	dl
 28997  	neg	dl
 28998  	test	ax, ax
 28999  	movzx	eax, dl
 29000  	cmovg	eax, r9d
 29001  	mov	byte ptr [r8 + rsi + 1], al
 29002  	add	rsi, 2
 29003  	cmp	r10, rsi
 29004  	jne	.LBB4_1585	# loop until index == count
 29005  	jmp	.LBB4_1655
 29006  .LBB4_452:	# Case entry: signed 64-bit input, 1-byte output; byte [r8 + i] = 1 if x > 0, -1 if x < 0, 0 if x == 0, with x = qword [rcx + 8*i]
 29007  	test	r9d, r9d
 29008  	jle	.LBB4_1655	# count <= 0: nothing to do
 29009  # %bb.453:
 29010  	mov	r10d, r9d	# r10 = zero-extended count
 29011  	cmp	r9d, 4
 29012  	jb	.LBB4_454	# fewer than 4 elements: skip the overlap checks
 29013  # %bb.920:
 29014  	lea	rdx, [rcx + 8*r10]	# rdx = one past the 8-byte input range
 29015  	cmp	rdx, r8
 29016  	jbe	.LBB4_922	# input ends at/before output start: disjoint (.LBB4_922 is outside this view)
 29017  # %bb.921:
 29018  	lea	rdx, [r8 + r10]	# rdx = one past the 1-byte output range
 29019  	cmp	rdx, rcx
 29020  	jbe	.LBB4_922	# output ends at/before input start: disjoint
 29021  .LBB4_454:
 29022  	xor	esi, esi	# rsi = element index, starting at 0
 29023  .LBB4_1590:
 29024  	mov	rdx, rsi
 29025  	not	rdx	# rdx = -index - 1 (count is added at .LBB4_1592)
 29026  	test	r10b, 1
 29027  	je	.LBB4_1592	# even count: no single element to peel
 29028  # %bb.1591:
 29029  	mov	rdi, qword ptr [rcx + 8*rsi]	# peel one element so the main loop can step by 2
 29030  	test	rdi, rdi
 29031  	setne	al
 29032  	neg	al	# al = 0 if x == 0, else 0xFF (-1)
 29033  	test	rdi, rdi	# re-test: neg overwrote the flags
 29034  	movzx	eax, al
 29035  	mov	edi, 1
 29036  	cmovle	edi, eax	# x <= 0 (signed): take 0/-1; otherwise keep 1
 29037  	mov	byte ptr [r8 + rsi], dil
 29038  	or	rsi, 1	# advance index past the peeled element
 29039  .LBB4_1592:
 29040  	add	rdx, r10	# rdx = count - (index at .LBB4_1590) - 1
 29041  	je	.LBB4_1655	# zero: at most the peeled element was pending
 29042  # %bb.1593:
 29043  	mov	edi, 1	# edi = constant 1 for the cmovg selects below
 29044  .LBB4_1594:                             # =>This Inner Loop Header: Depth=1
 29045  	mov	rax, qword ptr [rcx + 8*rsi]	# 2x unrolled body
 29046  	test	rax, rax
 29047  	setne	dl
 29048  	neg	dl	# dl = 0 if x == 0, else -1
 29049  	test	rax, rax
 29050  	movzx	eax, dl
 29051  	cmovg	eax, edi	# x > 0 (signed): result = 1
 29052  	mov	byte ptr [r8 + rsi], al
 29053  	mov	rax, qword ptr [rcx + 8*rsi + 8]
 29054  	test	rax, rax
 29055  	setne	dl
 29056  	neg	dl
 29057  	test	rax, rax
 29058  	movzx	eax, dl
 29059  	cmovg	eax, edi
 29060  	mov	byte ptr [r8 + rsi + 1], al
 29061  	add	rsi, 2
 29062  	cmp	r10, rsi
 29063  	jne	.LBB4_1594	# loop until index == count
 29064  	jmp	.LBB4_1655
 29065  .LBB4_455:	# Case entry: float input at [rcx], 1-byte output at [r8]; byte = 0 when x compares equal/unordered to 0.0, else -1 if sign bit set, else 1
 29066  	test	r9d, r9d
 29067  	jle	.LBB4_1655	# count <= 0: nothing to do
 29068  # %bb.456:
 29069  	mov	r10d, r9d	# r10 = zero-extended count
 29070  	cmp	r9d, 8
 29071  	jb	.LBB4_457	# fewer than 8 elements: skip the overlap checks
 29072  # %bb.925:
 29073  	lea	rdx, [rcx + 4*r10]	# rdx = one past the 4-byte input range
 29074  	cmp	rdx, r8
 29075  	jbe	.LBB4_927	# input ends at/before output start: disjoint (.LBB4_927 is outside this view)
 29076  # %bb.926:
 29077  	lea	rdx, [r8 + r10]	# rdx = one past the 1-byte output range
 29078  	cmp	rdx, rcx
 29079  	jbe	.LBB4_927	# output ends at/before input start: disjoint
 29080  .LBB4_457:
 29081  	xor	edx, edx	# rdx = element index, starting at 0
 29082  .LBB4_1599:
 29083  	mov	rsi, rdx
 29084  	not	rsi	# rsi = -index - 1 (count is added at .LBB4_1601)
 29085  	test	r10b, 1
 29086  	je	.LBB4_1601	# even count: no single element to peel
 29087  # %bb.1600:
 29088  	movd	xmm0, dword ptr [rcx + 4*rdx]   # xmm0 = mem[0],zero,zero,zero
 29089  	movd	edi, xmm0	# edi = raw bits of x
 29090  	test	edi, edi
 29091  	setns	al	# al = 1 if sign bit clear, else 0
 29092  	add	al, al
 29093  	add	al, -1	# al = 1 (sign clear) or -1 (sign set)
 29094  	xor	edi, edi	# edi = 0, the result selected by cmove below
 29095  	pxor	xmm1, xmm1
 29096  	ucomiss	xmm1, xmm0	# compare 0.0 with x
 29097  	movzx	eax, al	# movzx leaves the ucomiss flags intact
 29098  	cmove	eax, edi	# equal or unordered to 0.0: result = 0
 29099  	mov	byte ptr [r8 + rdx], al
 29100  	or	rdx, 1	# advance index past the peeled element
 29101  .LBB4_1601:
 29102  	add	rsi, r10	# rsi = count - (index at .LBB4_1599) - 1
 29103  	je	.LBB4_1655	# zero: at most the peeled element was pending
 29104  # %bb.1602:
 29105  	xor	esi, esi	# esi = 0, the result selected by cmove in the loop
 29106  	xorps	xmm0, xmm0	# xmm0 = 0.0 comparison operand
 29107  .LBB4_1603:                             # =>This Inner Loop Header: Depth=1
 29108  	movd	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 29109  	movd	eax, xmm1	# 2x unrolled body: same computation as the peeled element
 29110  	test	eax, eax
 29111  	setns	al
 29112  	add	al, al
 29113  	add	al, -1	# al = 1 or -1 by sign bit
 29114  	ucomiss	xmm0, xmm1
 29115  	movzx	eax, al
 29116  	cmove	eax, esi	# equal or unordered to 0.0: result = 0
 29117  	mov	byte ptr [r8 + rdx], al
 29118  	movd	xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero
 29119  	movd	eax, xmm1
 29120  	test	eax, eax
 29121  	setns	al
 29122  	add	al, al
 29123  	add	al, -1
 29124  	ucomiss	xmm0, xmm1
 29125  	movzx	eax, al
 29126  	cmove	eax, esi
 29127  	mov	byte ptr [r8 + rdx + 1], al
 29128  	add	rdx, 2
 29129  	cmp	r10, rdx
 29130  	jne	.LBB4_1603	# loop until index == count
 29131  	jmp	.LBB4_1655
 29132  .LBB4_458:	# Case entry: for r9d elements, byte [r8 + i] = (byte [rcx + i] != 0) ? 1 : 0
 29133  	test	r9d, r9d
 29134  	jle	.LBB4_1655	# count <= 0: nothing to do
 29135  # %bb.459:
 29136  	mov	eax, r9d	# rax = zero-extended count
 29137  	cmp	r9d, 32
 29138  	jb	.LBB4_460	# fewer than 32 elements: skip the overlap checks
 29139  # %bb.930:
 29140  	lea	rdx, [rcx + rax]	# rdx = one past the input range
 29141  	cmp	rdx, r8
 29142  	jbe	.LBB4_932	# input ends at/before output start: disjoint (.LBB4_932 is outside this view)
 29143  # %bb.931:
 29144  	lea	rdx, [r8 + rax]	# rdx = one past the output range
 29145  	cmp	rdx, rcx
 29146  	jbe	.LBB4_932	# output ends at/before input start: disjoint
 29147  .LBB4_460:
 29148  	xor	edx, edx	# rdx = element index, starting at 0
 29149  .LBB4_1608:
 29150  	mov	rsi, rdx
 29151  	not	rsi
 29152  	add	rsi, rax	# rsi = count - index - 1
 29153  	mov	rdi, rax
 29154  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 29155  	je	.LBB4_1610
 29156  .LBB4_1609:                             # =>This Inner Loop Header: Depth=1
 29157  	cmp	byte ptr [rcx + rdx], 0
 29158  	setne	byte ptr [r8 + rdx]	# write the 0/1 result straight to memory
 29159  	add	rdx, 1
 29160  	add	rdi, -1
 29161  	jne	.LBB4_1609
 29162  .LBB4_1610:
 29163  	cmp	rsi, 3
 29164  	jb	.LBB4_1655	# fewer than 4 elements were pending at .LBB4_1608: all done
 29165  .LBB4_1611:                             # =>This Inner Loop Header: Depth=1
 29166  	cmp	byte ptr [rcx + rdx], 0	# 4x unrolled body
 29167  	setne	byte ptr [r8 + rdx]
 29168  	cmp	byte ptr [rcx + rdx + 1], 0
 29169  	setne	byte ptr [r8 + rdx + 1]
 29170  	cmp	byte ptr [rcx + rdx + 2], 0
 29171  	setne	byte ptr [r8 + rdx + 2]
 29172  	cmp	byte ptr [rcx + rdx + 3], 0
 29173  	setne	byte ptr [r8 + rdx + 3]
 29174  	add	rdx, 4
 29175  	cmp	rax, rdx
 29176  	jne	.LBB4_1611	# loop until index == count
 29177  	jmp	.LBB4_1655
 29178  .LBB4_461:	# Case entry: signed 32-bit input, 1-byte output; byte [r8 + i] = 1 if x > 0, -1 if x < 0, 0 if x == 0, with x = dword [rcx + 4*i]
 29179  	test	r9d, r9d
 29180  	jle	.LBB4_1655	# count <= 0: nothing to do
 29181  # %bb.462:
 29182  	mov	r10d, r9d	# r10 = zero-extended count
 29183  	cmp	r9d, 8
 29184  	jb	.LBB4_463	# fewer than 8 elements: skip the overlap checks
 29185  # %bb.935:
 29186  	lea	rdx, [rcx + 4*r10]	# rdx = one past the 4-byte input range
 29187  	cmp	rdx, r8
 29188  	jbe	.LBB4_937	# input ends at/before output start: disjoint (.LBB4_937 is outside this view)
 29189  # %bb.936:
 29190  	lea	rdx, [r8 + r10]	# rdx = one past the 1-byte output range
 29191  	cmp	rdx, rcx
 29192  	jbe	.LBB4_937	# output ends at/before input start: disjoint
 29193  .LBB4_463:
 29194  	xor	esi, esi	# rsi = element index, starting at 0
 29195  .LBB4_1616:
 29196  	mov	rax, rsi
 29197  	not	rax	# rax = -index - 1 (count is added at .LBB4_1618)
 29198  	test	r10b, 1
 29199  	je	.LBB4_1618	# even count: no single element to peel
 29200  # %bb.1617:
 29201  	mov	edi, dword ptr [rcx + 4*rsi]	# peel one element so the main loop can step by 2
 29202  	test	edi, edi
 29203  	setne	r9b
 29204  	neg	r9b	# r9b = 0 if x == 0, else 0xFF (-1)
 29205  	test	edi, edi	# re-test: neg overwrote the flags
 29206  	movzx	r9d, r9b
 29207  	mov	edi, 1
 29208  	cmovle	edi, r9d	# x <= 0 (signed): take 0/-1; otherwise keep 1
 29209  	mov	byte ptr [r8 + rsi], dil
 29210  	or	rsi, 1	# advance index past the peeled element
 29211  .LBB4_1618:
 29212  	add	rax, r10	# rax = count - (index at .LBB4_1616) - 1
 29213  	je	.LBB4_1655	# zero: at most the peeled element was pending
 29214  # %bb.1619:
 29215  	mov	r9d, 1	# r9d = constant 1 for the cmovg selects below
 29216  .LBB4_1620:                             # =>This Inner Loop Header: Depth=1
 29217  	mov	edi, dword ptr [rcx + 4*rsi]	# 2x unrolled body
 29218  	test	edi, edi
 29219  	setne	al
 29220  	neg	al	# al = 0 if x == 0, else -1
 29221  	test	edi, edi
 29222  	movzx	eax, al
 29223  	cmovg	eax, r9d	# x > 0 (signed): result = 1
 29224  	mov	byte ptr [r8 + rsi], al
 29225  	mov	eax, dword ptr [rcx + 4*rsi + 4]
 29226  	test	eax, eax
 29227  	setne	dl
 29228  	neg	dl
 29229  	test	eax, eax
 29230  	movzx	eax, dl
 29231  	cmovg	eax, r9d
 29232  	mov	byte ptr [r8 + rsi + 1], al
 29233  	add	rsi, 2
 29234  	cmp	r10, rsi
 29235  	jne	.LBB4_1620	# loop until index == count
 29236  	jmp	.LBB4_1655
 29237  .LBB4_464:	# Case entry: for r9d elements, dword [r8 + 4*i] = (dword [rcx + 4*i] != 0) ? 1 : 0
 29238  	test	r9d, r9d
 29239  	jle	.LBB4_1655	# count <= 0: nothing to do
 29240  # %bb.465:
 29241  	mov	r10d, r9d	# r10 = zero-extended count
 29242  	cmp	r9d, 8
 29243  	jb	.LBB4_466	# fewer than 8 elements: skip the overlap checks
 29244  # %bb.940:
 29245  	lea	rdx, [rcx + 4*r10]	# rdx = one past the input range
 29246  	cmp	rdx, r8
 29247  	jbe	.LBB4_942	# input ends at/before output start: disjoint (.LBB4_942 is outside this view)
 29248  # %bb.941:
 29249  	lea	rdx, [r8 + 4*r10]	# rdx = one past the output range
 29250  	cmp	rdx, rcx
 29251  	jbe	.LBB4_942	# output ends at/before input start: disjoint
 29252  .LBB4_466:
 29253  	xor	edx, edx	# rdx = element index, starting at 0
 29254  .LBB4_1625:
 29255  	mov	rsi, rdx
 29256  	not	rsi
 29257  	add	rsi, r10	# rsi = count - index - 1
 29258  	mov	rdi, r10
 29259  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 29260  	je	.LBB4_1627
 29261  .LBB4_1626:                             # =>This Inner Loop Header: Depth=1
 29262  	xor	eax, eax	# clear eax so setne yields a clean 0/1
 29263  	cmp	dword ptr [rcx + 4*rdx], 0
 29264  	setne	al
 29265  	mov	dword ptr [r8 + 4*rdx], eax
 29266  	add	rdx, 1
 29267  	add	rdi, -1
 29268  	jne	.LBB4_1626
 29269  .LBB4_1627:
 29270  	cmp	rsi, 3
 29271  	jb	.LBB4_1655	# fewer than 4 elements were pending at .LBB4_1625: all done
 29272  .LBB4_1628:                             # =>This Inner Loop Header: Depth=1
 29273  	xor	eax, eax	# 4x unrolled body
 29274  	cmp	dword ptr [rcx + 4*rdx], 0
 29275  	setne	al
 29276  	mov	dword ptr [r8 + 4*rdx], eax
 29277  	xor	eax, eax
 29278  	cmp	dword ptr [rcx + 4*rdx + 4], 0
 29279  	setne	al
 29280  	mov	dword ptr [r8 + 4*rdx + 4], eax
 29281  	xor	eax, eax
 29282  	cmp	dword ptr [rcx + 4*rdx + 8], 0
 29283  	setne	al
 29284  	mov	dword ptr [r8 + 4*rdx + 8], eax
 29285  	xor	eax, eax
 29286  	cmp	dword ptr [rcx + 4*rdx + 12], 0
 29287  	setne	al
 29288  	mov	dword ptr [r8 + 4*rdx + 12], eax
 29289  	add	rdx, 4
 29290  	cmp	r10, rdx
 29291  	jne	.LBB4_1628	# loop until index == count
 29292  	jmp	.LBB4_1655
 29293  .LBB4_467:	# Case entry: r9d = element count; both continuations are outside this view
 29294  	test	r9d, r9d
 29295  	jle	.LBB4_1655	# count <= 0: nothing to do
 29296  # %bb.468:
 29297  	mov	eax, r9d	# rax = zero-extended count
 29298  	xor	r10d, r10d	# r10 = 0 (used by the continuations; done before cmp so the branch sees cmp's flags)
 29299  	cmp	r9d, 4
 29300  	jae	.LBB4_945	# 4 or more elements: continue at .LBB4_945
 29301  # %bb.469:
 29302  	xor	esi, esi	# fewer than 4: rsi = 0, then continue at .LBB4_1080
 29303  	jmp	.LBB4_1080
 29304  .LBB4_470:	# Case entry: signed 8-bit input, 4-byte output; dword [r8 + 4*i] = 1 if x > 0, -1 if x < 0, 0 if x == 0, with x = byte [rcx + i]
 29305  	test	r9d, r9d
 29306  	jle	.LBB4_1655	# count <= 0: nothing to do
 29307  # %bb.471:
 29308  	mov	r10d, r9d	# r10 = zero-extended count
 29309  	cmp	r9d, 8
 29310  	jb	.LBB4_472	# fewer than 8 elements: skip the overlap checks
 29311  # %bb.948:
 29312  	lea	rdx, [rcx + r10]	# rdx = one past the 1-byte input range
 29313  	cmp	rdx, r8
 29314  	jbe	.LBB4_950	# input ends at/before output start: disjoint (.LBB4_950 is outside this view)
 29315  # %bb.949:
 29316  	lea	rdx, [r8 + 4*r10]	# rdx = one past the 4-byte output range
 29317  	cmp	rdx, rcx
 29318  	jbe	.LBB4_950	# output ends at/before input start: disjoint
 29319  .LBB4_472:
 29320  	xor	edx, edx	# rdx = element index, starting at 0
 29321  .LBB4_1633:
 29322  	mov	rsi, rdx
 29323  	not	rsi	# rsi = -index - 1 (count is added at .LBB4_1635)
 29324  	test	r10b, 1
 29325  	je	.LBB4_1635	# even count: no single element to peel
 29326  # %bb.1634:
 29327  	mov	r9b, byte ptr [rcx + rdx]	# peel one element so the main loop can step by 2
 29328  	xor	edi, edi
 29329  	test	r9b, r9b
 29330  	setne	dil
 29331  	neg	edi	# edi = 0 if x == 0, else -1
 29332  	test	r9b, r9b	# re-test: neg overwrote the flags
 29333  	mov	eax, 1
 29334  	cmovle	eax, edi	# x <= 0 (signed): take 0/-1; otherwise keep 1
 29335  	mov	dword ptr [r8 + 4*rdx], eax
 29336  	or	rdx, 1	# advance index past the peeled element
 29337  .LBB4_1635:
 29338  	add	rsi, r10	# rsi = count - (index at .LBB4_1633) - 1
 29339  	je	.LBB4_1655	# zero: at most the peeled element was pending
 29340  # %bb.1636:
 29341  	mov	esi, 1	# esi = constant 1 for the cmovg selects below
 29342  .LBB4_1637:                             # =>This Inner Loop Header: Depth=1
 29343  	movzx	eax, byte ptr [rcx + rdx]	# 2x unrolled body
 29344  	xor	edi, edi
 29345  	test	al, al
 29346  	setne	dil
 29347  	neg	edi	# edi = 0 if x == 0, else -1
 29348  	test	al, al
 29349  	cmovg	edi, esi	# x > 0 (signed): result = 1
 29350  	mov	dword ptr [r8 + 4*rdx], edi
 29351  	movzx	eax, byte ptr [rcx + rdx + 1]
 29352  	xor	edi, edi
 29353  	test	al, al
 29354  	setne	dil
 29355  	neg	edi
 29356  	test	al, al
 29357  	cmovg	edi, esi
 29358  	mov	dword ptr [r8 + 4*rdx + 4], edi
 29359  	add	rdx, 2
 29360  	cmp	r10, rdx
 29361  	jne	.LBB4_1637	# loop until index == count
 29362  	jmp	.LBB4_1655
 29363  .LBB4_473:	# Case entry: r9d = element count; both continuations are outside this view
 29364  	test	r9d, r9d
 29365  	jle	.LBB4_1655	# count <= 0: nothing to do
 29366  # %bb.474:
 29367  	mov	eax, r9d	# rax = zero-extended count
 29368  	cmp	r9d, 4
 29369  	jae	.LBB4_953	# 4 or more elements: continue at .LBB4_953
 29370  # %bb.475:
 29371  	xor	edx, edx	# fewer than 4: rdx = 0, then continue at .LBB4_1086
 29372  	jmp	.LBB4_1086
 29373  .LBB4_476:	# Case entry: r9d = element count; both continuations are outside this view
 29374  	test	r9d, r9d
 29375  	jle	.LBB4_1655	# count <= 0: nothing to do
 29376  # %bb.477:
 29377  	mov	eax, r9d	# rax = zero-extended count
 29378  	cmp	r9d, 8
 29379  	jae	.LBB4_956	# 8 or more elements: continue at .LBB4_956
 29380  # %bb.478:
 29381  	xor	edx, edx	# fewer than 8: rdx = 0, then continue at .LBB4_1091
 29382  	jmp	.LBB4_1091
 29383  .LBB4_479:	# Case entry: r9d = element count; both continuations are outside this view
 29384  	test	r9d, r9d
 29385  	jle	.LBB4_1655	# count <= 0: nothing to do
 29386  # %bb.480:
 29387  	mov	r10d, r9d	# r10 = zero-extended count
 29388  	cmp	r9d, 8
 29389  	jae	.LBB4_959	# 8 or more elements: continue at .LBB4_959
 29390  # %bb.481:
 29391  	xor	edx, edx	# fewer than 8: rdx = 0, then continue at .LBB4_1096
 29392  	jmp	.LBB4_1096
 29393  .LBB4_482:	# Case entry: r9d = element count; both continuations are outside this view
 29394  	test	r9d, r9d
 29395  	jle	.LBB4_1655	# count <= 0: nothing to do
 29396  # %bb.483:
 29397  	mov	r10d, r9d	# r10 = zero-extended count
 29398  	cmp	r9d, 4
 29399  	jae	.LBB4_962	# 4 or more elements: continue at .LBB4_962
 29400  # %bb.484:
 29401  	xor	edx, edx	# fewer than 4: rdx = 0, then continue at .LBB4_1102
 29402  	jmp	.LBB4_1102
 29403  .LBB4_485:	# Case entry: r9d = element count; both continuations are outside this view
 29404  	test	r9d, r9d
 29405  	jle	.LBB4_1655	# count <= 0: nothing to do
 29406  # %bb.486:
 29407  	mov	eax, r9d	# rax = zero-extended count
 29408  	cmp	r9d, 8
 29409  	jae	.LBB4_965	# 8 or more elements: continue at .LBB4_965
 29410  # %bb.487:
 29411  	xor	edx, edx	# fewer than 8: rdx = 0, then continue at .LBB4_968
 29412  	jmp	.LBB4_968
 29413  .LBB4_488:	# Case entry: for r9d elements, dword [r8 + 4*i] = (byte [rcx + i] != 0) ? 1 : 0
 29414  	test	r9d, r9d
 29415  	jle	.LBB4_1655	# count <= 0: nothing to do
 29416  # %bb.489:
 29417  	mov	r10d, r9d	# r10 = zero-extended count
 29418  	cmp	r9d, 8
 29419  	jb	.LBB4_490	# fewer than 8 elements: skip the overlap checks
 29420  # %bb.972:
 29421  	lea	rdx, [rcx + r10]	# rdx = one past the 1-byte input range
 29422  	cmp	rdx, r8
 29423  	jbe	.LBB4_974	# input ends at/before output start: disjoint (.LBB4_974 is outside this view)
 29424  # %bb.973:
 29425  	lea	rdx, [r8 + 4*r10]	# rdx = one past the 4-byte output range
 29426  	cmp	rdx, rcx
 29427  	jbe	.LBB4_974	# output ends at/before input start: disjoint
 29428  .LBB4_490:
 29429  	xor	edx, edx	# rdx = element index, starting at 0
 29430  .LBB4_1642:
 29431  	mov	rsi, rdx
 29432  	not	rsi
 29433  	add	rsi, r10	# rsi = count - index - 1
 29434  	mov	rdi, r10
 29435  	and	rdi, 3	# rdi = count mod 4 = iterations of the one-at-a-time loop
 29436  	je	.LBB4_1644
 29437  .LBB4_1643:                             # =>This Inner Loop Header: Depth=1
 29438  	xor	eax, eax	# clear eax so setne yields a clean 0/1
 29439  	cmp	byte ptr [rcx + rdx], 0
 29440  	setne	al
 29441  	mov	dword ptr [r8 + 4*rdx], eax
 29442  	add	rdx, 1
 29443  	add	rdi, -1
 29444  	jne	.LBB4_1643
 29445  .LBB4_1644:
 29446  	cmp	rsi, 3
 29447  	jb	.LBB4_1655	# fewer than 4 elements were pending at .LBB4_1642: all done
 29448  .LBB4_1645:                             # =>This Inner Loop Header: Depth=1
 29449  	xor	eax, eax	# 4x unrolled body
 29450  	cmp	byte ptr [rcx + rdx], 0
 29451  	setne	al
 29452  	mov	dword ptr [r8 + 4*rdx], eax
 29453  	xor	eax, eax
 29454  	cmp	byte ptr [rcx + rdx + 1], 0
 29455  	setne	al
 29456  	mov	dword ptr [r8 + 4*rdx + 4], eax
 29457  	xor	eax, eax
 29458  	cmp	byte ptr [rcx + rdx + 2], 0
 29459  	setne	al
 29460  	mov	dword ptr [r8 + 4*rdx + 8], eax
 29461  	xor	eax, eax
 29462  	cmp	byte ptr [rcx + rdx + 3], 0
 29463  	setne	al
 29464  	mov	dword ptr [r8 + 4*rdx + 12], eax
 29465  	add	rdx, 4
 29466  	cmp	r10, rdx
 29467  	jne	.LBB4_1645	# loop until index == count
 29468  	jmp	.LBB4_1655
 29469  .LBB4_491:	# Case entry: signed 32-bit sign; dword [r8 + 4*i] = 1 if x > 0, -1 if x < 0, 0 if x == 0, with x = dword [rcx + 4*i]
 29470  	test	r9d, r9d
 29471  	jle	.LBB4_1655	# count <= 0: nothing to do
 29472  # %bb.492:
 29473  	mov	r11d, r9d	# r11 = zero-extended count
 29474  	cmp	r9d, 8
 29475  	jb	.LBB4_493	# fewer than 8 elements: skip the overlap checks
 29476  # %bb.977:
 29477  	lea	rdx, [rcx + 4*r11]	# rdx = one past the input range
 29478  	cmp	rdx, r8
 29479  	jbe	.LBB4_979	# input ends at/before output start: disjoint (.LBB4_979 is outside this view)
 29480  # %bb.978:
 29481  	lea	rdx, [r8 + 4*r11]	# rdx = one past the output range
 29482  	cmp	rdx, rcx
 29483  	jbe	.LBB4_979	# output ends at/before input start: disjoint
 29484  .LBB4_493:
 29485  	xor	edx, edx	# rdx = element index, starting at 0
 29486  .LBB4_1650:
 29487  	mov	rsi, rdx
 29488  	not	rsi	# rsi = -index - 1 (count is added at .LBB4_1652)
 29489  	test	r11b, 1
 29490  	je	.LBB4_1652	# even count: no single element to peel
 29491  # %bb.1651:
 29492  	mov	r9d, dword ptr [rcx + 4*rdx]	# peel one element so the main loop can step by 2
 29493  	xor	r10d, r10d
 29494  	test	r9d, r9d
 29495  	setne	r10b
 29496  	neg	r10d	# r10d = 0 if x == 0, else -1
 29497  	test	r9d, r9d	# re-test: neg overwrote the flags
 29498  	mov	edi, 1
 29499  	cmovle	edi, r10d	# x <= 0 (signed): take 0/-1; otherwise keep 1
 29500  	mov	dword ptr [r8 + 4*rdx], edi
 29501  	or	rdx, 1	# advance index past the peeled element
 29502  .LBB4_1652:
 29503  	add	rsi, r11	# rsi = count - (index at .LBB4_1650) - 1
 29504  	je	.LBB4_1655	# zero: at most the peeled element was pending
 29505  # %bb.1653:
 29506  	mov	esi, 1	# esi = constant 1 for the cmovg selects below
 29507  .LBB4_1654:                             # =>This Inner Loop Header: Depth=1
 29508  	mov	edi, dword ptr [rcx + 4*rdx]	# 2x unrolled body
 29509  	xor	eax, eax
 29510  	test	edi, edi
 29511  	setne	al
 29512  	neg	eax	# eax = 0 if x == 0, else -1
 29513  	test	edi, edi
 29514  	cmovg	eax, esi	# x > 0 (signed): result = 1
 29515  	mov	dword ptr [r8 + 4*rdx], eax
 29516  	mov	eax, dword ptr [rcx + 4*rdx + 4]
 29517  	xor	edi, edi
 29518  	test	eax, eax
 29519  	setne	dil
 29520  	neg	edi
 29521  	test	eax, eax
 29522  	cmovg	edi, esi
 29523  	mov	dword ptr [r8 + 4*rdx + 4], edi
 29524  	add	rdx, 2
 29525  	cmp	r11, rdx
 29526  	jne	.LBB4_1654	# loop until index == count
 29527  	jmp	.LBB4_1655
 29528  .LBB4_1524:	# 4x unrolled stage reached from .LBB4_1519/.LBB4_1521: dword [r8 + 4*i] = .LCPI4_5 if byte [rcx + i] != 0, else 0
 29529  	cmp	rsi, 3	# rsi = count - index - 1 as computed at .LBB4_1519
 29530  	jb	.LBB4_1655	# fewer than 4 elements were pending there: all done
 29531  # %bb.1525:
 29532  	movd	xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero; constant value is defined outside this view
 29533  	jmp	.LBB4_1527
 29534  .LBB4_1526:                             #   in Loop: Header=BB4_1527 Depth=1
 29535  	movd	dword ptr [r8 + 4*rdx + 12], xmm1	# store the 4th result, then advance by 4
 29536  	add	rdx, 4
 29537  	cmp	rax, rdx	# rax = element count (set at .LBB4_425)
 29538  	je	.LBB4_1655
 29539  .LBB4_1527:                             # =>This Inner Loop Header: Depth=1
 29540  	cmp	byte ptr [rcx + rdx], 0
 29541  	movdqa	xmm1, xmm0	# default result = .LCPI4_5 (register move leaves the cmp flags intact)
 29542  	jne	.LBB4_1528	# element 0 non-zero: store the constant at .LBB4_1528
 29543  # %bb.1531:                             #   in Loop: Header=BB4_1527 Depth=1
 29544  	pxor	xmm1, xmm1	# element 0 is zero: store 0
 29545  	movd	dword ptr [r8 + 4*rdx], xmm1
 29546  	cmp	byte ptr [rcx + rdx + 1], 0
 29547  	movdqa	xmm1, xmm0
 29548  	je	.LBB4_1532	# element 1 is zero: store 0 at .LBB4_1532
 29549  .LBB4_1529:                             #   in Loop: Header=BB4_1527 Depth=1
 29550  	movd	dword ptr [r8 + 4*rdx + 4], xmm1	# element 1 non-zero: store the constant
 29551  	cmp	byte ptr [rcx + rdx + 2], 0
 29552  	movdqa	xmm1, xmm0
 29553  	jne	.LBB4_1530	# element 2 non-zero: store the constant at .LBB4_1530
 29554  .LBB4_1533:                             #   in Loop: Header=BB4_1527 Depth=1
 29555  	pxor	xmm1, xmm1	# element 2 is zero: store 0
 29556  	movd	dword ptr [r8 + 4*rdx + 8], xmm1
 29557  	cmp	byte ptr [rcx + rdx + 3], 0
 29558  	movdqa	xmm1, xmm0
 29559  	jne	.LBB4_1526	# element 3 non-zero: xmm1 already holds the constant
 29560  	jmp	.LBB4_1534
 29561  .LBB4_1528:                             #   in Loop: Header=BB4_1527 Depth=1
 29562  	movd	dword ptr [r8 + 4*rdx], xmm1	# element 0 non-zero: store the constant
 29563  	cmp	byte ptr [rcx + rdx + 1], 0
 29564  	movdqa	xmm1, xmm0
 29565  	jne	.LBB4_1529
 29566  .LBB4_1532:                             #   in Loop: Header=BB4_1527 Depth=1
 29567  	pxor	xmm1, xmm1	# element 1 is zero: store 0
 29568  	movd	dword ptr [r8 + 4*rdx + 4], xmm1
 29569  	cmp	byte ptr [rcx + rdx + 2], 0
 29570  	movdqa	xmm1, xmm0
 29571  	je	.LBB4_1533
 29572  .LBB4_1530:                             #   in Loop: Header=BB4_1527 Depth=1
 29573  	movd	dword ptr [r8 + 4*rdx + 8], xmm1	# element 2 non-zero: store the constant
 29574  	cmp	byte ptr [rcx + rdx + 3], 0
 29575  	movdqa	xmm1, xmm0
 29576  	jne	.LBB4_1526
 29577  .LBB4_1534:                             #   in Loop: Header=BB4_1527 Depth=1
 29578  	pxor	xmm1, xmm1	# element 3 is zero: result = 0, stored at .LBB4_1526
 29579  	jmp	.LBB4_1526
 29580  .LBB4_499:                              # Vector path: sign of doubles at [rcx] stored as int32 (-1/0/1) at [r8], 4 per group, 2 groups per iteration
 29581  	mov	esi, r11d
 29582  	and	esi, -4                         # esi = r11d rounded down to a multiple of 4
 29583  	lea	rdx, [rsi - 4]
 29584  	mov	r9, rdx
 29585  	shr	r9, 2
 29586  	add	r9, 1                           # r9 = (rsi - 4)/4 + 1 = number of 4-element groups
 29587  	test	rdx, rdx
 29588  	je	.LBB4_1106                      # exactly one group: skip the 2x-unrolled loop
 29589  # %bb.500:
 29590  	mov	rdx, r9
 29591  	and	rdx, -2
 29592  	neg	rdx                             # rdx = -(r9 & ~1); counts up by 2 to zero
 29593  	xor	edi, edi                        # rdi = element index
 29594  	xorpd	xmm0, xmm0                      # xmm0 = 0.0 for the equality test
 29595  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 29596  	movapd	xmm2, xmmword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0]
 29597  .LBB4_501:                              # =>This Inner Loop Header: Depth=1
 29598  	movupd	xmm5, xmmword ptr [rcx + 8*rdi]
 29599  	movupd	xmm6, xmmword ptr [rcx + 8*rdi + 16]
 29600  	movapd	xmm3, xmm5
 29601  	cmpeqpd	xmm3, xmm0                      # all-ones in lanes where x == 0.0
 29602  	shufps	xmm3, xmm3, 232                 # xmm3 = xmm3[0,2,2,3]
 29603  	movapd	xmm4, xmm6
 29604  	cmpeqpd	xmm4, xmm0
 29605  	andpd	xmm5, xmm1                      # keep only the sign bit
 29606  	orpd	xmm5, xmm2                      # sign bit | 1.0 = +1.0 or -1.0
 29607  	andpd	xmm6, xmm1
 29608  	orpd	xmm6, xmm2
 29609  	pshufd	xmm7, xmm5, 238                 # xmm7 = xmm5[2,3,2,3]
 29610  	cvttsd2si	rax, xmm7               # high lane -> integer +1/-1
 29611  	cvttsd2si	rbx, xmm5               # low lane -> integer +1/-1 (rbx used as scratch)
 29612  	movd	xmm5, ebx
 29613  	pinsrd	xmm5, eax, 1                    # pack both results into dwords 0 and 1
 29614  	pshufd	xmm7, xmm6, 238                 # xmm7 = xmm6[2,3,2,3]
 29615  	cvttsd2si	rax, xmm7
 29616  	cvttsd2si	rbx, xmm6
 29617  	shufps	xmm4, xmm4, 232                 # xmm4 = xmm4[0,2,2,3]
 29618  	movd	xmm6, ebx
 29619  	pinsrd	xmm6, eax, 1
 29620  	andnps	xmm3, xmm5                      # clear lanes whose input compared equal to 0.0
 29621  	andnps	xmm4, xmm6
 29622  	movlhps	xmm3, xmm4                      # xmm3 = xmm3[0],xmm4[0]
 29623  	movups	xmmword ptr [r8 + 4*rdi], xmm3  # four int32 results
 29624  	movupd	xmm5, xmmword ptr [rcx + 8*rdi + 32] # second group of four, same steps
 29625  	movupd	xmm6, xmmword ptr [rcx + 8*rdi + 48]
 29626  	movapd	xmm3, xmm5
 29627  	cmpeqpd	xmm3, xmm0
 29628  	shufps	xmm3, xmm3, 232                 # xmm3 = xmm3[0,2,2,3]
 29629  	movapd	xmm4, xmm6
 29630  	cmpeqpd	xmm4, xmm0
 29631  	shufps	xmm4, xmm4, 232                 # xmm4 = xmm4[0,2,2,3]
 29632  	andpd	xmm5, xmm1
 29633  	orpd	xmm5, xmm2
 29634  	andpd	xmm6, xmm1
 29635  	pshufd	xmm7, xmm5, 238                 # xmm7 = xmm5[2,3,2,3]
 29636  	cvttsd2si	rax, xmm7
 29637  	orpd	xmm6, xmm2
 29638  	cvttsd2si	rbx, xmm5
 29639  	movd	xmm5, ebx
 29640  	pinsrd	xmm5, eax, 1
 29641  	andnps	xmm3, xmm5
 29642  	pshufd	xmm5, xmm6, 238                 # xmm5 = xmm6[2,3,2,3]
 29643  	cvttsd2si	rax, xmm5
 29644  	cvttsd2si	rbx, xmm6
 29645  	movd	xmm5, ebx
 29646  	pinsrd	xmm5, eax, 1
 29647  	andnps	xmm4, xmm5
 29648  	movlhps	xmm3, xmm4                      # xmm3 = xmm3[0],xmm4[0]
 29649  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm3
 29650  	add	rdi, 8                          # 8 elements per iteration
 29651  	add	rdx, 2                          # two groups done; ZF set when the counter reaches 0
 29652  	jne	.LBB4_501
 29653  	jmp	.LBB4_1107
 29654  .LBB4_507:                              # Vector path: dword at [r8 + 4*i] = (qword at [rcx + 8*i] != 0) ? 1 : 0, 4 per group, 2 groups per iteration
 29655  	mov	edx, eax                        # eax is set before this block (presumably the element count)
 29656  	and	edx, -4                         # round down to a multiple of 4
 29657  	lea	rsi, [rdx - 4]
 29658  	mov	r9, rsi
 29659  	shr	r9, 2
 29660  	add	r9, 1                           # r9 = number of 4-element groups
 29661  	test	rsi, rsi
 29662  	je	.LBB4_994                       # exactly one group: skip the 2x-unrolled loop
 29663  # %bb.508:
 29664  	mov	rdi, r9
 29665  	and	rdi, -2
 29666  	neg	rdi                             # rdi = -(r9 & ~1); counts up by 2 to zero
 29667  	xor	esi, esi                        # rsi = element index
 29668  	pxor	xmm0, xmm0                      # zero for the equality test
 29669  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_16] # xmm1 = <1,1,u,u>
 29670  .LBB4_509:                              # =>This Inner Loop Header: Depth=1
 29671  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi]
 29672  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 16]
 29673  	pcmpeqq	xmm2, xmm0                      # all-ones where the qword is 0
 29674  	pshufd	xmm2, xmm2, 232                 # xmm2 = xmm2[0,2,2,3]
 29675  	pandn	xmm2, xmm1                      # ~mask & 1: 1 where nonzero, else 0
 29676  	pcmpeqq	xmm3, xmm0
 29677  	pshufd	xmm3, xmm3, 232                 # xmm3 = xmm3[0,2,2,3]
 29678  	pandn	xmm3, xmm1
 29679  	punpcklqdq	xmm2, xmm3              # xmm2 = xmm2[0],xmm3[0]
 29680  	movdqu	xmmword ptr [r8 + 4*rsi], xmm2  # four dword results
 29681  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi + 32] # second group of four, same steps
 29682  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 48]
 29683  	pcmpeqq	xmm2, xmm0
 29684  	pshufd	xmm2, xmm2, 232                 # xmm2 = xmm2[0,2,2,3]
 29685  	pandn	xmm2, xmm1
 29686  	pcmpeqq	xmm3, xmm0
 29687  	pshufd	xmm3, xmm3, 232                 # xmm3 = xmm3[0,2,2,3]
 29688  	pandn	xmm3, xmm1
 29689  	punpcklqdq	xmm2, xmm3              # xmm2 = xmm2[0],xmm3[0]
 29690  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm2
 29691  	add	rsi, 8
 29692  	add	rdi, 2                          # ZF set when the counter reaches 0
 29693  	jne	.LBB4_509
 29694  	jmp	.LBB4_995
 29695  .LBB4_510:                              # Vector path: dword at [r8 + 4*i] = (word at [rcx + 2*i] != 0) ? 1 : 0, 8 per group, 2 groups per iteration
 29696  	mov	edx, eax                        # eax is set before this block (presumably the element count)
 29697  	and	edx, -8                         # round down to a multiple of 8
 29698  	lea	rsi, [rdx - 8]
 29699  	mov	r9, rsi
 29700  	shr	r9, 3
 29701  	add	r9, 1                           # r9 = number of 8-element groups
 29702  	test	rsi, rsi
 29703  	je	.LBB4_1112                      # exactly one group: skip the 2x-unrolled loop
 29704  # %bb.511:
 29705  	mov	rdi, r9
 29706  	and	rdi, -2
 29707  	neg	rdi                             # rdi = -(r9 & ~1); counts up by 2 to zero
 29708  	xor	esi, esi                        # rsi = element index
 29709  	pxor	xmm0, xmm0                      # zero for the equality test
 29710  	pcmpeqd	xmm1, xmm1                      # xmm1 = all-ones, used to invert masks
 29711  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1]
 29712  .LBB4_512:                              # =>This Inner Loop Header: Depth=1
 29713  	movq	xmm3, qword ptr [rcx + 2*rsi]   # xmm3 = mem[0],zero
 29714  	movq	xmm4, qword ptr [rcx + 2*rsi + 8] # xmm4 = mem[0],zero
 29715  	pcmpeqw	xmm3, xmm0                      # all-ones where the word is 0
 29716  	pxor	xmm3, xmm1                      # invert: all-ones where nonzero
 29717  	pmovzxwd	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 29718  	pand	xmm3, xmm2                      # reduce to 1 or 0 per dword
 29719  	pcmpeqw	xmm4, xmm0
 29720  	pxor	xmm4, xmm1
 29721  	pmovzxwd	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 29722  	pand	xmm4, xmm2
 29723  	movdqu	xmmword ptr [r8 + 4*rsi], xmm3
 29724  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm4
 29725  	movq	xmm3, qword ptr [rcx + 2*rsi + 16] # xmm3 = mem[0],zero
 29726  	movq	xmm4, qword ptr [rcx + 2*rsi + 24] # xmm4 = mem[0],zero
 29727  	pcmpeqw	xmm3, xmm0
 29728  	pxor	xmm3, xmm1
 29729  	pmovzxwd	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 29730  	pand	xmm3, xmm2
 29731  	pcmpeqw	xmm4, xmm0
 29732  	pxor	xmm4, xmm1
 29733  	pmovzxwd	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 29734  	pand	xmm4, xmm2
 29735  	movdqu	xmmword ptr [r8 + 4*rsi + 32], xmm3
 29736  	movdqu	xmmword ptr [r8 + 4*rsi + 48], xmm4
 29737  	add	rsi, 16                         # 16 elements per iteration
 29738  	add	rdi, 2                          # ZF set when the counter reaches 0
 29739  	jne	.LBB4_512
 29740  	jmp	.LBB4_1113
 29741  .LBB4_513:                              # Vector path: sign (-1/0/1) of signed words at [rcx] stored as dwords at [r8], 8 per group, 2 groups per iteration
 29742  	mov	edx, r10d                       # r10d is set before this block (presumably the element count)
 29743  	and	edx, -8                         # round down to a multiple of 8
 29744  	lea	rsi, [rdx - 8]
 29745  	mov	r9, rsi
 29746  	shr	r9, 3
 29747  	add	r9, 1                           # r9 = number of 8-element groups
 29748  	test	rsi, rsi
 29749  	je	.LBB4_1117                      # exactly one group: skip the 2x-unrolled loop
 29750  # %bb.514:
 29751  	mov	rdi, r9
 29752  	and	rdi, -2
 29753  	neg	rdi                             # rdi = -(r9 & ~1); counts up by 2 to zero
 29754  	xor	esi, esi                        # rsi = element index
 29755  	pxor	xmm2, xmm2                      # zero for the compares
 29756  	pcmpeqd	xmm3, xmm3                      # all-ones, used to invert masks
 29757  	movaps	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 29758  .LBB4_515:                              # =>This Inner Loop Header: Depth=1
 29759  	movq	xmm5, qword ptr [rcx + 2*rsi]   # xmm5 = mem[0],zero
 29760  	movq	xmm6, qword ptr [rcx + 2*rsi + 8] # xmm6 = mem[0],zero
 29761  	movdqa	xmm0, xmm5
 29762  	pcmpgtw	xmm0, xmm2                      # signed x > 0 mask
 29763  	pmovsxwd	xmm0, xmm0                      # widen mask to dwords (blendvps selector)
 29764  	movdqa	xmm1, xmm6
 29765  	pcmpgtw	xmm1, xmm2
 29766  	pmovsxwd	xmm1, xmm1
 29767  	pcmpeqw	xmm5, xmm2                      # x == 0 mask
 29768  	pxor	xmm5, xmm3                      # invert: all-ones (= -1) where x != 0
 29769  	pmovsxwd	xmm5, xmm5                      # dwords of -1 (nonzero) or 0 (zero)
 29770  	pcmpeqw	xmm6, xmm2
 29771  	pxor	xmm6, xmm3
 29772  	pmovsxwd	xmm6, xmm6
 29773  	blendvps	xmm5, xmm4, xmm0        # where x > 0 take 1, else keep -1/0
 29774  	movdqa	xmm0, xmm1                      # blendvps reads its selector from xmm0
 29775  	blendvps	xmm6, xmm4, xmm0
 29776  	movups	xmmword ptr [r8 + 4*rsi], xmm5
 29777  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm6
 29778  	movq	xmm5, qword ptr [rcx + 2*rsi + 16] # xmm5 = mem[0],zero
 29779  	movq	xmm6, qword ptr [rcx + 2*rsi + 24] # xmm6 = mem[0],zero
 29780  	movdqa	xmm0, xmm5
 29781  	pcmpgtw	xmm0, xmm2
 29782  	pmovsxwd	xmm0, xmm0
 29783  	movdqa	xmm1, xmm6
 29784  	pcmpgtw	xmm1, xmm2
 29785  	pmovsxwd	xmm1, xmm1
 29786  	pcmpeqw	xmm5, xmm2
 29787  	pxor	xmm5, xmm3
 29788  	pmovsxwd	xmm5, xmm5
 29789  	pcmpeqw	xmm6, xmm2
 29790  	pxor	xmm6, xmm3
 29791  	pmovsxwd	xmm6, xmm6
 29792  	blendvps	xmm5, xmm4, xmm0
 29793  	movdqa	xmm0, xmm1
 29794  	blendvps	xmm6, xmm4, xmm0
 29795  	movups	xmmword ptr [r8 + 4*rsi + 32], xmm5
 29796  	movups	xmmword ptr [r8 + 4*rsi + 48], xmm6
 29797  	add	rsi, 16                         # 16 elements per iteration
 29798  	add	rdi, 2                          # ZF set when the counter reaches 0
 29799  	jne	.LBB4_515
 29800  	jmp	.LBB4_1118
 29801  .LBB4_516:                              # Vector path: sign (-1/0/1) of signed qwords at [rcx] stored as dwords at [r8], 4 per group, 2 groups per iteration
 29802  	mov	edx, r10d                       # r10d is set before this block (presumably the element count)
 29803  	and	edx, -4                         # round down to a multiple of 4
 29804  	lea	rsi, [rdx - 4]
 29805  	mov	r9, rsi
 29806  	shr	r9, 2
 29807  	add	r9, 1                           # r9 = number of 4-element groups
 29808  	test	rsi, rsi
 29809  	je	.LBB4_1123                      # exactly one group: skip the 2x-unrolled loop
 29810  # %bb.517:
 29811  	mov	rdi, r9
 29812  	and	rdi, -2
 29813  	neg	rdi                             # rdi = -(r9 & ~1); counts up by 2 to zero
 29814  	xor	esi, esi                        # rsi = element index
 29815  	pxor	xmm2, xmm2                      # zero for the compares
 29816  	pcmpeqd	xmm3, xmm3                      # all-ones, used to invert masks
 29817  	movaps	xmm4, xmmword ptr [rip + .LCPI4_16] # xmm4 = <1,1,u,u>
 29818  .LBB4_518:                              # =>This Inner Loop Header: Depth=1
 29819  	movdqu	xmm5, xmmword ptr [rcx + 8*rsi]
 29820  	movdqu	xmm6, xmmword ptr [rcx + 8*rsi + 16]
 29821  	movdqa	xmm0, xmm5
 29822  	pcmpgtq	xmm0, xmm2                      # signed x > 0 mask
 29823  	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
 29824  	movdqa	xmm1, xmm6
 29825  	pcmpgtq	xmm1, xmm2
 29826  	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
 29827  	pcmpeqq	xmm5, xmm2                      # x == 0 mask
 29828  	pshufd	xmm5, xmm5, 232                 # xmm5 = xmm5[0,2,2,3]
 29829  	pxor	xmm5, xmm3                      # invert: -1 where x != 0, 0 where x == 0
 29830  	pcmpeqq	xmm6, xmm2
 29831  	pshufd	xmm6, xmm6, 232                 # xmm6 = xmm6[0,2,2,3]
 29832  	pxor	xmm6, xmm3
 29833  	blendvps	xmm5, xmm4, xmm0        # where x > 0 take 1, else keep -1/0
 29834  	movdqa	xmm0, xmm1                      # blendvps reads its selector from xmm0
 29835  	blendvps	xmm6, xmm4, xmm0
 29836  	movlhps	xmm5, xmm6                      # xmm5 = xmm5[0],xmm6[0]
 29837  	movups	xmmword ptr [r8 + 4*rsi], xmm5  # four dword results
 29838  	movdqu	xmm5, xmmword ptr [rcx + 8*rsi + 32] # second group of four, same steps
 29839  	movdqu	xmm6, xmmword ptr [rcx + 8*rsi + 48]
 29840  	movdqa	xmm0, xmm5
 29841  	pcmpgtq	xmm0, xmm2
 29842  	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
 29843  	movdqa	xmm1, xmm6
 29844  	pcmpgtq	xmm1, xmm2
 29845  	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
 29846  	pcmpeqq	xmm5, xmm2
 29847  	pshufd	xmm5, xmm5, 232                 # xmm5 = xmm5[0,2,2,3]
 29848  	pxor	xmm5, xmm3
 29849  	pcmpeqq	xmm6, xmm2
 29850  	pshufd	xmm6, xmm6, 232                 # xmm6 = xmm6[0,2,2,3]
 29851  	pxor	xmm6, xmm3
 29852  	blendvps	xmm5, xmm4, xmm0
 29853  	movdqa	xmm0, xmm1
 29854  	blendvps	xmm6, xmm4, xmm0
 29855  	movlhps	xmm5, xmm6                      # xmm5 = xmm5[0],xmm6[0]
 29856  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm5
 29857  	add	rsi, 8                          # 8 elements per iteration
 29858  	add	rdi, 2                          # ZF set when the counter reaches 0
 29859  	jne	.LBB4_518
 29860  	jmp	.LBB4_1124
 29861  .LBB4_519:                              # Vector path: sign of float32 values at [rcx] converted to 32-bit integers at [r8], 4 per group, 2 groups per iteration
 29862  	mov	edx, eax                        # eax is set before this block (presumably the element count)
 29863  	and	edx, -4                         # round down to a multiple of 4
 29864  	lea	rsi, [rdx - 4]
 29865  	mov	r9, rsi
 29866  	shr	r9, 2
 29867  	add	r9, 1                           # r9 = number of 4-element groups
 29868  	test	rsi, rsi
 29869  	je	.LBB4_1129                      # exactly one group: skip the 2x-unrolled loop
 29870  # %bb.520:
 29871  	mov	rdi, r9
 29872  	and	rdi, -2
 29873  	neg	rdi                             # rdi = -(r9 & ~1); counts up by 2 to zero
 29874  	xor	esi, esi                        # rsi = element index
 29875  	xorps	xmm1, xmm1                      # 0.0f for the not-equal test
 29876  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1]
 29877  	movaps	xmm3, xmmword ptr [rip + .LCPI4_10] # xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
 29878  	movaps	xmm4, xmmword ptr [rip + .LCPI4_4] # xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 29879  .LBB4_521:                              # =>This Inner Loop Header: Depth=1
 29880  	movdqu	xmm5, xmmword ptr [rcx + 4*rsi]
 29881  	movdqa	xmm0, xmm5
 29882  	psrad	xmm0, 31                        # all-ones where the sign bit is set
 29883  	por	xmm0, xmm2                      # integer -1 or +1 per lane
 29884  	cvtdq2ps	xmm6, xmm0              # -1.0f or +1.0f
 29885  	movaps	xmm0, xmm6
 29886  	cmpltps	xmm0, xmm3                      # selector: value < 2^31 (float -> unsigned conversion pattern)
 29887  	cvttps2dq	xmm7, xmm6              # direct truncating conversion
 29888  	subps	xmm6, xmm3                      # alternative path for values >= 2^31 ...
 29889  	cvttps2dq	xmm6, xmm6
 29890  	xorps	xmm6, xmm4                      # ... re-adding the top bit after conversion
 29891  	blendvps	xmm6, xmm7, xmm0        # take the direct conversion where selector is set
 29892  	cmpneqps	xmm5, xmm1              # all-ones where the input compares != 0.0f
 29893  	andps	xmm5, xmm6                      # inputs equal to 0.0f produce 0
 29894  	movups	xmmword ptr [r8 + 4*rsi], xmm5
 29895  	movdqu	xmm5, xmmword ptr [rcx + 4*rsi + 16] # second group of four, same steps
 29896  	movdqa	xmm0, xmm5
 29897  	psrad	xmm0, 31
 29898  	por	xmm0, xmm2
 29899  	cvtdq2ps	xmm6, xmm0
 29900  	movaps	xmm0, xmm6
 29901  	cmpltps	xmm0, xmm3
 29902  	cvttps2dq	xmm7, xmm6
 29903  	subps	xmm6, xmm3
 29904  	cvttps2dq	xmm6, xmm6
 29905  	xorps	xmm6, xmm4
 29906  	blendvps	xmm6, xmm7, xmm0
 29907  	cmpneqps	xmm5, xmm1
 29908  	andps	xmm5, xmm6
 29909  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm5
 29910  	add	rsi, 8                          # 8 elements per iteration
 29911  	add	rdi, 2                          # ZF set when the counter reaches 0
 29912  	jne	.LBB4_521
 29913  	jmp	.LBB4_1130
 29914  .LBB4_532:                              # 4x-unrolled loop: qword at [r8 + 8*i] = (dword at [rcx + 4*i] != 0) ? constant : 0.0
 29915  	and	edx, -4                         # edx (set before this block) rounded down to a multiple of 4 = end index
 29916  	xor	esi, esi                        # rsi = element index
 29917  	movsd	xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero
 29918  	jmp	.LBB4_534                       # NOTE(review): .LCPI4_2 is defined outside this chunk; presumably 1.0 - confirm
 29919  .LBB4_533:                              #   in Loop: Header=BB4_534 Depth=1
 29920  	movsd	qword ptr [r8 + 8*rsi + 24], xmm1 # store result for element 3 of the group
 29921  	add	rsi, 4
 29922  	cmp	rdx, rsi
 29923  	je	.LBB4_101                       # reached the end index
 29924  .LBB4_534:                              # =>This Inner Loop Header: Depth=1
 29925  	cmp	dword ptr [rcx + 4*rsi], 0      # element 0
 29926  	movapd	xmm1, xmm0                      # assume nonzero: xmm1 = constant
 29927  	jne	.LBB4_535
 29928  # %bb.538:                              #   in Loop: Header=BB4_534 Depth=1
 29929  	xorpd	xmm1, xmm1                      # element 0 is zero: store 0.0
 29930  	movsd	qword ptr [r8 + 8*rsi], xmm1
 29931  	cmp	dword ptr [rcx + 4*rsi + 4], 0  # element 1
 29932  	movapd	xmm1, xmm0
 29933  	je	.LBB4_539
 29934  .LBB4_536:                              #   in Loop: Header=BB4_534 Depth=1
 29935  	movsd	qword ptr [r8 + 8*rsi + 8], xmm1 # element 1 is nonzero: store constant
 29936  	cmp	dword ptr [rcx + 4*rsi + 8], 0  # element 2
 29937  	movapd	xmm1, xmm0
 29938  	jne	.LBB4_537
 29939  .LBB4_540:                              #   in Loop: Header=BB4_534 Depth=1
 29940  	xorpd	xmm1, xmm1                      # element 2 is zero: store 0.0
 29941  	movsd	qword ptr [r8 + 8*rsi + 16], xmm1
 29942  	cmp	dword ptr [rcx + 4*rsi + 12], 0 # element 3
 29943  	movapd	xmm1, xmm0
 29944  	jne	.LBB4_533
 29945  	jmp	.LBB4_541
 29946  .LBB4_535:                              #   in Loop: Header=BB4_534 Depth=1
 29947  	movsd	qword ptr [r8 + 8*rsi], xmm1    # element 0 is nonzero: store constant
 29948  	cmp	dword ptr [rcx + 4*rsi + 4], 0
 29949  	movapd	xmm1, xmm0
 29950  	jne	.LBB4_536
 29951  .LBB4_539:                              #   in Loop: Header=BB4_534 Depth=1
 29952  	xorpd	xmm1, xmm1                      # element 1 is zero: store 0.0
 29953  	movsd	qword ptr [r8 + 8*rsi + 8], xmm1
 29954  	cmp	dword ptr [rcx + 4*rsi + 8], 0
 29955  	movapd	xmm1, xmm0
 29956  	je	.LBB4_540
 29957  .LBB4_537:                              #   in Loop: Header=BB4_534 Depth=1
 29958  	movsd	qword ptr [r8 + 8*rsi + 16], xmm1 # element 2 is nonzero: store constant
 29959  	cmp	dword ptr [rcx + 4*rsi + 12], 0
 29960  	movapd	xmm1, xmm0
 29961  	jne	.LBB4_533
 29962  .LBB4_541:                              #   in Loop: Header=BB4_534 Depth=1
 29963  	xorpd	xmm1, xmm1                      # element 3 is zero: xmm1 = 0.0, stored at .LBB4_533
 29964  	jmp	.LBB4_533
 29965  .LBB4_547:                              # 2x-unrolled loop: qword at [r8 + 8*i] = sign of the signed byte at [rcx + i] (xmm1 if > 0, xmm0 if < 0, 0.0 if == 0)
 29966  	mov	esi, edx                        # edx is set before this block
 29967  	and	esi, -2                         # end index = edx rounded down to a multiple of 2
 29968  	xor	eax, eax                        # rax = element index
 29969  	movsd	xmm0, qword ptr [rip + .LCPI4_13] # xmm0 = mem[0],zero
 29970  	movsd	xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero
 29971  	jmp	.LBB4_549                       # NOTE(review): constants are defined outside this chunk; presumably -1.0 and 1.0 - confirm
 29972  .LBB4_548:                              #   in Loop: Header=BB4_549 Depth=1
 29973  	movsd	qword ptr [r8 + 8*rax + 8], xmm3 # store result for the second element
 29974  	add	rax, 2
 29975  	cmp	rsi, rax
 29976  	je	.LBB4_120                       # reached the end index
 29977  .LBB4_549:                              # =>This Inner Loop Header: Depth=1
 29978  	cmp	byte ptr [rcx + rax], 0         # flags from this cmp are reused below; the SSE moves/xorpd leave EFLAGS alone
 29979  	movapd	xmm2, xmm0                      # xmm2 = value for the x < 0 case
 29980  	jne	.LBB4_550
 29981  # %bb.553:                              #   in Loop: Header=BB4_549 Depth=1
 29982  	xorpd	xmm2, xmm2                      # x == 0: not-positive value becomes 0.0
 29983  	movapd	xmm3, xmm1
 29984  	jle	.LBB4_554                       # ZF is still set here, so this is taken
 29985  .LBB4_551:                              #   in Loop: Header=BB4_549 Depth=1
 29986  	movsd	qword ptr [r8 + 8*rax], xmm3    # first element > 0: store xmm1's value
 29987  	cmp	byte ptr [rcx + rax + 1], 0     # second element
 29988  	movapd	xmm2, xmm0
 29989  	jne	.LBB4_552
 29990  .LBB4_555:                              #   in Loop: Header=BB4_549 Depth=1
 29991  	xorpd	xmm2, xmm2                      # second element == 0
 29992  	movapd	xmm3, xmm1
 29993  	jg	.LBB4_548                       # ZF is set on every path into this block, so not taken
 29994  	jmp	.LBB4_556
 29995  .LBB4_550:                              #   in Loop: Header=BB4_549 Depth=1
 29996  	movapd	xmm3, xmm1
 29997  	jg	.LBB4_551                       # first element > 0 (signed)
 29998  .LBB4_554:                              #   in Loop: Header=BB4_549 Depth=1
 29999  	movapd	xmm3, xmm2                      # first element <= 0: xmm0's value or 0.0
 30000  	movsd	qword ptr [r8 + 8*rax], xmm3
 30001  	cmp	byte ptr [rcx + rax + 1], 0     # second element
 30002  	movapd	xmm2, xmm0
 30003  	je	.LBB4_555
 30004  .LBB4_552:                              #   in Loop: Header=BB4_549 Depth=1
 30005  	movapd	xmm3, xmm1
 30006  	jg	.LBB4_548                       # second element > 0 (signed)
 30007  .LBB4_556:                              #   in Loop: Header=BB4_549 Depth=1
 30008  	movapd	xmm3, xmm2                      # second element <= 0: xmm0's value or 0.0
 30009  	jmp	.LBB4_548
 30010  .LBB4_557:                              # 4x-unrolled loop: qword at [r8 + 8*i] = (qword at [rcx + 8*i] != 0) ? constant : 0.0
 30011  	and	edx, -4                         # edx (set before this block) rounded down to a multiple of 4 = end index
 30012  	xor	esi, esi                        # rsi = element index
 30013  	movsd	xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero
 30014  	jmp	.LBB4_559                       # NOTE(review): .LCPI4_2 is defined outside this chunk; presumably 1.0 - confirm
 30015  .LBB4_558:                              #   in Loop: Header=BB4_559 Depth=1
 30016  	movsd	qword ptr [r8 + 8*rsi + 24], xmm1 # store result for element 3 of the group
 30017  	add	rsi, 4
 30018  	cmp	rdx, rsi
 30019  	je	.LBB4_130                       # reached the end index
 30020  .LBB4_559:                              # =>This Inner Loop Header: Depth=1
 30021  	cmp	qword ptr [rcx + 8*rsi], 0      # element 0
 30022  	movapd	xmm1, xmm0                      # assume nonzero: xmm1 = constant
 30023  	jne	.LBB4_560
 30024  # %bb.563:                              #   in Loop: Header=BB4_559 Depth=1
 30025  	xorpd	xmm1, xmm1                      # element 0 is zero: store 0.0
 30026  	movsd	qword ptr [r8 + 8*rsi], xmm1
 30027  	cmp	qword ptr [rcx + 8*rsi + 8], 0  # element 1
 30028  	movapd	xmm1, xmm0
 30029  	je	.LBB4_564
 30030  .LBB4_561:                              #   in Loop: Header=BB4_559 Depth=1
 30031  	movsd	qword ptr [r8 + 8*rsi + 8], xmm1 # element 1 is nonzero: store constant
 30032  	cmp	qword ptr [rcx + 8*rsi + 16], 0 # element 2
 30033  	movapd	xmm1, xmm0
 30034  	jne	.LBB4_562
 30035  .LBB4_565:                              #   in Loop: Header=BB4_559 Depth=1
 30036  	xorpd	xmm1, xmm1                      # element 2 is zero: store 0.0
 30037  	movsd	qword ptr [r8 + 8*rsi + 16], xmm1
 30038  	cmp	qword ptr [rcx + 8*rsi + 24], 0 # element 3
 30039  	movapd	xmm1, xmm0
 30040  	jne	.LBB4_558
 30041  	jmp	.LBB4_566
 30042  .LBB4_560:                              #   in Loop: Header=BB4_559 Depth=1
 30043  	movsd	qword ptr [r8 + 8*rsi], xmm1    # element 0 is nonzero: store constant
 30044  	cmp	qword ptr [rcx + 8*rsi + 8], 0
 30045  	movapd	xmm1, xmm0
 30046  	jne	.LBB4_561
 30047  .LBB4_564:                              #   in Loop: Header=BB4_559 Depth=1
 30048  	xorpd	xmm1, xmm1                      # element 1 is zero: store 0.0
 30049  	movsd	qword ptr [r8 + 8*rsi + 8], xmm1
 30050  	cmp	qword ptr [rcx + 8*rsi + 16], 0
 30051  	movapd	xmm1, xmm0
 30052  	je	.LBB4_565
 30053  .LBB4_562:                              #   in Loop: Header=BB4_559 Depth=1
 30054  	movsd	qword ptr [r8 + 8*rsi + 16], xmm1 # element 2 is nonzero: store constant
 30055  	cmp	qword ptr [rcx + 8*rsi + 24], 0
 30056  	movapd	xmm1, xmm0
 30057  	jne	.LBB4_558
 30058  .LBB4_566:                              #   in Loop: Header=BB4_559 Depth=1
 30059  	xorpd	xmm1, xmm1                      # element 3 is zero: xmm1 = 0.0, stored at .LBB4_558
 30060  	jmp	.LBB4_558
 30061  .LBB4_567:                              # 4x-unrolled loop: qword at [r8 + 8*i] = (word at [rcx + 2*i] != 0) ? constant : 0.0
 30062  	and	edx, -4                         # edx (set before this block) rounded down to a multiple of 4 = end index
 30063  	xor	esi, esi                        # rsi = element index
 30064  	movsd	xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero
 30065  	jmp	.LBB4_569                       # NOTE(review): .LCPI4_2 is defined outside this chunk; presumably 1.0 - confirm
 30066  .LBB4_568:                              #   in Loop: Header=BB4_569 Depth=1
 30067  	movsd	qword ptr [r8 + 8*rsi + 24], xmm1 # store result for element 3 of the group
 30068  	add	rsi, 4
 30069  	cmp	rdx, rsi
 30070  	je	.LBB4_142                       # reached the end index
 30071  .LBB4_569:                              # =>This Inner Loop Header: Depth=1
 30072  	cmp	word ptr [rcx + 2*rsi], 0       # element 0
 30073  	movapd	xmm1, xmm0                      # assume nonzero: xmm1 = constant
 30074  	jne	.LBB4_570
 30075  # %bb.573:                              #   in Loop: Header=BB4_569 Depth=1
 30076  	xorpd	xmm1, xmm1                      # element 0 is zero: store 0.0
 30077  	movsd	qword ptr [r8 + 8*rsi], xmm1
 30078  	cmp	word ptr [rcx + 2*rsi + 2], 0   # element 1
 30079  	movapd	xmm1, xmm0
 30080  	je	.LBB4_574
 30081  .LBB4_571:                              #   in Loop: Header=BB4_569 Depth=1
 30082  	movsd	qword ptr [r8 + 8*rsi + 8], xmm1 # element 1 is nonzero: store constant
 30083  	cmp	word ptr [rcx + 2*rsi + 4], 0   # element 2
 30084  	movapd	xmm1, xmm0
 30085  	jne	.LBB4_572
 30086  .LBB4_575:                              #   in Loop: Header=BB4_569 Depth=1
 30087  	xorpd	xmm1, xmm1                      # element 2 is zero: store 0.0
 30088  	movsd	qword ptr [r8 + 8*rsi + 16], xmm1
 30089  	cmp	word ptr [rcx + 2*rsi + 6], 0   # element 3
 30090  	movapd	xmm1, xmm0
 30091  	jne	.LBB4_568
 30092  	jmp	.LBB4_576
 30093  .LBB4_570:                              #   in Loop: Header=BB4_569 Depth=1
 30094  	movsd	qword ptr [r8 + 8*rsi], xmm1    # element 0 is nonzero: store constant
 30095  	cmp	word ptr [rcx + 2*rsi + 2], 0
 30096  	movapd	xmm1, xmm0
 30097  	jne	.LBB4_571
 30098  .LBB4_574:                              #   in Loop: Header=BB4_569 Depth=1
 30099  	xorpd	xmm1, xmm1                      # element 1 is zero: store 0.0
 30100  	movsd	qword ptr [r8 + 8*rsi + 8], xmm1
 30101  	cmp	word ptr [rcx + 2*rsi + 4], 0
 30102  	movapd	xmm1, xmm0
 30103  	je	.LBB4_575
 30104  .LBB4_572:                              #   in Loop: Header=BB4_569 Depth=1
 30105  	movsd	qword ptr [r8 + 8*rsi + 16], xmm1 # element 2 is nonzero: store constant
 30106  	cmp	word ptr [rcx + 2*rsi + 6], 0
 30107  	movapd	xmm1, xmm0
 30108  	jne	.LBB4_568
 30109  .LBB4_576:                              #   in Loop: Header=BB4_569 Depth=1
 30110  	xorpd	xmm1, xmm1                      # element 3 is zero: xmm1 = 0.0, stored at .LBB4_568
 30111  	jmp	.LBB4_568
 30112  .LBB4_577:                              # 2x-unrolled loop: qword at [r8 + 8*i] = sign of the signed word at [rcx + 2*i] (xmm1 if > 0, xmm0 if < 0, 0.0 if == 0)
 30113  	mov	esi, edx                        # edx is set before this block
 30114  	and	esi, -2                         # end index = edx rounded down to a multiple of 2
 30115  	xor	eax, eax                        # rax = element index
 30116  	movsd	xmm0, qword ptr [rip + .LCPI4_13] # xmm0 = mem[0],zero
 30117  	movsd	xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero
 30118  	jmp	.LBB4_579                       # NOTE(review): constants are defined outside this chunk; presumably -1.0 and 1.0 - confirm
 30119  .LBB4_578:                              #   in Loop: Header=BB4_579 Depth=1
 30120  	movsd	qword ptr [r8 + 8*rax + 8], xmm3 # store result for the second element
 30121  	add	rax, 2
 30122  	cmp	rsi, rax
 30123  	je	.LBB4_154                       # reached the end index
 30124  .LBB4_579:                              # =>This Inner Loop Header: Depth=1
 30125  	cmp	word ptr [rcx + 2*rax], 0       # flags from this cmp are reused below; the SSE moves/xorpd leave EFLAGS alone
 30126  	movapd	xmm2, xmm0                      # xmm2 = value for the x < 0 case
 30127  	jne	.LBB4_580
 30128  # %bb.583:                              #   in Loop: Header=BB4_579 Depth=1
 30129  	xorpd	xmm2, xmm2                      # x == 0: not-positive value becomes 0.0
 30130  	movapd	xmm3, xmm1
 30131  	jle	.LBB4_584                       # ZF is still set here, so this is taken
 30132  .LBB4_581:                              #   in Loop: Header=BB4_579 Depth=1
 30133  	movsd	qword ptr [r8 + 8*rax], xmm3    # first element > 0: store xmm1's value
 30134  	cmp	word ptr [rcx + 2*rax + 2], 0   # second element
 30135  	movapd	xmm2, xmm0
 30136  	jne	.LBB4_582
 30137  .LBB4_585:                              #   in Loop: Header=BB4_579 Depth=1
 30138  	xorpd	xmm2, xmm2                      # second element == 0
 30139  	movapd	xmm3, xmm1
 30140  	jg	.LBB4_578                       # ZF is set on every path into this block, so not taken
 30141  	jmp	.LBB4_586
 30142  .LBB4_580:                              #   in Loop: Header=BB4_579 Depth=1
 30143  	movapd	xmm3, xmm1
 30144  	jg	.LBB4_581                       # first element > 0 (signed)
 30145  .LBB4_584:                              #   in Loop: Header=BB4_579 Depth=1
 30146  	movapd	xmm3, xmm2                      # first element <= 0: xmm0's value or 0.0
 30147  	movsd	qword ptr [r8 + 8*rax], xmm3
 30148  	cmp	word ptr [rcx + 2*rax + 2], 0   # second element
 30149  	movapd	xmm2, xmm0
 30150  	je	.LBB4_585
 30151  .LBB4_582:                              #   in Loop: Header=BB4_579 Depth=1
 30152  	movapd	xmm3, xmm1
 30153  	jg	.LBB4_578                       # second element > 0 (signed)
 30154  .LBB4_586:                              #   in Loop: Header=BB4_579 Depth=1
 30155  	movapd	xmm3, xmm2                      # second element <= 0: xmm0's value or 0.0
 30156  	jmp	.LBB4_578
 30157  .LBB4_587:                              # 2x-unrolled loop: qword at [r8 + 8*i] = sign of the signed qword at [rcx + 8*i] (xmm1 if > 0, xmm0 if < 0, 0.0 if == 0)
 30158  	mov	esi, edx                        # edx is set before this block
 30159  	and	esi, -2                         # end index = edx rounded down to a multiple of 2
 30160  	xor	eax, eax                        # rax = element index
 30161  	movsd	xmm0, qword ptr [rip + .LCPI4_13] # xmm0 = mem[0],zero
 30162  	movsd	xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero
 30163  	jmp	.LBB4_589                       # NOTE(review): constants are defined outside this chunk; presumably -1.0 and 1.0 - confirm
 30164  .LBB4_588:                              #   in Loop: Header=BB4_589 Depth=1
 30165  	movsd	qword ptr [r8 + 8*rax + 8], xmm3 # store result for the second element
 30166  	add	rax, 2
 30167  	cmp	rsi, rax
 30168  	je	.LBB4_164                       # reached the end index
 30169  .LBB4_589:                              # =>This Inner Loop Header: Depth=1
 30170  	cmp	qword ptr [rcx + 8*rax], 0      # flags from this cmp are reused below; the SSE moves/xorpd leave EFLAGS alone
 30171  	movapd	xmm2, xmm0                      # xmm2 = value for the x < 0 case
 30172  	jne	.LBB4_590
 30173  # %bb.593:                              #   in Loop: Header=BB4_589 Depth=1
 30174  	xorpd	xmm2, xmm2                      # x == 0: not-positive value becomes 0.0
 30175  	movapd	xmm3, xmm1
 30176  	jle	.LBB4_594                       # ZF is still set here, so this is taken
 30177  .LBB4_591:                              #   in Loop: Header=BB4_589 Depth=1
 30178  	movsd	qword ptr [r8 + 8*rax], xmm3    # first element > 0: store xmm1's value
 30179  	cmp	qword ptr [rcx + 8*rax + 8], 0  # second element
 30180  	movapd	xmm2, xmm0
 30181  	jne	.LBB4_592
 30182  .LBB4_595:                              #   in Loop: Header=BB4_589 Depth=1
 30183  	xorpd	xmm2, xmm2                      # second element == 0
 30184  	movapd	xmm3, xmm1
 30185  	jg	.LBB4_588                       # ZF is set on every path into this block, so not taken
 30186  	jmp	.LBB4_596
 30187  .LBB4_590:                              #   in Loop: Header=BB4_589 Depth=1
 30188  	movapd	xmm3, xmm1
 30189  	jg	.LBB4_591                       # first element > 0 (signed)
 30190  .LBB4_594:                              #   in Loop: Header=BB4_589 Depth=1
 30191  	movapd	xmm3, xmm2                      # first element <= 0: xmm0's value or 0.0
 30192  	movsd	qword ptr [r8 + 8*rax], xmm3
 30193  	cmp	qword ptr [rcx + 8*rax + 8], 0  # second element
 30194  	movapd	xmm2, xmm0
 30195  	je	.LBB4_595
 30196  .LBB4_592:                              #   in Loop: Header=BB4_589 Depth=1
 30197  	movapd	xmm3, xmm1
 30198  	jg	.LBB4_588                       # second element > 0 (signed)
 30199  .LBB4_596:                              #   in Loop: Header=BB4_589 Depth=1
 30200  	movapd	xmm3, xmm2                      # second element <= 0: xmm0's value or 0.0
 30201  	jmp	.LBB4_588
 30202  .LBB4_597:                              # 2x-unrolled loop: double at [r8 + 8*i] = sign (+1.0/-1.0/0.0) of the float32 at [rcx + 4*i]
 30203  	mov	esi, edx                        # edx is set before this block
 30204  	and	esi, -2                         # end index = edx rounded down to a multiple of 2
 30205  	xor	eax, eax                        # rax = element index
 30206  	xorps	xmm0, xmm0                      # xmm0 = 0.0f for the compares
 30207  	jmp	.LBB4_599
 30208  .LBB4_598:                              #   in Loop: Header=BB4_599 Depth=1
 30209  	movsd	qword ptr [r8 + 8*rax + 8], xmm1 # store result for the second element
 30210  	add	rax, 2
 30211  	cmp	rsi, rax
 30212  	je	.LBB4_174                       # reached the end index
 30213  .LBB4_599:                              # =>This Inner Loop Header: Depth=1
 30214  	movss	xmm2, dword ptr [rcx + 4*rax]   # xmm2 = mem[0],zero,zero,zero
 30215  	xorpd	xmm1, xmm1                      # default result for the second element: 0.0
 30216  	ucomiss	xmm0, xmm2                      # ZF set when x == 0.0f, and also when unordered (NaN)
 30217  	xorpd	xmm3, xmm3                      # default result for the first element: 0.0 (flags untouched)
 30218  	je	.LBB4_601
 30219  # %bb.600:                              #   in Loop: Header=BB4_599 Depth=1
 30220  	movmskps	edi, xmm2               # bit 0 = sign bit of the loaded float
 30221  	and	edi, 1
 30222  	neg	edi                             # 0 or -1
 30223  	or	edi, 1                          # +1 or -1
 30224  	xorps	xmm2, xmm2
 30225  	cvtsi2ss	xmm2, edi               # +1.0f or -1.0f
 30226  	xorps	xmm3, xmm3
 30227  	cvtss2sd	xmm3, xmm2              # widen to double
 30228  .LBB4_601:                              #   in Loop: Header=BB4_599 Depth=1
 30229  	movsd	qword ptr [r8 + 8*rax], xmm3    # store result for the first element
 30230  	movss	xmm2, dword ptr [rcx + 4*rax + 4] # xmm2 = mem[0],zero,zero,zero
 30231  	ucomiss	xmm0, xmm2
 30232  	je	.LBB4_598                       # equal/unordered: keep xmm1 = 0.0
 30233  # %bb.602:                              #   in Loop: Header=BB4_599 Depth=1
 30234  	movmskps	edi, xmm2
 30235  	and	edi, 1
 30236  	neg	edi
 30237  	or	edi, 1                          # +1 or -1 from the sign bit
 30238  	xorps	xmm1, xmm1
 30239  	cvtsi2ss	xmm1, edi
 30240  	cvtss2sd	xmm1, xmm1
 30241  	jmp	.LBB4_598
 30242  .LBB4_603:                              # 4x-unrolled loop: qword at [r8 + 8*i] = (byte at [rcx + i] != 0) ? constant : 0.0
 30243  	and	edx, -4                         # edx (set before this block) rounded down to a multiple of 4 = end index
 30244  	xor	esi, esi                        # rsi = element index
 30245  	movsd	xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero
 30246  	jmp	.LBB4_605                       # NOTE(review): .LCPI4_2 is defined outside this chunk; presumably 1.0 - confirm
 30247  .LBB4_604:                              #   in Loop: Header=BB4_605 Depth=1
 30248  	movsd	qword ptr [r8 + 8*rsi + 24], xmm1 # store result for element 3 of the group
 30249  	add	rsi, 4
 30250  	cmp	rdx, rsi
 30251  	je	.LBB4_185                       # reached the end index
 30252  .LBB4_605:                              # =>This Inner Loop Header: Depth=1
 30253  	cmp	byte ptr [rcx + rsi], 0         # element 0
 30254  	movapd	xmm1, xmm0                      # assume nonzero: xmm1 = constant
 30255  	jne	.LBB4_606
 30256  # %bb.609:                              #   in Loop: Header=BB4_605 Depth=1
 30257  	xorpd	xmm1, xmm1                      # element 0 is zero: store 0.0
 30258  	movsd	qword ptr [r8 + 8*rsi], xmm1
 30259  	cmp	byte ptr [rcx + rsi + 1], 0     # element 1
 30260  	movapd	xmm1, xmm0
 30261  	je	.LBB4_610
 30262  .LBB4_607:                              #   in Loop: Header=BB4_605 Depth=1
 30263  	movsd	qword ptr [r8 + 8*rsi + 8], xmm1 # element 1 is nonzero: store constant
 30264  	cmp	byte ptr [rcx + rsi + 2], 0     # element 2
 30265  	movapd	xmm1, xmm0
 30266  	jne	.LBB4_608
 30267  .LBB4_611:                              #   in Loop: Header=BB4_605 Depth=1
 30268  	xorpd	xmm1, xmm1                      # element 2 is zero: store 0.0
 30269  	movsd	qword ptr [r8 + 8*rsi + 16], xmm1
 30270  	cmp	byte ptr [rcx + rsi + 3], 0     # element 3
 30271  	movapd	xmm1, xmm0
 30272  	jne	.LBB4_604
 30273  	jmp	.LBB4_612
 30274  .LBB4_606:                              #   in Loop: Header=BB4_605 Depth=1
 30275  	movsd	qword ptr [r8 + 8*rsi], xmm1    # element 0 is nonzero: store constant
 30276  	cmp	byte ptr [rcx + rsi + 1], 0
 30277  	movapd	xmm1, xmm0
 30278  	jne	.LBB4_607
 30279  .LBB4_610:                              #   in Loop: Header=BB4_605 Depth=1
 30280  	xorpd	xmm1, xmm1                      # element 1 is zero: store 0.0
 30281  	movsd	qword ptr [r8 + 8*rsi + 8], xmm1
 30282  	cmp	byte ptr [rcx + rsi + 2], 0
 30283  	movapd	xmm1, xmm0
 30284  	je	.LBB4_611
 30285  .LBB4_608:                              #   in Loop: Header=BB4_605 Depth=1
 30286  	movsd	qword ptr [r8 + 8*rsi + 16], xmm1 # element 2 is nonzero: store constant
 30287  	cmp	byte ptr [rcx + rsi + 3], 0
 30288  	movapd	xmm1, xmm0
 30289  	jne	.LBB4_604
 30290  .LBB4_612:                              #   in Loop: Header=BB4_605 Depth=1
 30291  	xorpd	xmm1, xmm1                      # element 3 is zero: xmm1 = 0.0, stored at .LBB4_604
 30292  	jmp	.LBB4_604
 30293  .LBB4_613:                              # 2x-unrolled loop: qword at [r8 + 8*i] = sign of the signed dword at [rcx + 4*i] (xmm1 if > 0, xmm0 if < 0, 0.0 if == 0)
 30294  	mov	esi, edx                        # edx is set before this block
 30295  	and	esi, -2                         # end index = edx rounded down to a multiple of 2
 30296  	xor	eax, eax                        # rax = element index
 30297  	movsd	xmm0, qword ptr [rip + .LCPI4_13] # xmm0 = mem[0],zero
 30298  	movsd	xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero
 30299  	jmp	.LBB4_615                       # NOTE(review): constants are defined outside this chunk; presumably -1.0 and 1.0 - confirm
 30300  .LBB4_614:                              #   in Loop: Header=BB4_615 Depth=1
 30301  	movsd	qword ptr [r8 + 8*rax + 8], xmm3 # store result for the second element
 30302  	add	rax, 2
 30303  	cmp	rsi, rax
 30304  	je	.LBB4_197                       # reached the end index
 30305  .LBB4_615:                              # =>This Inner Loop Header: Depth=1
 30306  	cmp	dword ptr [rcx + 4*rax], 0      # flags from this cmp are reused below; the SSE moves/xorpd leave EFLAGS alone
 30307  	movapd	xmm2, xmm0                      # xmm2 = value for the x < 0 case
 30308  	jne	.LBB4_616
 30309  # %bb.619:                              #   in Loop: Header=BB4_615 Depth=1
 30310  	xorpd	xmm2, xmm2                      # x == 0: not-positive value becomes 0.0
 30311  	movapd	xmm3, xmm1
 30312  	jle	.LBB4_620                       # ZF is still set here, so this is taken
 30313  .LBB4_617:                              #   in Loop: Header=BB4_615 Depth=1
 30314  	movsd	qword ptr [r8 + 8*rax], xmm3    # first element > 0: store xmm1's value
 30315  	cmp	dword ptr [rcx + 4*rax + 4], 0  # second element
 30316  	movapd	xmm2, xmm0
 30317  	jne	.LBB4_618
 30318  .LBB4_621:                              #   in Loop: Header=BB4_615 Depth=1
 30319  	xorpd	xmm2, xmm2                      # second element == 0
 30320  	movapd	xmm3, xmm1
 30321  	jg	.LBB4_614                       # ZF is set on every path into this block, so not taken
 30322  	jmp	.LBB4_622
 30323  .LBB4_616:                              #   in Loop: Header=BB4_615 Depth=1
 30324  	movapd	xmm3, xmm1
 30325  	jg	.LBB4_617                       # first element > 0 (signed)
 30326  .LBB4_620:                              #   in Loop: Header=BB4_615 Depth=1
 30327  	movapd	xmm3, xmm2                      # first element <= 0: xmm0's value or 0.0
 30328  	movsd	qword ptr [r8 + 8*rax], xmm3
 30329  	cmp	dword ptr [rcx + 4*rax + 4], 0  # second element
 30330  	movapd	xmm2, xmm0
 30331  	je	.LBB4_621
 30332  .LBB4_618:                              #   in Loop: Header=BB4_615 Depth=1
 30333  	movapd	xmm3, xmm1
 30334  	jg	.LBB4_614                       # second element > 0 (signed)
 30335  .LBB4_622:                              #   in Loop: Header=BB4_615 Depth=1
 30336  	movapd	xmm3, xmm2                      # second element <= 0: xmm0's value or 0.0
 30337  	jmp	.LBB4_614
 30338  .LBB4_673:                              # Vector path: qword at [r8 + 8*i] = (dword at [rcx + 4*i] != 0) ? 1 : 0, 4 per group, 2 groups per iteration
 30339  	mov	edx, eax                        # eax is set before this block (presumably the element count)
 30340  	and	edx, -4                         # round down to a multiple of 4
 30341  	lea	rsi, [rdx - 4]
 30342  	mov	r9, rsi
 30343  	shr	r9, 2
 30344  	add	r9, 1                           # r9 = number of 4-element groups
 30345  	test	rsi, rsi
 30346  	je	.LBB4_999                       # exactly one group: skip the 2x-unrolled loop
 30347  # %bb.674:
 30348  	mov	rdi, r9
 30349  	and	rdi, -2
 30350  	neg	rdi                             # rdi = -(r9 & ~1); counts up by 2 to zero
 30351  	xor	esi, esi                        # rsi = element index
 30352  	pxor	xmm0, xmm0                      # zero for the equality test
 30353  	pcmpeqd	xmm1, xmm1                      # all-ones, used to invert masks
 30354  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_15] # xmm2 = [1,1]
 30355  .LBB4_675:                              # =>This Inner Loop Header: Depth=1
 30356  	movq	xmm3, qword ptr [rcx + 4*rsi]   # xmm3 = mem[0],zero
 30357  	movq	xmm4, qword ptr [rcx + 4*rsi + 8] # xmm4 = mem[0],zero
 30358  	pcmpeqd	xmm3, xmm0                      # all-ones where the dword is 0
 30359  	pxor	xmm3, xmm1                      # invert: all-ones where nonzero
 30360  	pmovzxdq	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero
 30361  	pand	xmm3, xmm2                      # reduce to 1 or 0 per qword
 30362  	pcmpeqd	xmm4, xmm0
 30363  	pxor	xmm4, xmm1
 30364  	pmovzxdq	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero
 30365  	pand	xmm4, xmm2
 30366  	movdqu	xmmword ptr [r8 + 8*rsi], xmm3
 30367  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm4
 30368  	movq	xmm3, qword ptr [rcx + 4*rsi + 16] # xmm3 = mem[0],zero
 30369  	movq	xmm4, qword ptr [rcx + 4*rsi + 24] # xmm4 = mem[0],zero
 30370  	pcmpeqd	xmm3, xmm0
 30371  	pxor	xmm3, xmm1
 30372  	pmovzxdq	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero
 30373  	pand	xmm3, xmm2
 30374  	pcmpeqd	xmm4, xmm0
 30375  	pxor	xmm4, xmm1
 30376  	pmovzxdq	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero
 30377  	pand	xmm4, xmm2
 30378  	movdqu	xmmword ptr [r8 + 8*rsi + 32], xmm3
 30379  	movdqu	xmmword ptr [r8 + 8*rsi + 48], xmm4
 30380  	add	rsi, 8                          # 8 elements per iteration
 30381  	add	rdi, 2                          # ZF set when the counter reaches 0
 30382  	jne	.LBB4_675
 30383  	jmp	.LBB4_1000
 30384  .LBB4_676:                              # Vector path: double in -> 64-bit out, out[i] = (x != 0) ? (unsigned)copysign(1.0, x) : 0; 4 elements per loop pass
 30385  	mov	esi, r10d                       # r10d is set outside this view; presumably the element count
 30386  	and	esi, -2                         # rsi = that value rounded down to a multiple of 2
 30387  	lea	rax, [rsi - 2]
 30388  	mov	r9, rax
 30389  	shr	r9
 30390  	add	r9, 1                           # r9 = rsi / 2 = number of 2-element groups
 30391  	test	rax, rax
 30392  	je	.LBB4_1004                      # exactly one group: bypass the two-groups-per-pass loop
 30393  # %bb.677:
 30394  	mov	r14, r9
 30395  	and	r14, -2
 30396  	neg	r14                             # r14 = -(group count rounded down to even); stepped by +2 up to zero
 30397  	xor	edi, edi                        # rdi = element index
 30398  	xorpd	xmm0, xmm0                      # xmm0 = 0.0 comparand
 30399  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 30400  	movapd	xmm2, xmmword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0]
 30401  	movsd	xmm3, qword ptr [rip + .LCPI4_6] # xmm3 = mem[0],zero
 30402  .LBB4_678:                              # =>This Inner Loop Header: Depth=1
 30403  	movupd	xmm4, xmmword ptr [rcx + 8*rdi]
 30404  	movapd	xmm5, xmm4
 30405  	andpd	xmm5, xmm1                      # isolate the sign bit of each input
 30406  	orpd	xmm5, xmm2                      # xmm5 = 1.0 carrying the input's sign (+1.0 or -1.0)
 30407  	movapd	xmm6, xmm5
 30408  	subsd	xmm6, xmm3                      # low lane minus threshold (.LCPI4_6 not visible here; presumably 2^63)
 30409  	cvttsd2si	rbx, xmm6
 30410  	xor	rbx, r11                        # r11 is set outside this view; presumably 0x8000000000000000
 30411  	cvttsd2si	rdx, xmm5                       # plain truncating conversion of the low lane
 30412  	ucomisd	xmm5, xmm3
 30413  	cmovae	rdx, rbx                        # use the adjusted result when value >= threshold
 30414  	pshufd	xmm5, xmm5, 238                 # xmm5 = xmm5[2,3,2,3]
 30415  	movdqa	xmm6, xmm5
 30416  	subsd	xmm6, xmm3                      # same conversion sequence for the high lane
 30417  	cvttsd2si	rbx, xmm6
 30418  	xor	rbx, r11
 30419  	cvttsd2si	rax, xmm5
 30420  	ucomisd	xmm5, xmm3
 30421  	cmovae	rax, rbx
 30422  	movq	xmm5, rdx
 30423  	movq	xmm6, rax
 30424  	punpcklqdq	xmm5, xmm6              # xmm5 = xmm5[0],xmm6[0]
 30425  	cmpneqpd	xmm4, xmm0                      # mask = all ones where input != 0.0
 30426  	andpd	xmm4, xmm5                      # zero the result wherever the input compared equal to 0.0
 30427  	movupd	xmmword ptr [r8 + 8*rdi], xmm4  # store two 64-bit results
 30428  	movupd	xmm4, xmmword ptr [rcx + 8*rdi + 16] # second unrolled half: elements rdi+2, rdi+3
 30429  	movapd	xmm5, xmm4
 30430  	andpd	xmm5, xmm1
 30431  	orpd	xmm5, xmm2
 30432  	movapd	xmm6, xmm5
 30433  	subsd	xmm6, xmm3
 30434  	cvttsd2si	rax, xmm6
 30435  	xor	rax, r11
 30436  	cvttsd2si	rdx, xmm5
 30437  	ucomisd	xmm5, xmm3
 30438  	cmovae	rdx, rax
 30439  	pshufd	xmm5, xmm5, 238                 # xmm5 = xmm5[2,3,2,3]
 30440  	movdqa	xmm6, xmm5
 30441  	subsd	xmm6, xmm3
 30442  	cvttsd2si	rax, xmm6
 30443  	xor	rax, r11
 30444  	cvttsd2si	rbx, xmm5
 30445  	ucomisd	xmm5, xmm3
 30446  	cmovae	rbx, rax
 30447  	movq	xmm5, rdx
 30448  	movq	xmm6, rbx
 30449  	punpcklqdq	xmm5, xmm6              # xmm5 = xmm5[0],xmm6[0]
 30450  	cmpneqpd	xmm4, xmm0
 30451  	andpd	xmm4, xmm5
 30452  	movupd	xmmword ptr [r8 + 8*rdi + 16], xmm4
 30453  	add	rdi, 4                          # advance index by the 4 elements just handled
 30454  	add	r14, 2                          # two groups consumed; sets ZF when the counter reaches zero
 30455  	jne	.LBB4_678
 30456  	jmp	.LBB4_1005                      # target is outside this view; presumably handles the leftover group/tail
 30457  .LBB4_689:                              # Vector path: 16-bit in -> 64-bit out, out[i] = (in[i] != 0) ? 1 : 0; 8 elements per loop pass
 30458  	mov	edx, eax                        # eax is set outside this view; presumably the element count
 30459  	and	edx, -4                         # rdx = that value rounded down to a multiple of 4
 30460  	lea	rsi, [rdx - 4]
 30461  	mov	r9, rsi
 30462  	shr	r9, 2
 30463  	add	r9, 1                           # r9 = rdx / 4 = number of 4-element groups
 30464  	test	rsi, rsi
 30465  	je	.LBB4_1010                      # exactly one group: bypass the two-groups-per-pass loop
 30466  # %bb.690:
 30467  	mov	rdi, r9
 30468  	and	rdi, -2
 30469  	neg	rdi                             # rdi = -(group count rounded down to even); stepped by +2 up to zero
 30470  	xor	esi, esi                        # rsi = element index
 30471  	pxor	xmm0, xmm0                      # xmm0 = all-zero comparand
 30472  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones, used to invert masks
 30473  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_15] # xmm2 = [1,1]
 30474  .LBB4_691:                              # =>This Inner Loop Header: Depth=1
 30475  	movd	xmm3, dword ptr [rcx + 2*rsi]   # xmm3 = mem[0],zero,zero,zero
 30476  	movd	xmm4, dword ptr [rcx + 2*rsi + 4] # xmm4 = mem[0],zero,zero,zero
 30477  	pcmpeqw	xmm3, xmm0                      # word lane = all ones where input == 0
 30478  	pxor	xmm3, xmm1                      # invert: all ones where input != 0
 30479  	pmovzxwq	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
 30480  	pand	xmm3, xmm2                      # keep only bit 0: 1 for non-zero input, 0 otherwise
 30481  	pcmpeqw	xmm4, xmm0
 30482  	pxor	xmm4, xmm1
 30483  	pmovzxwq	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 30484  	pand	xmm4, xmm2
 30485  	movdqu	xmmword ptr [r8 + 8*rsi], xmm3  # store two 64-bit results
 30486  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm4
 30487  	movd	xmm3, dword ptr [rcx + 2*rsi + 8] # xmm3 = mem[0],zero,zero,zero
 30488  	movd	xmm4, dword ptr [rcx + 2*rsi + 12] # xmm4 = mem[0],zero,zero,zero
 30489  	pcmpeqw	xmm3, xmm0                      # second unrolled half: elements rsi+4 .. rsi+7
 30490  	pxor	xmm3, xmm1
 30491  	pmovzxwq	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
 30492  	pand	xmm3, xmm2
 30493  	pcmpeqw	xmm4, xmm0
 30494  	pxor	xmm4, xmm1
 30495  	pmovzxwq	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 30496  	pand	xmm4, xmm2
 30497  	movdqu	xmmword ptr [r8 + 8*rsi + 32], xmm3
 30498  	movdqu	xmmword ptr [r8 + 8*rsi + 48], xmm4
 30499  	add	rsi, 8                          # advance index by the 8 elements just handled
 30500  	add	rdi, 2                          # two groups consumed; sets ZF when the counter reaches zero
 30501  	jne	.LBB4_691
 30502  	jmp	.LBB4_1011                      # target is outside this view; presumably handles the leftover group/tail
 30503  .LBB4_692:                              # Vector path: signed 16-bit in -> 64-bit out, out[i] = 1 if in > 0, 0 if in == 0, -1 otherwise; 8 elements per loop pass
 30504  	mov	edx, r10d                       # r10d is set outside this view; presumably the element count
 30505  	and	edx, -4                         # rdx = that value rounded down to a multiple of 4
 30506  	lea	rsi, [rdx - 4]
 30507  	mov	r9, rsi
 30508  	shr	r9, 2
 30509  	add	r9, 1                           # r9 = rdx / 4 = number of 4-element groups
 30510  	test	rsi, rsi
 30511  	je	.LBB4_1015                      # exactly one group: bypass the two-groups-per-pass loop
 30512  # %bb.693:
 30513  	mov	rdi, r9
 30514  	and	rdi, -2
 30515  	neg	rdi                             # rdi = -(group count rounded down to even); stepped by +2 up to zero
 30516  	xor	esi, esi                        # rsi = element index
 30517  	pxor	xmm2, xmm2                      # xmm2 = all-zero comparand
 30518  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones, used to invert masks
 30519  	movapd	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 30520  .LBB4_694:                              # =>This Inner Loop Header: Depth=1
 30521  	movd	xmm5, dword ptr [rcx + 2*rsi]   # xmm5 = mem[0],zero,zero,zero
 30522  	movd	xmm6, dword ptr [rcx + 2*rsi + 4] # xmm6 = mem[0],zero,zero,zero
 30523  	movdqa	xmm0, xmm5
 30524  	pcmpgtw	xmm0, xmm2                      # signed compare: all ones where input > 0
 30525  	pmovsxwq	xmm0, xmm0                      # widen the "> 0" mask to 64-bit lanes
 30526  	movdqa	xmm1, xmm6
 30527  	pcmpgtw	xmm1, xmm2
 30528  	pmovsxwq	xmm1, xmm1
 30529  	pcmpeqw	xmm5, xmm2                      # all ones where input == 0
 30530  	pxor	xmm5, xmm3                      # invert: all ones where input != 0
 30531  	pmovsxwq	xmm5, xmm5                      # sign-extend: -1 for non-zero input, 0 for zero
 30532  	pcmpeqw	xmm6, xmm2
 30533  	pxor	xmm6, xmm3
 30534  	pmovsxwq	xmm6, xmm6
 30535  	blendvpd	xmm5, xmm4, xmm0                # replace with 1 in lanes where the xmm0 mask (> 0) is set
 30536  	movdqa	xmm0, xmm1                      # blendvpd takes its mask from xmm0, so move the second mask there
 30537  	blendvpd	xmm6, xmm4, xmm0
 30538  	movupd	xmmword ptr [r8 + 8*rsi], xmm5  # store two 64-bit results
 30539  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm6
 30540  	movd	xmm5, dword ptr [rcx + 2*rsi + 8] # xmm5 = mem[0],zero,zero,zero
 30541  	movd	xmm6, dword ptr [rcx + 2*rsi + 12] # xmm6 = mem[0],zero,zero,zero
 30542  	movdqa	xmm0, xmm5                      # second unrolled half: elements rsi+4 .. rsi+7
 30543  	pcmpgtw	xmm0, xmm2
 30544  	pmovsxwq	xmm0, xmm0
 30545  	movdqa	xmm1, xmm6
 30546  	pcmpgtw	xmm1, xmm2
 30547  	pmovsxwq	xmm1, xmm1
 30548  	pcmpeqw	xmm5, xmm2
 30549  	pxor	xmm5, xmm3
 30550  	pmovsxwq	xmm5, xmm5
 30551  	pcmpeqw	xmm6, xmm2
 30552  	pxor	xmm6, xmm3
 30553  	pmovsxwq	xmm6, xmm6
 30554  	blendvpd	xmm5, xmm4, xmm0
 30555  	movdqa	xmm0, xmm1
 30556  	blendvpd	xmm6, xmm4, xmm0
 30557  	movupd	xmmword ptr [r8 + 8*rsi + 32], xmm5
 30558  	movupd	xmmword ptr [r8 + 8*rsi + 48], xmm6
 30559  	add	rsi, 8                          # advance index by the 8 elements just handled
 30560  	add	rdi, 2                          # two groups consumed; sets ZF when the counter reaches zero
 30561  	jne	.LBB4_694
 30562  	jmp	.LBB4_1016                      # target is outside this view; presumably handles the leftover group/tail
 30563  .LBB4_700:                              # Scalar path: float in -> 64-bit out, 0 for input equal to 0.0, else +/-1.0 (by sign bit) converted to unsigned 64-bit; 2 elements per loop pass
 30564  	mov	esi, r10d                       # r10d is set outside this view; presumably the element count
 30565  	and	esi, -2                         # rsi = that value rounded down to even = loop end index
 30566  	xor	eax, eax                        # rax = element index
 30567  	xorps	xmm0, xmm0                      # xmm0 = 0.0 comparand
 30568  	movss	xmm1, dword ptr [rip + .LCPI4_9] # xmm1 = mem[0],zero,zero,zero
 30569  	movabs	r9, -9223372036854775808        # r9 = 0x8000000000000000, XORed into the above-threshold conversion
 30570  	jmp	.LBB4_703
 30571  .LBB4_701:                              #   in Loop: Header=BB4_703 Depth=1
 30572  	movmskps	edx, xmm2                       # non-zero second element: gather sign bits
 30573  	and	edx, 1                          # keep the sign bit of lane 0
 30574  	neg	edx                             # 0 -> 0, 1 -> -1
 30575  	or	edx, 1                          # edx = +1 (sign clear) or -1 (sign set)
 30576  	xorps	xmm2, xmm2
 30577  	cvtsi2ss	xmm2, edx                       # xmm2 = +1.0f or -1.0f
 30578  	movaps	xmm3, xmm2
 30579  	subss	xmm3, xmm1                      # value minus threshold (.LCPI4_9 not visible here; presumably 2^63)
 30580  	cvttss2si	rdi, xmm3
 30581  	xor	rdi, r9
 30582  	cvttss2si	rdx, xmm2                       # plain truncating conversion
 30583  	ucomiss	xmm2, xmm1
 30584  	cmovae	rdx, rdi                        # use the adjusted result when value >= threshold
 30585  	mov	qword ptr [r8 + 8*rax + 8], rdx # out[rax + 1]
 30586  	add	rax, 2
 30587  	cmp	rsi, rax
 30588  	je	.LBB4_290                       # all pairs done
 30589  .LBB4_703:                              # =>This Inner Loop Header: Depth=1
 30590  	movss	xmm2, dword ptr [rcx + 4*rax]   # xmm2 = mem[0],zero,zero,zero
 30591  	ucomiss	xmm0, xmm2                      # ZF is set for equal and also for unordered (NaN) operands
 30592  	jne	.LBB4_705                       # not equal to 0.0: compute the signed unit value
 30593  # %bb.704:                              #   in Loop: Header=BB4_703 Depth=1
 30594  	xor	edx, edx                        # result 0 for the first element
 30595  	jmp	.LBB4_706
 30596  .LBB4_705:                              #   in Loop: Header=BB4_703 Depth=1
 30597  	movmskps	edx, xmm2                       # same sign -> +/-1.0 -> integer sequence as .LBB4_701
 30598  	and	edx, 1
 30599  	neg	edx
 30600  	or	edx, 1
 30601  	xorps	xmm2, xmm2
 30602  	cvtsi2ss	xmm2, edx
 30603  	movaps	xmm3, xmm2
 30604  	subss	xmm3, xmm1
 30605  	cvttss2si	rdi, xmm3
 30606  	xor	rdi, r9
 30607  	cvttss2si	rdx, xmm2
 30608  	ucomiss	xmm2, xmm1
 30609  	cmovae	rdx, rdi
 30610  .LBB4_706:                              #   in Loop: Header=BB4_703 Depth=1
 30611  	mov	qword ptr [r8 + 8*rax], rdx     # out[rax]
 30612  	movss	xmm2, dword ptr [rcx + 4*rax + 4] # xmm2 = mem[0],zero,zero,zero
 30613  	ucomiss	xmm0, xmm2
 30614  	jne	.LBB4_701                       # second element of the pair is non-zero
 30615  # %bb.707:                              #   in Loop: Header=BB4_703 Depth=1
 30616  	xor	edx, edx                        # result 0 for the second element
 30617  	mov	qword ptr [r8 + 8*rax + 8], rdx
 30618  	add	rax, 2
 30619  	cmp	rsi, rax
 30620  	jne	.LBB4_703
 30621  .LBB4_290:                              # after the paired loop; rax = index of a possible last element
 30622  	test	r10b, 1
 30623  	je	.LBB4_1655                      # bit 0 of r10 clear: no leftover element (target outside this view)
 30624  # %bb.291:
 30625  	movss	xmm0, dword ptr [rcx + 4*rax]   # xmm0 = mem[0],zero,zero,zero
 30626  	xorps	xmm1, xmm1
 30627  	ucomiss	xmm1, xmm0
 30628  	jne	.LBB4_993                       # non-zero last element is handled outside this view
 30629  # %bb.292:
 30630  	xor	ecx, ecx                        # last element compared equal to 0.0 (or unordered): store 0
 30631  	mov	qword ptr [r8 + 8*rax], rcx
 30632  	jmp	.LBB4_1655
 30633  .LBB4_713:                              # Vector path: signed 32-bit in -> 64-bit out, out[i] = 1 if in > 0, 0 if in == 0, -1 otherwise; 8 elements per loop pass
 30634  	mov	edx, r10d                       # r10d is set outside this view; presumably the element count
 30635  	and	edx, -4                         # rdx = that value rounded down to a multiple of 4
 30636  	lea	rsi, [rdx - 4]
 30637  	mov	r9, rsi
 30638  	shr	r9, 2
 30639  	add	r9, 1                           # r9 = rdx / 4 = number of 4-element groups
 30640  	test	rsi, rsi
 30641  	je	.LBB4_1021                      # exactly one group: bypass the two-groups-per-pass loop
 30642  # %bb.714:
 30643  	mov	rdi, r9
 30644  	and	rdi, -2
 30645  	neg	rdi                             # rdi = -(group count rounded down to even); stepped by +2 up to zero
 30646  	xor	esi, esi                        # rsi = element index
 30647  	pxor	xmm2, xmm2                      # xmm2 = all-zero comparand
 30648  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones, used to invert masks
 30649  	movapd	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 30650  .LBB4_715:                              # =>This Inner Loop Header: Depth=1
 30651  	movq	xmm5, qword ptr [rcx + 4*rsi]   # xmm5 = mem[0],zero
 30652  	movq	xmm6, qword ptr [rcx + 4*rsi + 8] # xmm6 = mem[0],zero
 30653  	movdqa	xmm0, xmm5
 30654  	pcmpgtd	xmm0, xmm2                      # signed compare: all ones where input > 0
 30655  	pmovsxdq	xmm0, xmm0                      # widen the "> 0" mask to 64-bit lanes
 30656  	movdqa	xmm1, xmm6
 30657  	pcmpgtd	xmm1, xmm2
 30658  	pmovsxdq	xmm1, xmm1
 30659  	pcmpeqd	xmm5, xmm2                      # all ones where input == 0
 30660  	pxor	xmm5, xmm3                      # invert: all ones where input != 0
 30661  	pmovsxdq	xmm5, xmm5                      # sign-extend: -1 for non-zero input, 0 for zero
 30662  	pcmpeqd	xmm6, xmm2
 30663  	pxor	xmm6, xmm3
 30664  	pmovsxdq	xmm6, xmm6
 30665  	blendvpd	xmm5, xmm4, xmm0                # replace with 1 in lanes where the xmm0 mask (> 0) is set
 30666  	movdqa	xmm0, xmm1                      # blendvpd takes its mask from xmm0, so move the second mask there
 30667  	blendvpd	xmm6, xmm4, xmm0
 30668  	movupd	xmmword ptr [r8 + 8*rsi], xmm5  # store two 64-bit results
 30669  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm6
 30670  	movq	xmm5, qword ptr [rcx + 4*rsi + 16] # xmm5 = mem[0],zero
 30671  	movq	xmm6, qword ptr [rcx + 4*rsi + 24] # xmm6 = mem[0],zero
 30672  	movdqa	xmm0, xmm5                      # second unrolled half: elements rsi+4 .. rsi+7
 30673  	pcmpgtd	xmm0, xmm2
 30674  	pmovsxdq	xmm0, xmm0
 30675  	movdqa	xmm1, xmm6
 30676  	pcmpgtd	xmm1, xmm2
 30677  	pmovsxdq	xmm1, xmm1
 30678  	pcmpeqd	xmm5, xmm2
 30679  	pxor	xmm5, xmm3
 30680  	pmovsxdq	xmm5, xmm5
 30681  	pcmpeqd	xmm6, xmm2
 30682  	pxor	xmm6, xmm3
 30683  	pmovsxdq	xmm6, xmm6
 30684  	blendvpd	xmm5, xmm4, xmm0
 30685  	movdqa	xmm0, xmm1
 30686  	blendvpd	xmm6, xmm4, xmm0
 30687  	movupd	xmmword ptr [r8 + 8*rsi + 32], xmm5
 30688  	movupd	xmmword ptr [r8 + 8*rsi + 48], xmm6
 30689  	add	rsi, 8                          # advance index by the 8 elements just handled
 30690  	add	rdi, 2                          # two groups consumed; sets ZF when the counter reaches zero
 30691  	jne	.LBB4_715
 30692  	jmp	.LBB4_1022                      # target is outside this view; presumably handles the leftover group/tail
 30693  .LBB4_716:                              # Vector path: 32-bit in -> 16-bit out, out[i] = (in[i] != 0) ? 1 : 0; 16 elements per loop pass
 30694  	mov	edx, eax                        # eax is set outside this view; presumably the element count
 30695  	and	edx, -8                         # rdx = that value rounded down to a multiple of 8
 30696  	lea	rsi, [rdx - 8]
 30697  	mov	r9, rsi
 30698  	shr	r9, 3
 30699  	add	r9, 1                           # r9 = rdx / 8 = number of 8-element groups
 30700  	test	rsi, rsi
 30701  	je	.LBB4_1137                      # exactly one group: bypass the two-groups-per-pass loop
 30702  # %bb.717:
 30703  	mov	rdi, r9
 30704  	and	rdi, -2
 30705  	neg	rdi                             # rdi = -(group count rounded down to even); stepped by +2 up to zero
 30706  	xor	esi, esi                        # rsi = element index
 30707  	pxor	xmm0, xmm0                      # xmm0 = all-zero comparand
 30708  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones, used to invert masks
 30709  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_11] # xmm2 = <1,1,1,1,u,u,u,u>
 30710  .LBB4_718:                              # =>This Inner Loop Header: Depth=1
 30711  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi]
 30712  	movdqu	xmm4, xmmword ptr [rcx + 4*rsi + 16]
 30713  	pcmpeqd	xmm3, xmm0                      # dword lane = all ones where input == 0
 30714  	pxor	xmm3, xmm1                      # invert: all ones where input != 0
 30715  	packssdw	xmm3, xmm3                      # narrow the four dword masks to four word masks
 30716  	pand	xmm3, xmm2                      # keep only bit 0 of each word: 1 for non-zero input
 30717  	pcmpeqd	xmm4, xmm0
 30718  	pxor	xmm4, xmm1
 30719  	packssdw	xmm4, xmm4
 30720  	pand	xmm4, xmm2
 30721  	punpcklqdq	xmm3, xmm4              # xmm3 = xmm3[0],xmm4[0]
 30722  	movdqu	xmmword ptr [r8 + 2*rsi], xmm3  # store eight 16-bit results
 30723  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 32] # second unrolled half: elements rsi+8 .. rsi+15
 30724  	movdqu	xmm4, xmmword ptr [rcx + 4*rsi + 48]
 30725  	pcmpeqd	xmm3, xmm0
 30726  	pxor	xmm3, xmm1
 30727  	packssdw	xmm3, xmm3
 30728  	pand	xmm3, xmm2
 30729  	pcmpeqd	xmm4, xmm0
 30730  	pxor	xmm4, xmm1
 30731  	packssdw	xmm4, xmm4
 30732  	pand	xmm4, xmm2
 30733  	punpcklqdq	xmm3, xmm4              # xmm3 = xmm3[0],xmm4[0]
 30734  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm3
 30735  	add	rsi, 16                         # advance index by the 16 elements just handled
 30736  	add	rdi, 2                          # two groups consumed; sets ZF when the counter reaches zero
 30737  	jne	.LBB4_718
 30738  	jmp	.LBB4_1138                      # target is outside this view; presumably handles the leftover group/tail
 30739  .LBB4_719:                              # Vector path: 32-bit in -> 16-bit out, out[i] = (in[i] != 0) ? 1 : 0; 16 elements per loop pass
 30740  	mov	edx, eax                        # eax is set outside this view; presumably the element count
 30741  	and	edx, -8                         # rdx = that value rounded down to a multiple of 8
 30742  	lea	rsi, [rdx - 8]
 30743  	mov	r9, rsi
 30744  	shr	r9, 3
 30745  	add	r9, 1                           # r9 = rdx / 8 = number of 8-element groups
 30746  	test	rsi, rsi
 30747  	je	.LBB4_1142                      # exactly one group: bypass the two-groups-per-pass loop
 30748  # %bb.720:
 30749  	mov	rdi, r9
 30750  	and	rdi, -2
 30751  	neg	rdi                             # rdi = -(group count rounded down to even); stepped by +2 up to zero
 30752  	xor	esi, esi                        # rsi = element index
 30753  	pxor	xmm0, xmm0                      # xmm0 = all-zero comparand
 30754  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones, used to invert masks
 30755  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_11] # xmm2 = <1,1,1,1,u,u,u,u>
 30756  .LBB4_721:                              # =>This Inner Loop Header: Depth=1
 30757  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi]
 30758  	movdqu	xmm4, xmmword ptr [rcx + 4*rsi + 16]
 30759  	pcmpeqd	xmm3, xmm0                      # dword lane = all ones where input == 0
 30760  	pxor	xmm3, xmm1                      # invert: all ones where input != 0
 30761  	packssdw	xmm3, xmm3                      # narrow the four dword masks to four word masks
 30762  	pand	xmm3, xmm2                      # keep only bit 0 of each word: 1 for non-zero input
 30763  	pcmpeqd	xmm4, xmm0
 30764  	pxor	xmm4, xmm1
 30765  	packssdw	xmm4, xmm4
 30766  	pand	xmm4, xmm2
 30767  	punpcklqdq	xmm3, xmm4              # xmm3 = xmm3[0],xmm4[0]
 30768  	movdqu	xmmword ptr [r8 + 2*rsi], xmm3  # store eight 16-bit results
 30769  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 32] # second unrolled half: elements rsi+8 .. rsi+15
 30770  	movdqu	xmm4, xmmword ptr [rcx + 4*rsi + 48]
 30771  	pcmpeqd	xmm3, xmm0
 30772  	pxor	xmm3, xmm1
 30773  	packssdw	xmm3, xmm3
 30774  	pand	xmm3, xmm2
 30775  	pcmpeqd	xmm4, xmm0
 30776  	pxor	xmm4, xmm1
 30777  	packssdw	xmm4, xmm4
 30778  	pand	xmm4, xmm2
 30779  	punpcklqdq	xmm3, xmm4              # xmm3 = xmm3[0],xmm4[0]
 30780  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm3
 30781  	add	rsi, 16                         # advance index by the 16 elements just handled
 30782  	add	rdi, 2                          # two groups consumed; sets ZF when the counter reaches zero
 30783  	jne	.LBB4_721
 30784  	jmp	.LBB4_1143                      # target is outside this view; presumably handles the leftover group/tail
 30785  .LBB4_722:                              # Vector path: double in -> 16-bit out, out[i] = 0 where x == 0.0, else truncated copysign(1.0, x); 8 elements per loop pass
 30786  	mov	esi, eax                        # eax is set outside this view; presumably the element count
 30787  	and	esi, -4                         # rsi = that value rounded down to a multiple of 4
 30788  	lea	rdx, [rsi - 4]
 30789  	mov	r9, rdx
 30790  	shr	r9, 2
 30791  	add	r9, 1                           # r9 = rsi / 4 = number of 4-element groups
 30792  	test	rdx, rdx
 30793  	je	.LBB4_1147                      # exactly one group: bypass the two-groups-per-pass loop
 30794  # %bb.723:
 30795  	mov	rdx, r9
 30796  	and	rdx, -2
 30797  	neg	rdx                             # rdx = -(group count rounded down to even); stepped by +2 up to zero
 30798  	xor	edi, edi                        # rdi = element index
 30799  	xorpd	xmm2, xmm2                      # xmm2 = 0.0 comparand, also the zero blend source
 30800  	movapd	xmm3, xmmword ptr [rip + .LCPI4_0] # xmm3 = [-0.0E+0,-0.0E+0]
 30801  	movapd	xmm4, xmmword ptr [rip + .LCPI4_1] # xmm4 = [1.0E+0,1.0E+0]
 30802  .LBB4_724:                              # =>This Inner Loop Header: Depth=1
 30803  	movupd	xmm5, xmmword ptr [rcx + 8*rdi]
 30804  	movupd	xmm6, xmmword ptr [rcx + 8*rdi + 16]
 30805  	movapd	xmm0, xmm5
 30806  	cmpeqpd	xmm0, xmm2                      # 64-bit mask: all ones where x == 0.0
 30807  	packssdw	xmm0, xmm0                      # two packs narrow each 64-bit mask ...
 30808  	packssdw	xmm0, xmm0                      # ... down to one 16-bit mask per element
 30809  	movapd	xmm1, xmm6
 30810  	cmpeqpd	xmm1, xmm2
 30811  	packssdw	xmm1, xmm1
 30812  	packssdw	xmm1, xmm1
 30813  	andpd	xmm5, xmm3                      # isolate the sign bit of each input
 30814  	orpd	xmm5, xmm4                      # xmm5 = 1.0 carrying the input's sign
 30815  	andpd	xmm6, xmm3
 30816  	orpd	xmm6, xmm4
 30817  	cvttpd2dq	xmm5, xmm5                      # truncate to int32: +1 or -1
 30818  	pshuflw	xmm5, xmm5, 232                 # xmm5 = xmm5[0,2,2,3,4,5,6,7]
 30819  	cvttpd2dq	xmm6, xmm6
 30820  	pshuflw	xmm6, xmm6, 232                 # xmm6 = xmm6[0,2,2,3,4,5,6,7]
 30821  	pblendvb	xmm5, xmm2, xmm0                # write 0 where the xmm0 mask (x == 0.0) is set
 30822  	movdqa	xmm0, xmm1                      # pblendvb takes its mask from xmm0, so move the second mask there
 30823  	pblendvb	xmm6, xmm2, xmm0
 30824  	movd	dword ptr [r8 + 2*rdi], xmm5    # store two 16-bit results
 30825  	movd	dword ptr [r8 + 2*rdi + 4], xmm6
 30826  	movupd	xmm5, xmmword ptr [rcx + 8*rdi + 32] # second unrolled half: elements rdi+4 .. rdi+7
 30827  	movupd	xmm6, xmmword ptr [rcx + 8*rdi + 48]
 30828  	movapd	xmm0, xmm5
 30829  	cmpeqpd	xmm0, xmm2
 30830  	packssdw	xmm0, xmm0
 30831  	packssdw	xmm0, xmm0
 30832  	movapd	xmm1, xmm6
 30833  	cmpeqpd	xmm1, xmm2
 30834  	packssdw	xmm1, xmm1
 30835  	packssdw	xmm1, xmm1
 30836  	andpd	xmm5, xmm3
 30837  	orpd	xmm5, xmm4
 30838  	andpd	xmm6, xmm3
 30839  	orpd	xmm6, xmm4
 30840  	cvttpd2dq	xmm5, xmm5
 30841  	pshuflw	xmm5, xmm5, 232                 # xmm5 = xmm5[0,2,2,3,4,5,6,7]
 30842  	cvttpd2dq	xmm6, xmm6
 30843  	pshuflw	xmm6, xmm6, 232                 # xmm6 = xmm6[0,2,2,3,4,5,6,7]
 30844  	pblendvb	xmm5, xmm2, xmm0
 30845  	movdqa	xmm0, xmm1
 30846  	pblendvb	xmm6, xmm2, xmm0
 30847  	movd	dword ptr [r8 + 2*rdi + 8], xmm5
 30848  	movd	dword ptr [r8 + 2*rdi + 12], xmm6
 30849  	add	rdi, 8                          # advance index by the 8 elements just handled
 30850  	add	rdx, 2                          # two groups consumed; sets ZF when the counter reaches zero
 30851  	jne	.LBB4_724
 30852  	jmp	.LBB4_1148                      # target is outside this view; presumably handles the leftover group/tail
 30853  .LBB4_725:                              # Vector path: double in -> 16-bit out, out[i] = 0 where x == 0.0, else truncated copysign(1.0, x); 8 elements per loop pass
 30854  	mov	esi, eax                        # eax is set outside this view; presumably the element count
 30855  	and	esi, -4                         # rsi = that value rounded down to a multiple of 4
 30856  	lea	rdx, [rsi - 4]
 30857  	mov	r9, rdx
 30858  	shr	r9, 2
 30859  	add	r9, 1                           # r9 = rsi / 4 = number of 4-element groups
 30860  	test	rdx, rdx
 30861  	je	.LBB4_1153                      # exactly one group: bypass the two-groups-per-pass loop
 30862  # %bb.726:
 30863  	mov	rdx, r9
 30864  	and	rdx, -2
 30865  	neg	rdx                             # rdx = -(group count rounded down to even); stepped by +2 up to zero
 30866  	xor	edi, edi                        # rdi = element index
 30867  	xorpd	xmm2, xmm2                      # xmm2 = 0.0 comparand, also the zero blend source
 30868  	movapd	xmm3, xmmword ptr [rip + .LCPI4_0] # xmm3 = [-0.0E+0,-0.0E+0]
 30869  	movapd	xmm4, xmmword ptr [rip + .LCPI4_1] # xmm4 = [1.0E+0,1.0E+0]
 30870  .LBB4_727:                              # =>This Inner Loop Header: Depth=1
 30871  	movupd	xmm5, xmmword ptr [rcx + 8*rdi]
 30872  	movupd	xmm6, xmmword ptr [rcx + 8*rdi + 16]
 30873  	movapd	xmm0, xmm5
 30874  	cmpeqpd	xmm0, xmm2                      # 64-bit mask: all ones where x == 0.0
 30875  	packssdw	xmm0, xmm0                      # two packs narrow each 64-bit mask ...
 30876  	packssdw	xmm0, xmm0                      # ... down to one 16-bit mask per element
 30877  	movapd	xmm1, xmm6
 30878  	cmpeqpd	xmm1, xmm2
 30879  	packssdw	xmm1, xmm1
 30880  	packssdw	xmm1, xmm1
 30881  	andpd	xmm5, xmm3                      # isolate the sign bit of each input
 30882  	orpd	xmm5, xmm4                      # xmm5 = 1.0 carrying the input's sign
 30883  	andpd	xmm6, xmm3
 30884  	orpd	xmm6, xmm4
 30885  	cvttpd2dq	xmm5, xmm5                      # truncate to int32: +1 or -1
 30886  	pshuflw	xmm5, xmm5, 232                 # xmm5 = xmm5[0,2,2,3,4,5,6,7]
 30887  	cvttpd2dq	xmm6, xmm6
 30888  	pshuflw	xmm6, xmm6, 232                 # xmm6 = xmm6[0,2,2,3,4,5,6,7]
 30889  	pblendvb	xmm5, xmm2, xmm0                # write 0 where the xmm0 mask (x == 0.0) is set
 30890  	movdqa	xmm0, xmm1                      # pblendvb takes its mask from xmm0, so move the second mask there
 30891  	pblendvb	xmm6, xmm2, xmm0
 30892  	movd	dword ptr [r8 + 2*rdi], xmm5    # store two 16-bit results
 30893  	movd	dword ptr [r8 + 2*rdi + 4], xmm6
 30894  	movupd	xmm5, xmmword ptr [rcx + 8*rdi + 32] # second unrolled half: elements rdi+4 .. rdi+7
 30895  	movupd	xmm6, xmmword ptr [rcx + 8*rdi + 48]
 30896  	movapd	xmm0, xmm5
 30897  	cmpeqpd	xmm0, xmm2
 30898  	packssdw	xmm0, xmm0
 30899  	packssdw	xmm0, xmm0
 30900  	movapd	xmm1, xmm6
 30901  	cmpeqpd	xmm1, xmm2
 30902  	packssdw	xmm1, xmm1
 30903  	packssdw	xmm1, xmm1
 30904  	andpd	xmm5, xmm3
 30905  	orpd	xmm5, xmm4
 30906  	andpd	xmm6, xmm3
 30907  	orpd	xmm6, xmm4
 30908  	cvttpd2dq	xmm5, xmm5
 30909  	pshuflw	xmm5, xmm5, 232                 # xmm5 = xmm5[0,2,2,3,4,5,6,7]
 30910  	cvttpd2dq	xmm6, xmm6
 30911  	pshuflw	xmm6, xmm6, 232                 # xmm6 = xmm6[0,2,2,3,4,5,6,7]
 30912  	pblendvb	xmm5, xmm2, xmm0
 30913  	movdqa	xmm0, xmm1
 30914  	pblendvb	xmm6, xmm2, xmm0
 30915  	movd	dword ptr [r8 + 2*rdi + 8], xmm5
 30916  	movd	dword ptr [r8 + 2*rdi + 12], xmm6
 30917  	add	rdi, 8                          # advance index by the 8 elements just handled
 30918  	add	rdx, 2                          # two groups consumed; sets ZF when the counter reaches zero
 30919  	jne	.LBB4_727
 30920  	jmp	.LBB4_1154                      # target is outside this view; presumably handles the leftover group/tail
 30921  .LBB4_738:                              # Vector path: 64-bit in -> 16-bit out, out[i] = (in[i] != 0) ? 1 : 0; 8 elements per loop pass
 30922  	mov	edx, eax                        # eax is set outside this view; presumably the element count
 30923  	and	edx, -4                         # rdx = that value rounded down to a multiple of 4
 30924  	lea	rsi, [rdx - 4]
 30925  	mov	r9, rsi
 30926  	shr	r9, 2
 30927  	add	r9, 1                           # r9 = rdx / 4 = number of 4-element groups
 30928  	test	rsi, rsi
 30929  	je	.LBB4_1027                      # exactly one group: bypass the two-groups-per-pass loop
 30930  # %bb.739:
 30931  	mov	rdi, r9
 30932  	and	rdi, -2
 30933  	neg	rdi                             # rdi = -(group count rounded down to even); stepped by +2 up to zero
 30934  	xor	esi, esi                        # rsi = element index
 30935  	pxor	xmm0, xmm0                      # xmm0 = all-zero comparand
 30936  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones, used to invert masks
 30937  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_17] # xmm2 = <1,1,u,u,u,u,u,u>
 30938  .LBB4_740:                              # =>This Inner Loop Header: Depth=1
 30939  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi]
 30940  	movdqu	xmm4, xmmword ptr [rcx + 8*rsi + 16]
 30941  	pcmpeqq	xmm3, xmm0                      # qword lane = all ones where input == 0
 30942  	pxor	xmm3, xmm1                      # invert: all ones where input != 0
 30943  	packssdw	xmm3, xmm3                      # two packs narrow each 64-bit mask ...
 30944  	packssdw	xmm3, xmm3                      # ... down to one 16-bit mask per element
 30945  	pand	xmm3, xmm2                      # keep only bit 0 of each word: 1 for non-zero input
 30946  	pcmpeqq	xmm4, xmm0
 30947  	pxor	xmm4, xmm1
 30948  	packssdw	xmm4, xmm4
 30949  	packssdw	xmm4, xmm4
 30950  	pand	xmm4, xmm2
 30951  	movd	dword ptr [r8 + 2*rsi], xmm3    # store two 16-bit results
 30952  	movd	dword ptr [r8 + 2*rsi + 4], xmm4
 30953  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 32] # second unrolled half: elements rsi+4 .. rsi+7
 30954  	movdqu	xmm4, xmmword ptr [rcx + 8*rsi + 48]
 30955  	pcmpeqq	xmm3, xmm0
 30956  	pxor	xmm3, xmm1
 30957  	packssdw	xmm3, xmm3
 30958  	packssdw	xmm3, xmm3
 30959  	pand	xmm3, xmm2
 30960  	pcmpeqq	xmm4, xmm0
 30961  	pxor	xmm4, xmm1
 30962  	packssdw	xmm4, xmm4
 30963  	packssdw	xmm4, xmm4
 30964  	pand	xmm4, xmm2
 30965  	movd	dword ptr [r8 + 2*rsi + 8], xmm3
 30966  	movd	dword ptr [r8 + 2*rsi + 12], xmm4
 30967  	add	rsi, 8                          # advance index by the 8 elements just handled
 30968  	add	rdi, 2                          # two groups consumed; sets ZF when the counter reaches zero
 30969  	jne	.LBB4_740
 30970  	jmp	.LBB4_1028                      # target is outside this view; presumably handles the leftover group/tail
 30971  .LBB4_741:                              # Vector path: 64-bit in -> 16-bit out, out[i] = (in[i] != 0) ? 1 : 0; 8 elements per loop pass
 30972  	mov	edx, eax                        # eax is set outside this view; presumably the element count
 30973  	and	edx, -4                         # rdx = that value rounded down to a multiple of 4
 30974  	lea	rsi, [rdx - 4]
 30975  	mov	r9, rsi
 30976  	shr	r9, 2
 30977  	add	r9, 1                           # r9 = rdx / 4 = number of 4-element groups
 30978  	test	rsi, rsi
 30979  	je	.LBB4_1032                      # exactly one group: bypass the two-groups-per-pass loop
 30980  # %bb.742:
 30981  	mov	rdi, r9
 30982  	and	rdi, -2
 30983  	neg	rdi                             # rdi = -(group count rounded down to even); stepped by +2 up to zero
 30984  	xor	esi, esi                        # rsi = element index
 30985  	pxor	xmm0, xmm0                      # xmm0 = all-zero comparand
 30986  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones, used to invert masks
 30987  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_17] # xmm2 = <1,1,u,u,u,u,u,u>
 30988  .LBB4_743:                              # =>This Inner Loop Header: Depth=1
 30989  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi]
 30990  	movdqu	xmm4, xmmword ptr [rcx + 8*rsi + 16]
 30991  	pcmpeqq	xmm3, xmm0                      # qword lane = all ones where input == 0
 30992  	pxor	xmm3, xmm1                      # invert: all ones where input != 0
 30993  	packssdw	xmm3, xmm3                      # two packs narrow each 64-bit mask ...
 30994  	packssdw	xmm3, xmm3                      # ... down to one 16-bit mask per element
 30995  	pand	xmm3, xmm2                      # keep only bit 0 of each word: 1 for non-zero input
 30996  	pcmpeqq	xmm4, xmm0
 30997  	pxor	xmm4, xmm1
 30998  	packssdw	xmm4, xmm4
 30999  	packssdw	xmm4, xmm4
 31000  	pand	xmm4, xmm2
 31001  	movd	dword ptr [r8 + 2*rsi], xmm3    # store two 16-bit results
 31002  	movd	dword ptr [r8 + 2*rsi + 4], xmm4
 31003  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 32] # second unrolled half: elements rsi+4 .. rsi+7
 31004  	movdqu	xmm4, xmmword ptr [rcx + 8*rsi + 48]
 31005  	pcmpeqq	xmm3, xmm0
 31006  	pxor	xmm3, xmm1
 31007  	packssdw	xmm3, xmm3
 31008  	packssdw	xmm3, xmm3
 31009  	pand	xmm3, xmm2
 31010  	pcmpeqq	xmm4, xmm0
 31011  	pxor	xmm4, xmm1
 31012  	packssdw	xmm4, xmm4
 31013  	packssdw	xmm4, xmm4
 31014  	pand	xmm4, xmm2
 31015  	movd	dword ptr [r8 + 2*rsi + 8], xmm3
 31016  	movd	dword ptr [r8 + 2*rsi + 12], xmm4
 31017  	add	rsi, 8                          # advance index by the 8 elements just handled
 31018  	add	rdi, 2                          # two groups consumed; sets ZF when the counter reaches zero
 31019  	jne	.LBB4_743
 31020  	jmp	.LBB4_1033                      # target is outside this view; presumably handles the leftover group/tail
 31021  .LBB4_764:                              # Vector path: signed 64-bit in -> 16-bit out, out[i] = 1 if in > 0, 0 if in == 0, -1 otherwise; 8 elements per loop pass
 31022  	mov	edx, r10d                       # r10d is set outside this view; presumably the element count
 31023  	and	edx, -4                         # rdx = that value rounded down to a multiple of 4
 31024  	lea	rsi, [rdx - 4]
 31025  	mov	r9, rsi
 31026  	shr	r9, 2
 31027  	add	r9, 1                           # r9 = rdx / 4 = number of 4-element groups
 31028  	test	rsi, rsi
 31029  	je	.LBB4_1037                      # exactly one group: bypass the two-groups-per-pass loop
 31030  # %bb.765:
 31031  	mov	rdi, r9
 31032  	and	rdi, -2
 31033  	neg	rdi                             # rdi = -(group count rounded down to even); stepped by +2 up to zero
 31034  	xor	esi, esi                        # rsi = element index
 31035  	pxor	xmm2, xmm2                      # xmm2 = all-zero comparand
 31036  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones, used to invert masks
 31037  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_17] # xmm4 = <1,1,u,u,u,u,u,u>
 31038  .LBB4_766:                              # =>This Inner Loop Header: Depth=1
 31039  	movdqu	xmm5, xmmword ptr [rcx + 8*rsi]
 31040  	movdqu	xmm6, xmmword ptr [rcx + 8*rsi + 16]
 31041  	movdqa	xmm0, xmm5
 31042  	pcmpgtq	xmm0, xmm2                      # signed compare: all ones where input > 0
 31043  	packssdw	xmm0, xmm0                      # two packs narrow each 64-bit mask ...
 31044  	packssdw	xmm0, xmm0                      # ... down to one 16-bit mask per element
 31045  	movdqa	xmm1, xmm6
 31046  	pcmpgtq	xmm1, xmm2
 31047  	packssdw	xmm1, xmm1
 31048  	packssdw	xmm1, xmm1
 31049  	pcmpeqq	xmm5, xmm2                      # all ones where input == 0
 31050  	pxor	xmm5, xmm3                      # invert: all ones where input != 0
 31051  	packssdw	xmm5, xmm5                      # narrowed to words: -1 for non-zero input, 0 for zero
 31052  	packssdw	xmm5, xmm5
 31053  	pcmpeqq	xmm6, xmm2
 31054  	pxor	xmm6, xmm3
 31055  	packssdw	xmm6, xmm6
 31056  	packssdw	xmm6, xmm6
 31057  	pblendvb	xmm5, xmm4, xmm0                # replace with 1 where the xmm0 mask (> 0) is set
 31058  	movdqa	xmm0, xmm1                      # pblendvb takes its mask from xmm0, so move the second mask there
 31059  	pblendvb	xmm6, xmm4, xmm0
 31060  	movd	dword ptr [r8 + 2*rsi], xmm5    # store two 16-bit results
 31061  	movd	dword ptr [r8 + 2*rsi + 4], xmm6
 31062  	movdqu	xmm5, xmmword ptr [rcx + 8*rsi + 32] # second unrolled half: elements rsi+4 .. rsi+7
 31063  	movdqu	xmm6, xmmword ptr [rcx + 8*rsi + 48]
 31064  	movdqa	xmm0, xmm5
 31065  	pcmpgtq	xmm0, xmm2
 31066  	packssdw	xmm0, xmm0
 31067  	packssdw	xmm0, xmm0
 31068  	movdqa	xmm1, xmm6
 31069  	pcmpgtq	xmm1, xmm2
 31070  	packssdw	xmm1, xmm1
 31071  	packssdw	xmm1, xmm1
 31072  	pcmpeqq	xmm5, xmm2
 31073  	pxor	xmm5, xmm3
 31074  	packssdw	xmm5, xmm5
 31075  	packssdw	xmm5, xmm5
 31076  	pcmpeqq	xmm6, xmm2
 31077  	pxor	xmm6, xmm3
 31078  	packssdw	xmm6, xmm6
 31079  	packssdw	xmm6, xmm6
 31080  	pblendvb	xmm5, xmm4, xmm0
 31081  	movdqa	xmm0, xmm1
 31082  	pblendvb	xmm6, xmm4, xmm0
 31083  	movd	dword ptr [r8 + 2*rsi + 8], xmm5
 31084  	movd	dword ptr [r8 + 2*rsi + 12], xmm6
 31085  	add	rsi, 8                          # advance index by the 8 elements just handled
 31086  	add	rdi, 2                          # two groups consumed; sets ZF when the counter reaches zero
 31087  	jne	.LBB4_766
 31088  	jmp	.LBB4_1038                      # target is outside this view; presumably handles the leftover group/tail
 31089  .LBB4_767:                              # Vector path: signed 64-bit in -> 16-bit out, out[i] = 1 if in > 0, 0 if in == 0, -1 otherwise; 8 elements per loop pass
 31090  	mov	edx, r10d                       # r10d is set outside this view; presumably the element count
 31091  	and	edx, -4                         # rdx = that value rounded down to a multiple of 4
 31092  	lea	rsi, [rdx - 4]
 31093  	mov	r9, rsi
 31094  	shr	r9, 2
 31095  	add	r9, 1                           # r9 = rdx / 4 = number of 4-element groups
 31096  	test	rsi, rsi
 31097  	je	.LBB4_1159                      # exactly one group: bypass the two-groups-per-pass loop
 31098  # %bb.768:
 31099  	mov	rdi, r9
 31100  	and	rdi, -2
 31101  	neg	rdi                             # rdi = -(group count rounded down to even); stepped by +2 up to zero
 31102  	xor	esi, esi                        # rsi = element index
 31103  	pxor	xmm2, xmm2                      # xmm2 = all-zero comparand
 31104  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones, used to invert masks
 31105  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_17] # xmm4 = <1,1,u,u,u,u,u,u>
 31106  .LBB4_769:                              # =>This Inner Loop Header: Depth=1
 31107  	movdqu	xmm5, xmmword ptr [rcx + 8*rsi]
 31108  	movdqu	xmm6, xmmword ptr [rcx + 8*rsi + 16]
 31109  	movdqa	xmm0, xmm5
 31110  	pcmpgtq	xmm0, xmm2                      # signed compare: all ones where input > 0
 31111  	packssdw	xmm0, xmm0                      # two packs narrow each 64-bit mask ...
 31112  	packssdw	xmm0, xmm0                      # ... down to one 16-bit mask per element
 31113  	movdqa	xmm1, xmm6
 31114  	pcmpgtq	xmm1, xmm2
 31115  	packssdw	xmm1, xmm1
 31116  	packssdw	xmm1, xmm1
 31117  	pcmpeqq	xmm5, xmm2                      # all ones where input == 0
 31118  	pxor	xmm5, xmm3                      # invert: all ones where input != 0
 31119  	packssdw	xmm5, xmm5                      # narrowed to words: -1 for non-zero input, 0 for zero
 31120  	packssdw	xmm5, xmm5
 31121  	pcmpeqq	xmm6, xmm2
 31122  	pxor	xmm6, xmm3
 31123  	packssdw	xmm6, xmm6
 31124  	packssdw	xmm6, xmm6
 31125  	pblendvb	xmm5, xmm4, xmm0                # replace with 1 where the xmm0 mask (> 0) is set
 31126  	movdqa	xmm0, xmm1                      # pblendvb takes its mask from xmm0, so move the second mask there
 31127  	pblendvb	xmm6, xmm4, xmm0
 31128  	movd	dword ptr [r8 + 2*rsi], xmm5    # store two 16-bit results
 31129  	movd	dword ptr [r8 + 2*rsi + 4], xmm6
 31130  	movdqu	xmm5, xmmword ptr [rcx + 8*rsi + 32] # second unrolled half: elements rsi+4 .. rsi+7
 31131  	movdqu	xmm6, xmmword ptr [rcx + 8*rsi + 48]
 31132  	movdqa	xmm0, xmm5
 31133  	pcmpgtq	xmm0, xmm2
 31134  	packssdw	xmm0, xmm0
 31135  	packssdw	xmm0, xmm0
 31136  	movdqa	xmm1, xmm6
 31137  	pcmpgtq	xmm1, xmm2
 31138  	packssdw	xmm1, xmm1
 31139  	packssdw	xmm1, xmm1
 31140  	pcmpeqq	xmm5, xmm2
 31141  	pxor	xmm5, xmm3
 31142  	packssdw	xmm5, xmm5
 31143  	packssdw	xmm5, xmm5
 31144  	pcmpeqq	xmm6, xmm2
 31145  	pxor	xmm6, xmm3
 31146  	packssdw	xmm6, xmm6
 31147  	packssdw	xmm6, xmm6
 31148  	pblendvb	xmm5, xmm4, xmm0
 31149  	movdqa	xmm0, xmm1
 31150  	pblendvb	xmm6, xmm4, xmm0
 31151  	movd	dword ptr [r8 + 2*rsi + 8], xmm5
 31152  	movd	dword ptr [r8 + 2*rsi + 12], xmm6
 31153  	add	rsi, 8                          # advance index by the 8 elements just handled
 31154  	add	rdi, 2                          # two groups consumed; sets ZF when the counter reaches zero
 31155  	jne	.LBB4_769
 31156  	jmp	.LBB4_1160                      # target is outside this view; presumably handles the leftover group/tail
 31157  .LBB4_770:                              # Vector path: float in -> 16-bit out, out[i] = 0 where x == 0.0, else 1 if sign bit clear, else all-ones word (-1); 16 elements per loop pass
 31158  	mov	esi, eax                        # eax is set outside this view; presumably the element count
 31159  	and	esi, -8                         # rsi = that value rounded down to a multiple of 8
 31160  	lea	rdx, [rsi - 8]
 31161  	mov	r9, rdx
 31162  	shr	r9, 3
 31163  	add	r9, 1                           # r9 = rsi / 8 = number of 8-element groups
 31164  	test	rdx, rdx
 31165  	je	.LBB4_1165                      # exactly one group: bypass the two-groups-per-pass loop
 31166  # %bb.771:
 31167  	mov	rdx, r9
 31168  	and	rdx, -2
 31169  	neg	rdx                             # rdx = -(group count rounded down to even); stepped by +2 up to zero
 31170  	xor	edi, edi                        # rdi = element index
 31171  	xorps	xmm4, xmm4                      # xmm4 = 0.0 comparand, also the zero blend source
 31172  	pcmpeqd	xmm8, xmm8                      # xmm8 = all ones (-1 in every dword lane)
 31173  	movdqa	xmm6, xmmword ptr [rip + .LCPI4_11] # xmm6 = <1,1,1,1,u,u,u,u>
 31174  .LBB4_772:                              # =>This Inner Loop Header: Depth=1
 31175  	movups	xmm0, xmmword ptr [rcx + 4*rdi]
 31176  	movups	xmm1, xmmword ptr [rcx + 4*rdi + 16]
 31177  	movaps	xmm2, xmm0
 31178  	cmpeqps	xmm2, xmm4                      # mask: all ones where x == 0.0
 31179  	packssdw	xmm2, xmm2                      # narrow the dword masks to word masks
 31180  	movaps	xmm3, xmm1
 31181  	cmpeqps	xmm3, xmm4
 31182  	packssdw	xmm3, xmm3
 31183  	pcmpgtd	xmm0, xmm8                      # raw bits as signed int32 > -1, i.e. sign bit clear
 31184  	packssdw	xmm0, xmm0
 31185  	pcmpgtd	xmm1, xmm8
 31186  	packssdw	xmm1, xmm1
 31187  	pcmpeqd	xmm7, xmm7                      # start from all-ones words (-1)
 31188  	pblendvb	xmm7, xmm6, xmm0                # replace with 1 where the sign bit is clear
 31189  	pcmpeqd	xmm5, xmm5
 31190  	movdqa	xmm0, xmm1                      # pblendvb takes its mask from xmm0
 31191  	pblendvb	xmm5, xmm6, xmm0
 31192  	movdqa	xmm0, xmm2
 31193  	pblendvb	xmm7, xmm4, xmm0                # write 0 where x == 0.0
 31194  	movdqa	xmm0, xmm3
 31195  	pblendvb	xmm5, xmm4, xmm0
 31196  	punpcklqdq	xmm7, xmm5              # xmm7 = xmm7[0],xmm5[0]
 31197  	movdqu	xmmword ptr [r8 + 2*rdi], xmm7  # store eight 16-bit results
 31198  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 32] # second unrolled half: elements rdi+8 .. rdi+15 (xmm5/xmm7 roles swapped)
 31199  	movups	xmm1, xmmword ptr [rcx + 4*rdi + 48]
 31200  	movaps	xmm2, xmm0
 31201  	cmpeqps	xmm2, xmm4
 31202  	packssdw	xmm2, xmm2
 31203  	movaps	xmm3, xmm1
 31204  	cmpeqps	xmm3, xmm4
 31205  	packssdw	xmm3, xmm3
 31206  	pcmpgtd	xmm0, xmm8
 31207  	packssdw	xmm0, xmm0
 31208  	pcmpgtd	xmm1, xmm8
 31209  	pcmpeqd	xmm5, xmm5
 31210  	pblendvb	xmm5, xmm6, xmm0
 31211  	packssdw	xmm1, xmm1
 31212  	pcmpeqd	xmm7, xmm7
 31213  	movdqa	xmm0, xmm1
 31214  	pblendvb	xmm7, xmm6, xmm0
 31215  	movdqa	xmm0, xmm2
 31216  	pblendvb	xmm5, xmm4, xmm0
 31217  	movdqa	xmm0, xmm3
 31218  	pblendvb	xmm7, xmm4, xmm0
 31219  	punpcklqdq	xmm5, xmm7              # xmm5 = xmm5[0],xmm7[0]
 31220  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm5
 31221  	add	rdi, 16                         # advance index by the 16 elements just handled
 31222  	add	rdx, 2                          # two groups consumed; sets ZF when the counter reaches zero
 31223  	jne	.LBB4_772
 31224  	jmp	.LBB4_1166                      # target is outside this view; presumably handles the leftover group/tail
 31225  .LBB4_773:                              # Vector path: float in -> 16-bit out, out[i] = 0 where x == 0.0, else 1 if sign bit clear, else all-ones word (-1); 16 elements per loop pass
 31226  	mov	esi, eax                        # eax is set outside this view; presumably the element count
 31227  	and	esi, -8                         # rsi = that value rounded down to a multiple of 8
 31228  	lea	rdx, [rsi - 8]
 31229  	mov	r9, rdx
 31230  	shr	r9, 3
 31231  	add	r9, 1                           # r9 = rsi / 8 = number of 8-element groups
 31232  	test	rdx, rdx
 31233  	je	.LBB4_1171                      # exactly one group: bypass the two-groups-per-pass loop
 31234  # %bb.774:
 31235  	mov	rdx, r9
 31236  	and	rdx, -2
 31237  	neg	rdx                             # rdx = -(group count rounded down to even); stepped by +2 up to zero
 31238  	xor	edi, edi                        # rdi = element index
 31239  	xorps	xmm4, xmm4                      # xmm4 = 0.0 comparand, also the zero blend source
 31240  	pcmpeqd	xmm8, xmm8                      # xmm8 = all ones (-1 in every dword lane)
 31241  	movdqa	xmm6, xmmword ptr [rip + .LCPI4_11] # xmm6 = <1,1,1,1,u,u,u,u>
 31242  .LBB4_775:                              # =>This Inner Loop Header: Depth=1
 31243  	movups	xmm0, xmmword ptr [rcx + 4*rdi]
 31244  	movups	xmm1, xmmword ptr [rcx + 4*rdi + 16]
 31245  	movaps	xmm2, xmm0
 31246  	cmpeqps	xmm2, xmm4                      # mask: all ones where x == 0.0
 31247  	packssdw	xmm2, xmm2                      # narrow the dword masks to word masks
 31248  	movaps	xmm3, xmm1
 31249  	cmpeqps	xmm3, xmm4
 31250  	packssdw	xmm3, xmm3
 31251  	pcmpgtd	xmm0, xmm8                      # raw bits as signed int32 > -1, i.e. sign bit clear
 31252  	packssdw	xmm0, xmm0
 31253  	pcmpgtd	xmm1, xmm8
 31254  	packssdw	xmm1, xmm1
 31255  	pcmpeqd	xmm7, xmm7                      # start from all-ones words (-1)
 31256  	pblendvb	xmm7, xmm6, xmm0                # replace with 1 where the sign bit is clear
 31257  	pcmpeqd	xmm5, xmm5
 31258  	movdqa	xmm0, xmm1                      # pblendvb takes its mask from xmm0
 31259  	pblendvb	xmm5, xmm6, xmm0
 31260  	movdqa	xmm0, xmm2
 31261  	pblendvb	xmm7, xmm4, xmm0                # write 0 where x == 0.0
 31262  	movdqa	xmm0, xmm3
 31263  	pblendvb	xmm5, xmm4, xmm0
 31264  	punpcklqdq	xmm7, xmm5              # xmm7 = xmm7[0],xmm5[0]
 31265  	movdqu	xmmword ptr [r8 + 2*rdi], xmm7  # store eight 16-bit results
 31266  	movups	xmm0, xmmword ptr [rcx + 4*rdi + 32] # second unrolled half: elements rdi+8 .. rdi+15 (xmm5/xmm7 roles swapped)
 31267  	movups	xmm1, xmmword ptr [rcx + 4*rdi + 48]
 31268  	movaps	xmm2, xmm0
 31269  	cmpeqps	xmm2, xmm4
 31270  	packssdw	xmm2, xmm2
 31271  	movaps	xmm3, xmm1
 31272  	cmpeqps	xmm3, xmm4
 31273  	packssdw	xmm3, xmm3
 31274  	pcmpgtd	xmm0, xmm8
 31275  	packssdw	xmm0, xmm0
 31276  	pcmpgtd	xmm1, xmm8
 31277  	pcmpeqd	xmm5, xmm5
 31278  	pblendvb	xmm5, xmm6, xmm0
 31279  	packssdw	xmm1, xmm1
 31280  	pcmpeqd	xmm7, xmm7
 31281  	movdqa	xmm0, xmm1
 31282  	pblendvb	xmm7, xmm6, xmm0
 31283  	movdqa	xmm0, xmm2
 31284  	pblendvb	xmm5, xmm4, xmm0
 31285  	movdqa	xmm0, xmm3
 31286  	pblendvb	xmm7, xmm4, xmm0
 31287  	punpcklqdq	xmm5, xmm7              # xmm5 = xmm5[0],xmm7[0]
 31288  	movdqu	xmmword ptr [r8 + 2*rdi + 16], xmm5
 31289  	add	rdi, 16                         # advance index by the 16 elements just handled
 31290  	add	rdx, 2                          # two groups consumed; sets ZF when the counter reaches zero
 31291  	jne	.LBB4_775
 31292  	jmp	.LBB4_1172                      # target is outside this view; presumably handles the leftover group/tail
 31293  .LBB4_786:                              # Vector path: signed 32-bit in -> 16-bit out, out[i] = 1 if in > 0, 0 if in == 0, -1 otherwise; 16 elements per loop pass
 31294  	mov	edx, r10d                       # r10d is set outside this view; presumably the element count
 31295  	and	edx, -8                         # rdx = that value rounded down to a multiple of 8
 31296  	lea	rsi, [rdx - 8]
 31297  	mov	r9, rsi
 31298  	shr	r9, 3
 31299  	add	r9, 1                           # r9 = rdx / 8 = number of 8-element groups
 31300  	test	rsi, rsi
 31301  	je	.LBB4_1043                      # exactly one group: bypass the two-groups-per-pass loop
 31302  # %bb.787:
 31303  	mov	rdi, r9
 31304  	and	rdi, -2
 31305  	neg	rdi                             # rdi = -(group count rounded down to even); stepped by +2 up to zero
 31306  	xor	esi, esi                        # rsi = element index
 31307  	pxor	xmm2, xmm2                      # xmm2 = all-zero comparand
 31308  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones, used to invert masks
 31309  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_11] # xmm4 = <1,1,1,1,u,u,u,u>
 31310  .LBB4_788:                              # =>This Inner Loop Header: Depth=1
 31311  	movdqu	xmm5, xmmword ptr [rcx + 4*rsi]
 31312  	movdqu	xmm6, xmmword ptr [rcx + 4*rsi + 16]
 31313  	movdqa	xmm0, xmm5
 31314  	pcmpgtd	xmm0, xmm2                      # signed compare: all ones where input > 0
 31315  	packssdw	xmm0, xmm0                      # narrow the dword masks to word masks
 31316  	movdqa	xmm1, xmm6
 31317  	pcmpgtd	xmm1, xmm2
 31318  	packssdw	xmm1, xmm1
 31319  	pcmpeqd	xmm5, xmm2                      # all ones where input == 0
 31320  	pxor	xmm5, xmm3                      # invert: all ones where input != 0
 31321  	packssdw	xmm5, xmm5                      # narrowed to words: -1 for non-zero input, 0 for zero
 31322  	pcmpeqd	xmm6, xmm2
 31323  	pxor	xmm6, xmm3
 31324  	packssdw	xmm6, xmm6
 31325  	pblendvb	xmm5, xmm4, xmm0                # replace with 1 where the xmm0 mask (> 0) is set
 31326  	movdqa	xmm0, xmm1                      # pblendvb takes its mask from xmm0, so move the second mask there
 31327  	pblendvb	xmm6, xmm4, xmm0
 31328  	punpcklqdq	xmm5, xmm6              # xmm5 = xmm5[0],xmm6[0]
 31329  	movdqu	xmmword ptr [r8 + 2*rsi], xmm5  # store eight 16-bit results
 31330  	movdqu	xmm5, xmmword ptr [rcx + 4*rsi + 32] # second unrolled half: elements rsi+8 .. rsi+15
 31331  	movdqu	xmm6, xmmword ptr [rcx + 4*rsi + 48]
 31332  	movdqa	xmm0, xmm5
 31333  	pcmpgtd	xmm0, xmm2
 31334  	packssdw	xmm0, xmm0
 31335  	movdqa	xmm1, xmm6
 31336  	pcmpgtd	xmm1, xmm2
 31337  	packssdw	xmm1, xmm1
 31338  	pcmpeqd	xmm5, xmm2
 31339  	pxor	xmm5, xmm3
 31340  	packssdw	xmm5, xmm5
 31341  	pcmpeqd	xmm6, xmm2
 31342  	pxor	xmm6, xmm3
 31343  	packssdw	xmm6, xmm6
 31344  	pblendvb	xmm5, xmm4, xmm0
 31345  	movdqa	xmm0, xmm1
 31346  	pblendvb	xmm6, xmm4, xmm0
 31347  	punpcklqdq	xmm5, xmm6              # xmm5 = xmm5[0],xmm6[0]
 31348  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm5
 31349  	add	rsi, 16                         # advance index by the 16 elements just handled
 31350  	add	rdi, 2                          # two groups consumed; sets ZF when the counter reaches zero
 31351  	jne	.LBB4_788
 31352  	jmp	.LBB4_1044                      # target is outside this view; presumably handles the leftover group/tail
 31353  .LBB4_789:                              # Vector path: signed 32-bit in -> 16-bit out, out[i] = 1 if in > 0, 0 if in == 0, -1 otherwise; 16 elements per loop pass
 31354  	mov	edx, r10d                       # r10d is set outside this view; presumably the element count
 31355  	and	edx, -8                         # rdx = that value rounded down to a multiple of 8
 31356  	lea	rsi, [rdx - 8]
 31357  	mov	r9, rsi
 31358  	shr	r9, 3
 31359  	add	r9, 1                           # r9 = rdx / 8 = number of 8-element groups
 31360  	test	rsi, rsi
 31361  	je	.LBB4_1049                      # exactly one group: bypass the two-groups-per-pass loop
 31362  # %bb.790:
 31363  	mov	rdi, r9
 31364  	and	rdi, -2
 31365  	neg	rdi                             # rdi = -(group count rounded down to even); stepped by +2 up to zero
 31366  	xor	esi, esi                        # rsi = element index
 31367  	pxor	xmm2, xmm2                      # xmm2 = all-zero comparand
 31368  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones, used to invert masks
 31369  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_11] # xmm4 = <1,1,1,1,u,u,u,u>
 31370  .LBB4_791:                              # =>This Inner Loop Header: Depth=1
 31371  	movdqu	xmm5, xmmword ptr [rcx + 4*rsi]
 31372  	movdqu	xmm6, xmmword ptr [rcx + 4*rsi + 16]
 31373  	movdqa	xmm0, xmm5
 31374  	pcmpgtd	xmm0, xmm2                      # signed compare: all ones where input > 0
 31375  	packssdw	xmm0, xmm0                      # narrow the dword masks to word masks
 31376  	movdqa	xmm1, xmm6
 31377  	pcmpgtd	xmm1, xmm2
 31378  	packssdw	xmm1, xmm1
 31379  	pcmpeqd	xmm5, xmm2                      # all ones where input == 0
 31380  	pxor	xmm5, xmm3                      # invert: all ones where input != 0
 31381  	packssdw	xmm5, xmm5                      # narrowed to words: -1 for non-zero input, 0 for zero
 31382  	pcmpeqd	xmm6, xmm2
 31383  	pxor	xmm6, xmm3
 31384  	packssdw	xmm6, xmm6
 31385  	pblendvb	xmm5, xmm4, xmm0                # replace with 1 where the xmm0 mask (> 0) is set
 31386  	movdqa	xmm0, xmm1                      # pblendvb takes its mask from xmm0, so move the second mask there
 31387  	pblendvb	xmm6, xmm4, xmm0
 31388  	punpcklqdq	xmm5, xmm6              # xmm5 = xmm5[0],xmm6[0]
 31389  	movdqu	xmmword ptr [r8 + 2*rsi], xmm5  # store eight 16-bit results
 31390  	movdqu	xmm5, xmmword ptr [rcx + 4*rsi + 32] # second unrolled half: elements rsi+8 .. rsi+15
 31391  	movdqu	xmm6, xmmword ptr [rcx + 4*rsi + 48]
 31392  	movdqa	xmm0, xmm5
 31393  	pcmpgtd	xmm0, xmm2
 31394  	packssdw	xmm0, xmm0
 31395  	movdqa	xmm1, xmm6
 31396  	pcmpgtd	xmm1, xmm2
 31397  	packssdw	xmm1, xmm1
 31398  	pcmpeqd	xmm5, xmm2
 31399  	pxor	xmm5, xmm3
 31400  	packssdw	xmm5, xmm5
 31401  	pcmpeqd	xmm6, xmm2
 31402  	pxor	xmm6, xmm3
 31403  	packssdw	xmm6, xmm6
 31404  	pblendvb	xmm5, xmm4, xmm0
 31405  	movdqa	xmm0, xmm1
 31406  	pblendvb	xmm6, xmm4, xmm0
 31407  	punpcklqdq	xmm5, xmm6              # xmm5 = xmm5[0],xmm6[0]
 31408  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm5
 31409  	add	rsi, 16                         # advance index by the 16 elements just handled
 31410  	add	rdi, 2                          # two groups consumed; sets ZF when the counter reaches zero
 31411  	jne	.LBB4_791
 31412  	jmp	.LBB4_1050                      # target is outside this view; presumably handles the leftover group/tail
 31413  .LBB4_792:                              # Vector path: 32-bit lanes read from [rcx], 64-bit results stored to [r8]; result is 1 where x != 0, else 0
 31414  	mov	edx, eax                        # eax: presumably the element count, set before this view -- TODO confirm
 31415  	and	edx, -4                         # edx = count rounded down to a multiple of 4 (one vector step = 4 elements)
 31416  	lea	rsi, [rdx - 4]
 31417  	mov	r9, rsi
 31418  	shr	r9, 2
 31419  	add	r9, 1                           # r9 = (edx - 4) / 4 + 1 = number of 4-element steps
 31420  	test	rsi, rsi
 31421  	je	.LBB4_1177                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 31422  # %bb.793:
 31423  	mov	rdi, r9
 31424  	and	rdi, -2
 31425  	neg	rdi                             # rdi = -(steps rounded down to even); the loop adds 2 until it reaches 0
 31426  	xor	esi, esi                        # rsi = element index
 31427  	pxor	xmm0, xmm0                      # xmm0 = all zeros (comparison operand)
 31428  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones (used to invert masks)
 31429  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_15] # xmm2 = [1,1]
 31430  .LBB4_794:                              # =>This Inner Loop Header: Depth=1
 31431  	movq	xmm3, qword ptr [rcx + 4*rsi]   # xmm3 = mem[0],zero
 31432  	movq	xmm4, qword ptr [rcx + 4*rsi + 8] # xmm4 = mem[0],zero
 31433  	pcmpeqd	xmm3, xmm0                      # mask of lanes with x == 0
 31434  	pxor	xmm3, xmm1                      # invert: all ones where x != 0
 31435  	pmovzxdq	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero
 31436  	pand	xmm3, xmm2                      # keep bit 0 only: 1 for nonzero inputs, 0 otherwise
 31437  	pcmpeqd	xmm4, xmm0
 31438  	pxor	xmm4, xmm1
 31439  	pmovzxdq	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero
 31440  	pand	xmm4, xmm2
 31441  	movdqu	xmmword ptr [r8 + 8*rsi], xmm3  # store results for elements 0-1 of this step
 31442  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm4 # store results for elements 2-3
 31443  	movq	xmm3, qword ptr [rcx + 4*rsi + 16] # xmm3 = mem[0],zero
 31444  	movq	xmm4, qword ptr [rcx + 4*rsi + 24] # xmm4 = mem[0],zero
 31445  	pcmpeqd	xmm3, xmm0                      # second unrolled step: same sequence on the next 4 inputs
 31446  	pxor	xmm3, xmm1
 31447  	pmovzxdq	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero
 31448  	pand	xmm3, xmm2
 31449  	pcmpeqd	xmm4, xmm0
 31450  	pxor	xmm4, xmm1
 31451  	pmovzxdq	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero
 31452  	pand	xmm4, xmm2
 31453  	movdqu	xmmword ptr [r8 + 8*rsi + 32], xmm3
 31454  	movdqu	xmmword ptr [r8 + 8*rsi + 48], xmm4
 31455  	add	rsi, 8                          # 8 elements consumed per unrolled iteration
 31456  	add	rdi, 2                          # two steps done; ZF set when the negative counter reaches 0
 31457  	jne	.LBB4_794
 31458  	jmp	.LBB4_1178                      # continuation is outside this view
 31459  .LBB4_795:                              # Vector path: 32-bit lanes read from [rcx], 32-bit results stored to [r8]; result is the xmm1 constant where x != 0, else 0
 31460  	mov	edx, eax                        # eax: presumably the element count, set before this view -- TODO confirm
 31461  	and	edx, -8                         # edx = count rounded down to a multiple of 8 (one vector step = 8 elements)
 31462  	lea	rsi, [rdx - 8]
 31463  	mov	r9, rsi
 31464  	shr	r9, 3
 31465  	add	r9, 1                           # r9 = (edx - 8) / 8 + 1 = number of 8-element steps
 31466  	test	rsi, rsi
 31467  	je	.LBB4_1182                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 31468  # %bb.796:
 31469  	mov	rdi, r9
 31470  	and	rdi, -2
 31471  	neg	rdi                             # rdi = -(steps rounded down to even); the loop adds 2 until it reaches 0
 31472  	xor	esi, esi                        # rsi = element index
 31473  	pxor	xmm0, xmm0                      # xmm0 = all zeros (comparison operand)
 31474  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_19] # xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 31475  .LBB4_797:                              # =>This Inner Loop Header: Depth=1
 31476  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi]
 31477  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 16]
 31478  	pcmpeqd	xmm2, xmm0                      # mask of lanes whose 32-bit pattern equals 0
 31479  	pandn	xmm2, xmm1                      # (~mask) & 1.0f bits: 1.0f for nonzero lanes, 0 for zero lanes
 31480  	pcmpeqd	xmm3, xmm0
 31481  	pandn	xmm3, xmm1
 31482  	movdqu	xmmword ptr [r8 + 4*rsi], xmm2
 31483  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm3
 31484  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi + 32] # second unrolled step: same sequence on the next 8 inputs
 31485  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 48]
 31486  	pcmpeqd	xmm2, xmm0
 31487  	pandn	xmm2, xmm1
 31488  	pcmpeqd	xmm3, xmm0
 31489  	pandn	xmm3, xmm1
 31490  	movdqu	xmmword ptr [r8 + 4*rsi + 32], xmm2
 31491  	movdqu	xmmword ptr [r8 + 4*rsi + 48], xmm3
 31492  	add	rsi, 16                         # 16 elements consumed per unrolled iteration
 31493  	add	rdi, 2                          # two steps done; ZF set when the negative counter reaches 0
 31494  	jne	.LBB4_797
 31495  	jmp	.LBB4_1183                      # continuation is outside this view
 31496  .LBB4_798:                              # Vector path: doubles read from [rcx], 64-bit integer results stored to [r8]; result is +/-1 by sign bit, or 0 where x == 0.0
 31497  	mov	edx, eax                        # eax: presumably the element count, set before this view -- TODO confirm
 31498  	and	edx, -4                         # edx = count rounded down to a multiple of 4 (one vector step = 4 elements)
 31499  	lea	rsi, [rdx - 4]
 31500  	mov	r9, rsi
 31501  	shr	r9, 2
 31502  	add	r9, 1                           # r9 = (edx - 4) / 4 + 1 = number of 4-element steps
 31503  	test	rsi, rsi
 31504  	je	.LBB4_1190                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 31505  # %bb.799:
 31506  	mov	rdi, r9
 31507  	and	rdi, -2
 31508  	neg	rdi                             # rdi = -(steps rounded down to even); the loop adds 2 until it reaches 0
 31509  	xor	esi, esi                        # rsi = element index
 31510  	xorpd	xmm0, xmm0                      # xmm0 = 0.0 in both lanes (comparison operand)
 31511  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 31512  	movapd	xmm2, xmmword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0]
 31513  .LBB4_800:                              # =>This Inner Loop Header: Depth=1
 31514  	movupd	xmm3, xmmword ptr [rcx + 8*rsi]
 31515  	movupd	xmm4, xmmword ptr [rcx + 8*rsi + 16]
 31516  	movapd	xmm5, xmm3
 31517  	andpd	xmm5, xmm1                      # isolate the sign bit of each input (-0.0 mask)
 31518  	orpd	xmm5, xmm2                      # sign | 1.0 => +1.0 or -1.0 carrying the input's sign
 31519  	movapd	xmm6, xmm4
 31520  	andpd	xmm6, xmm1
 31521  	orpd	xmm6, xmm2
 31522  	cvttsd2si	rbx, xmm5               # low lane -> int64; rbx is scratch here (its save/restore is not visible in this view)
 31523  	movq	xmm7, rbx
 31524  	pshufd	xmm5, xmm5, 238                 # xmm5 = xmm5[2,3,2,3]
 31525  	cvttsd2si	rbx, xmm5               # high lane -> int64
 31526  	movq	xmm5, rbx
 31527  	punpcklqdq	xmm7, xmm5              # xmm7 = xmm7[0],xmm5[0]
 31528  	cvttsd2si	rbx, xmm6
 31529  	movq	xmm5, rbx
 31530  	pshufd	xmm6, xmm6, 238                 # xmm6 = xmm6[2,3,2,3]
 31531  	cvttsd2si	rbx, xmm6
 31532  	movq	xmm6, rbx
 31533  	punpcklqdq	xmm5, xmm6              # xmm5 = xmm5[0],xmm6[0]
 31534  	cmpneqpd	xmm3, xmm0              # mask of lanes with x != 0.0
 31535  	andpd	xmm3, xmm7                      # keep the +/-1 integer only where x != 0.0; other lanes become 0
 31536  	cmpneqpd	xmm4, xmm0
 31537  	andpd	xmm4, xmm5
 31538  	movupd	xmmword ptr [r8 + 8*rsi], xmm3
 31539  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm4
 31540  	movupd	xmm3, xmmword ptr [rcx + 8*rsi + 32] # second unrolled step: same sequence on the next 4 inputs
 31541  	movupd	xmm4, xmmword ptr [rcx + 8*rsi + 48]
 31542  	movapd	xmm5, xmm3
 31543  	andpd	xmm5, xmm1
 31544  	orpd	xmm5, xmm2
 31545  	movapd	xmm6, xmm4
 31546  	andpd	xmm6, xmm1
 31547  	orpd	xmm6, xmm2
 31548  	cvttsd2si	rbx, xmm5
 31549  	movq	xmm7, rbx
 31550  	pshufd	xmm5, xmm5, 238                 # xmm5 = xmm5[2,3,2,3]
 31551  	cvttsd2si	rbx, xmm5
 31552  	movq	xmm5, rbx
 31553  	punpcklqdq	xmm7, xmm5              # xmm7 = xmm7[0],xmm5[0]
 31554  	cvttsd2si	rbx, xmm6
 31555  	movq	xmm5, rbx
 31556  	pshufd	xmm6, xmm6, 238                 # xmm6 = xmm6[2,3,2,3]
 31557  	cvttsd2si	rbx, xmm6
 31558  	movq	xmm6, rbx
 31559  	punpcklqdq	xmm5, xmm6              # xmm5 = xmm5[0],xmm6[0]
 31560  	cmpneqpd	xmm3, xmm0
 31561  	andpd	xmm3, xmm7
 31562  	cmpneqpd	xmm4, xmm0
 31563  	andpd	xmm4, xmm5
 31564  	movupd	xmmword ptr [r8 + 8*rsi + 32], xmm3
 31565  	movupd	xmmword ptr [r8 + 8*rsi + 48], xmm4
 31566  	add	rsi, 8                          # 8 elements consumed per unrolled iteration
 31567  	add	rdi, 2                          # two steps done; ZF set when the negative counter reaches 0
 31568  	jne	.LBB4_800
 31569  	jmp	.LBB4_1191                      # continuation is outside this view
 31570  .LBB4_801:                              # Vector path: doubles read from [rcx], 32-bit float results stored to [r8]; lanes equal to 0.0 produce 0
 31571  	mov	edx, eax                        # eax: presumably the element count, set before this view -- TODO confirm
 31572  	and	edx, -4                         # edx = count rounded down to a multiple of 4 (one vector step = 4 elements)
 31573  	lea	rsi, [rdx - 4]
 31574  	mov	r9, rsi
 31575  	shr	r9, 2
 31576  	add	r9, 1                           # r9 = (edx - 4) / 4 + 1 = number of 4-element steps
 31577  	test	rsi, rsi
 31578  	je	.LBB4_1196                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 31579  # %bb.802:
 31580  	mov	rdi, r9
 31581  	and	rdi, -2
 31582  	neg	rdi                             # rdi = -(steps rounded down to even); the loop adds 2 until it reaches 0
 31583  	xor	esi, esi                        # rsi = element index
 31584  	xorpd	xmm8, xmm8                      # xmm8 = 0.0 in both lanes (comparison operand)
 31585  	cvtpd2ps	xmm1, xmmword ptr [rip + .LCPI4_1] # xmm1 = the two .LCPI4_1 doubles converted to floats (presumably 1.0 each -- verify against the constant pool)
 31586  	movaps	xmm9, xmmword ptr [rip + .LCPI4_3] # xmm9 = [NaN,NaN,NaN,NaN]
 31587  	movshdup	xmm3, xmm1                      # xmm3 = xmm1[1,1,3,3]
 31588  	andps	xmm3, xmm9                      # xmm3 = lane-1 constant restricted to the bits selected by xmm9
 31589  	andps	xmm1, xmm9                      # xmm1 = lane-0 constant restricted to the same bits
 31590  .LBB4_803:                              # =>This Inner Loop Header: Depth=1
 31591  	movupd	xmm4, xmmword ptr [rcx + 8*rsi]
 31592  	movupd	xmm6, xmmword ptr [rcx + 8*rsi + 16]
 31593  	xorps	xmm5, xmm5                      # clear the destination before the scalar convert
 31594  	cvtsd2ss	xmm5, xmm4                      # element 0 as float
 31595  	cmpeqpd	xmm4, xmm8                      # mask of lanes with x == 0.0 (elements 0-1)
 31596  	shufps	xmm4, xmm4, 232                 # xmm4 = xmm4[0,2,2,3]
 31597  	xorps	xmm7, xmm7
 31598  	cvtsd2ss	xmm7, xmm6                      # element 2 as float
 31599  	cmpeqpd	xmm6, xmm8                      # mask of lanes with x == 0.0 (elements 2-3)
 31600  	shufps	xmm6, xmm6, 232                 # xmm6 = xmm6[0,2,2,3]
 31601  	movsd	xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
 31602  	cvtsd2ss	xmm0, xmm0                      # element 1 as float
 31603  	movaps	xmm2, xmm9
 31604  	andnps	xmm2, xmm0                      # keep only the bits of element 1 that xmm9 does not select (presumably the sign bit)
 31605  	orps	xmm2, xmm3                      # merge with the masked constant
 31606  	movaps	xmm0, xmm9
 31607  	andnps	xmm0, xmm5                      # same for element 0
 31608  	orps	xmm0, xmm1
 31609  	unpcklps	xmm0, xmm2                      # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 31610  	andnps	xmm4, xmm0                      # clear the results whose input compared equal to 0.0
 31611  	movsd	xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
 31612  	cvtsd2ss	xmm0, xmm0                      # element 3 as float
 31613  	movaps	xmm2, xmm9
 31614  	andnps	xmm2, xmm0
 31615  	orps	xmm2, xmm3
 31616  	movaps	xmm0, xmm9
 31617  	andnps	xmm0, xmm7
 31618  	orps	xmm0, xmm1
 31619  	unpcklps	xmm0, xmm2                      # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 31620  	andnps	xmm6, xmm0
 31621  	movlhps	xmm4, xmm6                      # xmm4 = xmm4[0],xmm6[0]
 31622  	movups	xmmword ptr [r8 + 4*rsi], xmm4  # store 4 float results
 31623  	movupd	xmm4, xmmword ptr [rcx + 8*rsi + 32] # second unrolled step: same computation on the next 4 inputs (different register assignment)
 31624  	movupd	xmm0, xmmword ptr [rcx + 8*rsi + 48]
 31625  	xorps	xmm2, xmm2
 31626  	cvtsd2ss	xmm2, xmm4
 31627  	cmpeqpd	xmm4, xmm8
 31628  	shufps	xmm4, xmm4, 232                 # xmm4 = xmm4[0,2,2,3]
 31629  	xorps	xmm5, xmm5
 31630  	cvtsd2ss	xmm5, xmm0
 31631  	cmpeqpd	xmm0, xmm8
 31632  	movsd	xmm6, qword ptr [rcx + 8*rsi + 40] # xmm6 = mem[0],zero
 31633  	cvtsd2ss	xmm6, xmm6
 31634  	shufps	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
 31635  	movaps	xmm7, xmm9
 31636  	andnps	xmm7, xmm6
 31637  	orps	xmm7, xmm3
 31638  	movaps	xmm6, xmm9
 31639  	andnps	xmm6, xmm2
 31640  	orps	xmm6, xmm1
 31641  	unpcklps	xmm6, xmm7                      # xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 31642  	andnps	xmm4, xmm6
 31643  	movsd	xmm2, qword ptr [rcx + 8*rsi + 56] # xmm2 = mem[0],zero
 31644  	cvtsd2ss	xmm2, xmm2
 31645  	movaps	xmm6, xmm9
 31646  	andnps	xmm6, xmm2
 31647  	orps	xmm6, xmm3
 31648  	movaps	xmm2, xmm9
 31649  	andnps	xmm2, xmm5
 31650  	orps	xmm2, xmm1
 31651  	unpcklps	xmm2, xmm6                      # xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
 31652  	andnps	xmm0, xmm2
 31653  	movlhps	xmm4, xmm0                      # xmm4 = xmm4[0],xmm0[0]
 31654  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm4
 31655  	add	rsi, 8                          # 8 elements consumed per unrolled iteration
 31656  	add	rdi, 2                          # two steps done; ZF set when the negative counter reaches 0
 31657  	jne	.LBB4_803
 31658  	jmp	.LBB4_1197                      # continuation is outside this view
 31659  .LBB4_819:                              # Scalar path, 4 elements per iteration: 64-bit values read from [rcx], 32-bit results stored to [r8]
 31660  	and	edx, -4                         # edx (set before this view) rounded down to a multiple of 4 = loop bound
 31661  	xor	esi, esi                        # rsi = element index
 31662  	movss	xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero
 31663  	jmp	.LBB4_821                       # enter the loop at its header
 31664  .LBB4_820:                              #   in Loop: Header=BB4_821 Depth=1
 31665  	movss	dword ptr [r8 + 4*rsi + 12], xmm1 # store the result for element 3
 31666  	add	rsi, 4
 31667  	cmp	rdx, rsi
 31668  	je	.LBB4_387                       # all full groups done (target is outside this view)
 31669  .LBB4_821:                              # =>This Inner Loop Header: Depth=1
 31670  	cmp	qword ptr [rcx + 8*rsi], 0      # element 0 == 0 ?
 31671  	movapd	xmm1, xmm0                      # tentatively pick the nonzero result; SSE moves leave the cmp flags intact
 31672  	jne	.LBB4_822
 31673  # %bb.825:                              #   in Loop: Header=BB4_821 Depth=1
 31674  	xorpd	xmm1, xmm1                      # element 0 is zero: result 0
 31675  	movss	dword ptr [r8 + 4*rsi], xmm1
 31676  	cmp	qword ptr [rcx + 8*rsi + 8], 0  # element 1 == 0 ?
 31677  	movapd	xmm1, xmm0
 31678  	je	.LBB4_826
 31679  .LBB4_823:                              #   in Loop: Header=BB4_821 Depth=1
 31680  	movss	dword ptr [r8 + 4*rsi + 4], xmm1 # element 1 nonzero: store the xmm0 constant
 31681  	cmp	qword ptr [rcx + 8*rsi + 16], 0 # element 2 == 0 ?
 31682  	movapd	xmm1, xmm0
 31683  	jne	.LBB4_824
 31684  .LBB4_827:                              #   in Loop: Header=BB4_821 Depth=1
 31685  	xorpd	xmm1, xmm1                      # element 2 is zero: result 0
 31686  	movss	dword ptr [r8 + 4*rsi + 8], xmm1
 31687  	cmp	qword ptr [rcx + 8*rsi + 24], 0 # element 3 == 0 ?
 31688  	movapd	xmm1, xmm0
 31689  	jne	.LBB4_820
 31690  	jmp	.LBB4_828
 31691  .LBB4_822:                              #   in Loop: Header=BB4_821 Depth=1
 31692  	movss	dword ptr [r8 + 4*rsi], xmm1    # element 0 nonzero: store the xmm0 constant
 31693  	cmp	qword ptr [rcx + 8*rsi + 8], 0  # element 1 == 0 ?
 31694  	movapd	xmm1, xmm0
 31695  	jne	.LBB4_823
 31696  .LBB4_826:                              #   in Loop: Header=BB4_821 Depth=1
 31697  	xorpd	xmm1, xmm1                      # element 1 is zero: result 0
 31698  	movss	dword ptr [r8 + 4*rsi + 4], xmm1
 31699  	cmp	qword ptr [rcx + 8*rsi + 16], 0 # element 2 == 0 ?
 31700  	movapd	xmm1, xmm0
 31701  	je	.LBB4_827
 31702  .LBB4_824:                              #   in Loop: Header=BB4_821 Depth=1
 31703  	movss	dword ptr [r8 + 4*rsi + 8], xmm1 # element 2 nonzero: store the xmm0 constant
 31704  	cmp	qword ptr [rcx + 8*rsi + 24], 0 # element 3 == 0 ?
 31705  	movapd	xmm1, xmm0
 31706  	jne	.LBB4_820
 31707  .LBB4_828:                              #   in Loop: Header=BB4_821 Depth=1
 31708  	xorpd	xmm1, xmm1                      # element 3 is zero: result 0, stored at the loop latch
 31709  	jmp	.LBB4_820
 31710  .LBB4_829:                              # Vector path: 16-bit lanes read from [rcx], 64-bit results stored to [r8]; result is 1 where x != 0, else 0
 31711  	mov	edx, eax                        # eax: presumably the element count, set before this view -- TODO confirm
 31712  	and	edx, -4                         # edx = count rounded down to a multiple of 4 (one vector step = 4 elements)
 31713  	lea	rsi, [rdx - 4]
 31714  	mov	r9, rsi
 31715  	shr	r9, 2
 31716  	add	r9, 1                           # r9 = (edx - 4) / 4 + 1 = number of 4-element steps
 31717  	test	rsi, rsi
 31718  	je	.LBB4_1055                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 31719  # %bb.830:
 31720  	mov	rdi, r9
 31721  	and	rdi, -2
 31722  	neg	rdi                             # rdi = -(steps rounded down to even); the loop adds 2 until it reaches 0
 31723  	xor	esi, esi                        # rsi = element index
 31724  	pxor	xmm0, xmm0                      # xmm0 = all zeros (comparison operand)
 31725  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones (used to invert masks)
 31726  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_15] # xmm2 = [1,1]
 31727  .LBB4_831:                              # =>This Inner Loop Header: Depth=1
 31728  	movd	xmm3, dword ptr [rcx + 2*rsi]   # xmm3 = mem[0],zero,zero,zero
 31729  	movd	xmm4, dword ptr [rcx + 2*rsi + 4] # xmm4 = mem[0],zero,zero,zero
 31730  	pcmpeqw	xmm3, xmm0                      # mask of word lanes with x == 0
 31731  	pxor	xmm3, xmm1                      # invert: all ones where x != 0
 31732  	pmovzxwq	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
 31733  	pand	xmm3, xmm2                      # keep bit 0 only: 1 for nonzero inputs, 0 otherwise
 31734  	pcmpeqw	xmm4, xmm0
 31735  	pxor	xmm4, xmm1
 31736  	pmovzxwq	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 31737  	pand	xmm4, xmm2
 31738  	movdqu	xmmword ptr [r8 + 8*rsi], xmm3  # store results for elements 0-1 of this step
 31739  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm4 # store results for elements 2-3
 31740  	movd	xmm3, dword ptr [rcx + 2*rsi + 8] # xmm3 = mem[0],zero,zero,zero
 31741  	movd	xmm4, dword ptr [rcx + 2*rsi + 12] # xmm4 = mem[0],zero,zero,zero
 31742  	pcmpeqw	xmm3, xmm0                      # second unrolled step: same sequence on the next 4 inputs
 31743  	pxor	xmm3, xmm1
 31744  	pmovzxwq	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
 31745  	pand	xmm3, xmm2
 31746  	pcmpeqw	xmm4, xmm0
 31747  	pxor	xmm4, xmm1
 31748  	pmovzxwq	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 31749  	pand	xmm4, xmm2
 31750  	movdqu	xmmword ptr [r8 + 8*rsi + 32], xmm3
 31751  	movdqu	xmmword ptr [r8 + 8*rsi + 48], xmm4
 31752  	add	rsi, 8                          # 8 elements consumed per unrolled iteration
 31753  	add	rdi, 2                          # two steps done; ZF set when the negative counter reaches 0
 31754  	jne	.LBB4_831
 31755  	jmp	.LBB4_1056                      # continuation is outside this view
 31756  .LBB4_832:                              # Vector path: 16-bit lanes read from [rcx], float results stored to [r8]; result is 1.0f where x != 0, else 0.0f
 31757  	mov	edx, eax                        # eax: presumably the element count, set before this view -- TODO confirm
 31758  	and	edx, -8                         # edx = count rounded down to a multiple of 8 (one vector step = 8 elements)
 31759  	lea	rsi, [rdx - 8]
 31760  	mov	r9, rsi
 31761  	shr	r9, 3
 31762  	add	r9, 1                           # r9 = (edx - 8) / 8 + 1 = number of 8-element steps
 31763  	test	rsi, rsi
 31764  	je	.LBB4_1204                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 31765  # %bb.833:
 31766  	mov	rdi, r9
 31767  	and	rdi, -2
 31768  	neg	rdi                             # rdi = -(steps rounded down to even); the loop adds 2 until it reaches 0
 31769  	xor	esi, esi                        # rsi = element index
 31770  	pxor	xmm0, xmm0                      # xmm0 = all zeros (comparison operand)
 31771  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones (used to invert masks)
 31772  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1]
 31773  .LBB4_834:                              # =>This Inner Loop Header: Depth=1
 31774  	movq	xmm3, qword ptr [rcx + 2*rsi]   # xmm3 = mem[0],zero
 31775  	movq	xmm4, qword ptr [rcx + 2*rsi + 8] # xmm4 = mem[0],zero
 31776  	pcmpeqw	xmm3, xmm0                      # mask of word lanes with x == 0
 31777  	pxor	xmm3, xmm1                      # invert: all ones where x != 0
 31778  	pmovzxwd	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 31779  	pand	xmm3, xmm2                      # integer 1 for nonzero inputs, 0 otherwise
 31780  	cvtdq2ps	xmm3, xmm3                      # int32 0/1 -> float 0.0/1.0
 31781  	pcmpeqw	xmm4, xmm0
 31782  	pxor	xmm4, xmm1
 31783  	pmovzxwd	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 31784  	pand	xmm4, xmm2
 31785  	cvtdq2ps	xmm4, xmm4
 31786  	movups	xmmword ptr [r8 + 4*rsi], xmm3
 31787  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm4
 31788  	movq	xmm3, qword ptr [rcx + 2*rsi + 16] # xmm3 = mem[0],zero
 31789  	movq	xmm4, qword ptr [rcx + 2*rsi + 24] # xmm4 = mem[0],zero
 31790  	pcmpeqw	xmm3, xmm0                      # second unrolled step: same sequence on the next 8 inputs
 31791  	pxor	xmm3, xmm1
 31792  	pmovzxwd	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 31793  	pand	xmm3, xmm2
 31794  	cvtdq2ps	xmm3, xmm3
 31795  	pcmpeqw	xmm4, xmm0
 31796  	pxor	xmm4, xmm1
 31797  	pmovzxwd	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 31798  	pand	xmm4, xmm2
 31799  	cvtdq2ps	xmm4, xmm4
 31800  	movups	xmmword ptr [r8 + 4*rsi + 32], xmm3
 31801  	movups	xmmword ptr [r8 + 4*rsi + 48], xmm4
 31802  	add	rsi, 16                         # 16 elements consumed per unrolled iteration
 31803  	add	rdi, 2                          # two steps done; ZF set when the negative counter reaches 0
 31804  	jne	.LBB4_834
 31805  	jmp	.LBB4_1205                      # continuation is outside this view
 31806  .LBB4_835:                              # Vector path: signed 16-bit lanes read from [rcx], 64-bit results (1 / 0 / -1) stored to [r8]
 31807  	mov	edx, r10d                       # r10d: presumably the element count, set before this view -- TODO confirm
 31808  	and	edx, -4                         # edx = count rounded down to a multiple of 4 (one vector step = 4 elements)
 31809  	lea	rsi, [rdx - 4]
 31810  	mov	r9, rsi
 31811  	shr	r9, 2
 31812  	add	r9, 1                           # r9 = (edx - 4) / 4 + 1 = number of 4-element steps
 31813  	test	rsi, rsi
 31814  	je	.LBB4_1212                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 31815  # %bb.836:
 31816  	mov	rdi, r9
 31817  	and	rdi, -2
 31818  	neg	rdi                             # rdi = -(steps rounded down to even); the loop adds 2 until it reaches 0
 31819  	xor	esi, esi                        # rsi = element index
 31820  	pxor	xmm2, xmm2                      # xmm2 = all zeros (comparison operand)
 31821  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 31822  	movapd	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 31823  .LBB4_837:                              # =>This Inner Loop Header: Depth=1
 31824  	movd	xmm5, dword ptr [rcx + 2*rsi]   # xmm5 = mem[0],zero,zero,zero
 31825  	movd	xmm6, dword ptr [rcx + 2*rsi + 4] # xmm6 = mem[0],zero,zero,zero
 31826  	movdqa	xmm0, xmm5
 31827  	pcmpgtw	xmm0, xmm2                      # word mask of lanes with x > 0 (signed compare)
 31828  	pmovsxwq	xmm0, xmm0                      # widen the mask to 64-bit lanes
 31829  	movdqa	xmm1, xmm6
 31830  	pcmpgtw	xmm1, xmm2
 31831  	pmovsxwq	xmm1, xmm1
 31832  	pcmpeqw	xmm5, xmm2                      # mask of lanes with x == 0
 31833  	pxor	xmm5, xmm3                      # invert: all ones where x != 0
 31834  	pmovsxwq	xmm5, xmm5                      # sign-extend: -1 where x != 0, 0 where x == 0
 31835  	pcmpeqw	xmm6, xmm2
 31836  	pxor	xmm6, xmm3
 31837  	pmovsxwq	xmm6, xmm6
 31838  	blendvpd	xmm5, xmm4, xmm0                # where x > 0 take 1 from xmm4; other lanes keep -1 / 0
 31839  	movdqa	xmm0, xmm1                      # blendvpd selects on xmm0, so bring the second mask in
 31840  	blendvpd	xmm6, xmm4, xmm0
 31841  	movupd	xmmword ptr [r8 + 8*rsi], xmm5
 31842  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm6
 31843  	movd	xmm5, dword ptr [rcx + 2*rsi + 8] # xmm5 = mem[0],zero,zero,zero
 31844  	movd	xmm6, dword ptr [rcx + 2*rsi + 12] # xmm6 = mem[0],zero,zero,zero
 31845  	movdqa	xmm0, xmm5                      # second unrolled step: same sequence on the next 4 inputs
 31846  	pcmpgtw	xmm0, xmm2
 31847  	pmovsxwq	xmm0, xmm0
 31848  	movdqa	xmm1, xmm6
 31849  	pcmpgtw	xmm1, xmm2
 31850  	pmovsxwq	xmm1, xmm1
 31851  	pcmpeqw	xmm5, xmm2
 31852  	pxor	xmm5, xmm3
 31853  	pmovsxwq	xmm5, xmm5
 31854  	pcmpeqw	xmm6, xmm2
 31855  	pxor	xmm6, xmm3
 31856  	pmovsxwq	xmm6, xmm6
 31857  	blendvpd	xmm5, xmm4, xmm0
 31858  	movdqa	xmm0, xmm1
 31859  	blendvpd	xmm6, xmm4, xmm0
 31860  	movupd	xmmword ptr [r8 + 8*rsi + 32], xmm5
 31861  	movupd	xmmword ptr [r8 + 8*rsi + 48], xmm6
 31862  	add	rsi, 8                          # 8 elements consumed per unrolled iteration
 31863  	add	rdi, 2                          # two steps done; ZF set when the negative counter reaches 0
 31864  	jne	.LBB4_837
 31865  	jmp	.LBB4_1213                      # continuation is outside this view
 31866  .LBB4_838:                              # Vector path: signed 16-bit lanes read from [rcx], float results (1.0 / 0.0 / -1.0) stored to [r8]
 31867  	mov	edx, eax                        # eax: presumably the element count, set before this view -- TODO confirm
 31868  	and	edx, -8                         # edx = count rounded down to a multiple of 8 (one vector step = 8 elements)
 31869  	lea	rsi, [rdx - 8]
 31870  	mov	r9, rsi
 31871  	shr	r9, 3
 31872  	add	r9, 1                           # r9 = (edx - 8) / 8 + 1 = number of 8-element steps
 31873  	test	rsi, rsi
 31874  	je	.LBB4_1218                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 31875  # %bb.839:
 31876  	mov	rdi, r9
 31877  	and	rdi, -2
 31878  	neg	rdi                             # rdi = -(steps rounded down to even); the loop adds 2 until it reaches 0
 31879  	xor	esi, esi                        # rsi = element index
 31880  	pxor	xmm2, xmm2                      # xmm2 = all zeros (comparison operand)
 31881  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 31882  	movaps	xmm4, xmmword ptr [rip + .LCPI4_19] # xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 31883  .LBB4_840:                              # =>This Inner Loop Header: Depth=1
 31884  	movq	xmm5, qword ptr [rcx + 2*rsi]   # xmm5 = mem[0],zero
 31885  	movq	xmm6, qword ptr [rcx + 2*rsi + 8] # xmm6 = mem[0],zero
 31886  	movdqa	xmm0, xmm5
 31887  	pcmpgtw	xmm0, xmm2                      # word mask of lanes with x > 0 (signed compare)
 31888  	pmovsxwd	xmm0, xmm0                      # widen the mask to 32-bit lanes
 31889  	movdqa	xmm1, xmm6
 31890  	pcmpgtw	xmm1, xmm2
 31891  	pmovsxwd	xmm1, xmm1
 31892  	pcmpeqw	xmm5, xmm2                      # mask of lanes with x == 0
 31893  	pxor	xmm5, xmm3                      # invert: all ones where x != 0
 31894  	pmovsxwd	xmm5, xmm5                      # sign-extend: -1 where x != 0, 0 where x == 0
 31895  	cvtdq2ps	xmm5, xmm5                      # int32 -1/0 -> float -1.0/0.0
 31896  	pcmpeqw	xmm6, xmm2
 31897  	pxor	xmm6, xmm3
 31898  	pmovsxwd	xmm6, xmm6
 31899  	cvtdq2ps	xmm6, xmm6
 31900  	blendvps	xmm5, xmm4, xmm0                # where x > 0 take 1.0 from xmm4; other lanes keep -1.0 / 0.0
 31901  	movdqa	xmm0, xmm1                      # blendvps selects on xmm0, so bring the second mask in
 31902  	blendvps	xmm6, xmm4, xmm0
 31903  	movups	xmmword ptr [r8 + 4*rsi], xmm5
 31904  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm6
 31905  	movq	xmm5, qword ptr [rcx + 2*rsi + 16] # xmm5 = mem[0],zero
 31906  	movq	xmm6, qword ptr [rcx + 2*rsi + 24] # xmm6 = mem[0],zero
 31907  	movdqa	xmm0, xmm5                      # second unrolled step: same sequence on the next 8 inputs
 31908  	pcmpgtw	xmm0, xmm2
 31909  	pmovsxwd	xmm0, xmm0
 31910  	movdqa	xmm1, xmm6
 31911  	pcmpgtw	xmm1, xmm2
 31912  	pmovsxwd	xmm1, xmm1
 31913  	pcmpeqw	xmm5, xmm2
 31914  	pxor	xmm5, xmm3
 31915  	pmovsxwd	xmm5, xmm5
 31916  	cvtdq2ps	xmm5, xmm5
 31917  	pcmpeqw	xmm6, xmm2
 31918  	pxor	xmm6, xmm3
 31919  	pmovsxwd	xmm6, xmm6
 31920  	cvtdq2ps	xmm6, xmm6
 31921  	blendvps	xmm5, xmm4, xmm0
 31922  	movdqa	xmm0, xmm1
 31923  	blendvps	xmm6, xmm4, xmm0
 31924  	movups	xmmword ptr [r8 + 4*rsi + 32], xmm5
 31925  	movups	xmmword ptr [r8 + 4*rsi + 48], xmm6
 31926  	add	rsi, 16                         # 16 elements consumed per unrolled iteration
 31927  	add	rdi, 2                          # two steps done; ZF set when the negative counter reaches 0
 31928  	jne	.LBB4_840
 31929  	jmp	.LBB4_1219                      # continuation is outside this view
 31930  .LBB4_846:                              # Scalar path, 2 elements per iteration: signed 64-bit values read from [rcx], 32-bit results stored to [r8]
 31931  	mov	esi, edx                        # edx: presumably the element count, set before this view -- TODO confirm
 31932  	and	esi, -2                         # esi = count rounded down to even = loop bound
 31933  	xor	eax, eax                        # rax = element index
 31934  	movss	xmm0, dword ptr [rip + .LCPI4_14] # xmm0 = mem[0],zero,zero,zero
 31935  	movss	xmm1, dword ptr [rip + .LCPI4_5] # xmm1 = mem[0],zero,zero,zero
 31936  	jmp	.LBB4_848                       # enter the loop at its header
 31937  .LBB4_847:                              #   in Loop: Header=BB4_848 Depth=1
 31938  	movss	dword ptr [r8 + 4*rax + 4], xmm3 # store the result for element 1
 31939  	add	rax, 2
 31940  	cmp	rsi, rax
 31941  	je	.LBB4_410                       # all pairs done (target is outside this view)
 31942  .LBB4_848:                              # =>This Inner Loop Header: Depth=1
 31943  	cmp	qword ptr [rcx + 8*rax], 0      # compare element 0 with 0; the SSE moves below leave these flags intact
 31944  	movapd	xmm2, xmm0                      # xmm2 = candidate result for x <= 0: xmm0 constant if x != 0 ...
 31945  	jne	.LBB4_849
 31946  # %bb.852:                              #   in Loop: Header=BB4_848 Depth=1
 31947  	xorpd	xmm2, xmm2                      # ... or 0 if x == 0
 31948  	movapd	xmm3, xmm1                      # xmm3 = candidate result for x > 0 (xmm1 constant)
 31949  	jle	.LBB4_853                       # always taken here (ZF is set on this path)
 31950  .LBB4_850:                              #   in Loop: Header=BB4_848 Depth=1
 31951  	movss	dword ptr [r8 + 4*rax], xmm3    # element 0 > 0: store the xmm1 constant
 31952  	cmp	qword ptr [rcx + 8*rax + 8], 0  # compare element 1 with 0
 31953  	movapd	xmm2, xmm0
 31954  	jne	.LBB4_851
 31955  .LBB4_854:                              #   in Loop: Header=BB4_848 Depth=1
 31956  	xorpd	xmm2, xmm2                      # element 1 == 0: candidate result 0
 31957  	movapd	xmm3, xmm1
 31958  	jg	.LBB4_847                       # never taken here (ZF is set on this path)
 31959  	jmp	.LBB4_855
 31960  .LBB4_849:                              #   in Loop: Header=BB4_848 Depth=1
 31961  	movapd	xmm3, xmm1                      # element 0 != 0: assume positive
 31962  	jg	.LBB4_850                       # signed x > 0 keeps the xmm1 constant
 31963  .LBB4_853:                              #   in Loop: Header=BB4_848 Depth=1
 31964  	movapd	xmm3, xmm2                      # x <= 0: use xmm2 (xmm0 constant for negative, 0 for zero)
 31965  	movss	dword ptr [r8 + 4*rax], xmm3
 31966  	cmp	qword ptr [rcx + 8*rax + 8], 0  # compare element 1 with 0
 31967  	movapd	xmm2, xmm0
 31968  	je	.LBB4_854
 31969  .LBB4_851:                              #   in Loop: Header=BB4_848 Depth=1
 31970  	movapd	xmm3, xmm1                      # element 1 != 0: assume positive
 31971  	jg	.LBB4_847
 31972  .LBB4_855:                              #   in Loop: Header=BB4_848 Depth=1
 31973  	movapd	xmm3, xmm2                      # element 1 <= 0: use xmm2; stored at the loop latch
 31974  	jmp	.LBB4_847
 31975  .LBB4_856:                              # Scalar path, 2 elements per iteration: floats read from [rcx], 64-bit integer results (1 / 0 / -1) stored to [r8]
 31976  	mov	esi, edx                        # edx: presumably the element count, set before this view -- TODO confirm
 31977  	and	esi, -2                         # esi = count rounded down to even = loop bound
 31978  	xor	eax, eax                        # rax = element index
 31979  	xorps	xmm0, xmm0                      # xmm0 = 0.0 (comparison operand)
 31980  	jmp	.LBB4_859                       # enter the loop at its header
 31981  .LBB4_857:                              #   in Loop: Header=BB4_859 Depth=1
 31982  	movmskps	edi, xmm1               # element 1 != 0: bit 0 of edi = sign bit of the value
 31983  	and	edi, 1
 31984  	neg	edi                             # 0 -> 0, 1 -> -1
 31985  	or	edi, 1                          # 0 -> 1, -1 -> -1: integer +/-1 matching the sign
 31986  	xorps	xmm1, xmm1
 31987  	cvtsi2ss	xmm1, edi               # round-trip through float ...
 31988  	cvttss2si	rdi, xmm1               # ... back to a 64-bit integer
 31989  	mov	qword ptr [r8 + 8*rax + 8], rdi # store the result for element 1
 31990  	add	rax, 2
 31991  	cmp	rsi, rax
 31992  	je	.LBB4_416                       # all pairs done: go handle a possible odd element
 31993  .LBB4_859:                              # =>This Inner Loop Header: Depth=1
 31994  	movss	xmm1, dword ptr [rcx + 4*rax]   # xmm1 = mem[0],zero,zero,zero
 31995  	ucomiss	xmm0, xmm1                      # compare 0.0 with element 0; an unordered (NaN) compare also sets ZF
 31996  	jne	.LBB4_861
 31997  # %bb.860:                              #   in Loop: Header=BB4_859 Depth=1
 31998  	xor	edi, edi                        # compared equal (or unordered): result 0
 31999  	jmp	.LBB4_862
 32000  .LBB4_861:                              #   in Loop: Header=BB4_859 Depth=1
 32001  	movmskps	edi, xmm1               # element 0 != 0: same sign-bit -> +/-1 sequence as above
 32002  	and	edi, 1
 32003  	neg	edi
 32004  	or	edi, 1
 32005  	xorps	xmm1, xmm1
 32006  	cvtsi2ss	xmm1, edi
 32007  	cvttss2si	rdi, xmm1
 32008  .LBB4_862:                              #   in Loop: Header=BB4_859 Depth=1
 32009  	mov	qword ptr [r8 + 8*rax], rdi     # store the result for element 0
 32010  	movss	xmm1, dword ptr [rcx + 4*rax + 4] # xmm1 = mem[0],zero,zero,zero
 32011  	ucomiss	xmm0, xmm1                      # compare 0.0 with element 1
 32012  	jne	.LBB4_857
 32013  # %bb.863:                              #   in Loop: Header=BB4_859 Depth=1
 32014  	xor	edi, edi                        # element 1 compared equal (or unordered): result 0
 32015  	mov	qword ptr [r8 + 8*rax + 8], rdi
 32016  	add	rax, 2
 32017  	cmp	rsi, rax
 32018  	jne	.LBB4_859
 32019  .LBB4_416:                              # Tail: handle the last element when the count in edx is odd
 32020  	test	dl, 1
 32021  	je	.LBB4_1655                      # even count: nothing left (target is outside this view)
 32022  # %bb.417:
 32023  	movss	xmm0, dword ptr [rcx + 4*rax]   # xmm0 = mem[0],zero,zero,zero
 32024  	xorps	xmm1, xmm1
 32025  	ucomiss	xmm1, xmm0                      # compare 0.0 with the last element
 32026  	jne	.LBB4_1104                      # nonzero case is handled outside this view
 32027  # %bb.418:
 32028  	xor	ecx, ecx                        # compared equal (or unordered): result 0 in ecx for the code at .LBB4_1105
 32029  	jmp	.LBB4_1105
 32030  .LBB4_884:                              # Vector path: signed 32-bit lanes read from [rcx], 64-bit results (1 / 0 / -1) stored to [r8]
 32031  	mov	edx, r10d                       # r10d: presumably the element count, set before this view -- TODO confirm
 32032  	and	edx, -4                         # edx = count rounded down to a multiple of 4 (one vector step = 4 elements)
 32033  	lea	rsi, [rdx - 4]
 32034  	mov	r9, rsi
 32035  	shr	r9, 2
 32036  	add	r9, 1                           # r9 = (edx - 4) / 4 + 1 = number of 4-element steps
 32037  	test	rsi, rsi
 32038  	je	.LBB4_1060                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 32039  # %bb.885:
 32040  	mov	rdi, r9
 32041  	and	rdi, -2
 32042  	neg	rdi                             # rdi = -(steps rounded down to even); the loop adds 2 until it reaches 0
 32043  	xor	esi, esi                        # rsi = element index
 32044  	pxor	xmm2, xmm2                      # xmm2 = all zeros (comparison operand)
 32045  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 32046  	movapd	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 32047  .LBB4_886:                              # =>This Inner Loop Header: Depth=1
 32048  	movq	xmm5, qword ptr [rcx + 4*rsi]   # xmm5 = mem[0],zero
 32049  	movq	xmm6, qword ptr [rcx + 4*rsi + 8] # xmm6 = mem[0],zero
 32050  	movdqa	xmm0, xmm5
 32051  	pcmpgtd	xmm0, xmm2                      # dword mask of lanes with x > 0 (signed compare)
 32052  	pmovsxdq	xmm0, xmm0                      # widen the mask to 64-bit lanes
 32053  	movdqa	xmm1, xmm6
 32054  	pcmpgtd	xmm1, xmm2
 32055  	pmovsxdq	xmm1, xmm1
 32056  	pcmpeqd	xmm5, xmm2                      # mask of lanes with x == 0
 32057  	pxor	xmm5, xmm3                      # invert: all ones where x != 0
 32058  	pmovsxdq	xmm5, xmm5                      # sign-extend: -1 where x != 0, 0 where x == 0
 32059  	pcmpeqd	xmm6, xmm2
 32060  	pxor	xmm6, xmm3
 32061  	pmovsxdq	xmm6, xmm6
 32062  	blendvpd	xmm5, xmm4, xmm0                # where x > 0 take 1 from xmm4; other lanes keep -1 / 0
 32063  	movdqa	xmm0, xmm1                      # blendvpd selects on xmm0, so bring the second mask in
 32064  	blendvpd	xmm6, xmm4, xmm0
 32065  	movupd	xmmword ptr [r8 + 8*rsi], xmm5
 32066  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm6
 32067  	movq	xmm5, qword ptr [rcx + 4*rsi + 16] # xmm5 = mem[0],zero
 32068  	movq	xmm6, qword ptr [rcx + 4*rsi + 24] # xmm6 = mem[0],zero
 32069  	movdqa	xmm0, xmm5                      # second unrolled step: same sequence on the next 4 inputs
 32070  	pcmpgtd	xmm0, xmm2
 32071  	pmovsxdq	xmm0, xmm0
 32072  	movdqa	xmm1, xmm6
 32073  	pcmpgtd	xmm1, xmm2
 32074  	pmovsxdq	xmm1, xmm1
 32075  	pcmpeqd	xmm5, xmm2
 32076  	pxor	xmm5, xmm3
 32077  	pmovsxdq	xmm5, xmm5
 32078  	pcmpeqd	xmm6, xmm2
 32079  	pxor	xmm6, xmm3
 32080  	pmovsxdq	xmm6, xmm6
 32081  	blendvpd	xmm5, xmm4, xmm0
 32082  	movdqa	xmm0, xmm1
 32083  	blendvpd	xmm6, xmm4, xmm0
 32084  	movupd	xmmword ptr [r8 + 8*rsi + 32], xmm5
 32085  	movupd	xmmword ptr [r8 + 8*rsi + 48], xmm6
 32086  	add	rsi, 8                          # 8 elements consumed per unrolled iteration
 32087  	add	rdi, 2                          # two steps done; ZF set when the negative counter reaches 0
 32088  	jne	.LBB4_886
 32089  	jmp	.LBB4_1061                      # continuation is outside this view
 32090  .LBB4_887:                              # Vector path: signed 32-bit lanes read from [rcx], float results (1.0 / 0.0 / -1.0) stored to [r8]
 32091  	mov	edx, eax                        # eax: presumably the element count, set before this view -- TODO confirm
 32092  	and	edx, -8                         # edx = count rounded down to a multiple of 8 (one vector step = 8 elements)
 32093  	lea	rsi, [rdx - 8]
 32094  	mov	r9, rsi
 32095  	shr	r9, 3
 32096  	add	r9, 1                           # r9 = (edx - 8) / 8 + 1 = number of 8-element steps
 32097  	test	rsi, rsi
 32098  	je	.LBB4_1066                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 32099  # %bb.888:
 32100  	mov	rdi, r9
 32101  	and	rdi, -2
 32102  	neg	rdi                             # rdi = -(steps rounded down to even); the loop adds 2 until it reaches 0
 32103  	xor	esi, esi                        # rsi = element index
 32104  	pxor	xmm2, xmm2                      # xmm2 = all zeros (comparison operand)
 32105  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 32106  	movaps	xmm4, xmmword ptr [rip + .LCPI4_19] # xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 32107  .LBB4_889:                              # =>This Inner Loop Header: Depth=1
 32108  	movdqu	xmm5, xmmword ptr [rcx + 4*rsi]
 32109  	movdqu	xmm6, xmmword ptr [rcx + 4*rsi + 16]
 32110  	movdqa	xmm0, xmm5
 32111  	pcmpgtd	xmm0, xmm2                      # mask of lanes with x > 0 (signed compare)
 32112  	movdqa	xmm1, xmm6
 32113  	pcmpgtd	xmm1, xmm2
 32114  	pcmpeqd	xmm5, xmm2                      # mask of lanes with x == 0
 32115  	pxor	xmm5, xmm3                      # invert: integer -1 where x != 0, 0 where x == 0
 32116  	cvtdq2ps	xmm5, xmm5                      # int32 -1/0 -> float -1.0/0.0
 32117  	pcmpeqd	xmm6, xmm2
 32118  	pxor	xmm6, xmm3
 32119  	cvtdq2ps	xmm6, xmm6
 32120  	blendvps	xmm5, xmm4, xmm0                # where x > 0 take 1.0 from xmm4; other lanes keep -1.0 / 0.0
 32121  	movdqa	xmm0, xmm1                      # blendvps selects on xmm0, so bring the second mask in
 32122  	blendvps	xmm6, xmm4, xmm0
 32123  	movups	xmmword ptr [r8 + 4*rsi], xmm5
 32124  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm6
 32125  	movdqu	xmm5, xmmword ptr [rcx + 4*rsi + 32] # second unrolled step: same sequence on the next 8 inputs
 32126  	movdqu	xmm6, xmmword ptr [rcx + 4*rsi + 48]
 32127  	movdqa	xmm0, xmm5
 32128  	pcmpgtd	xmm0, xmm2
 32129  	movdqa	xmm1, xmm6
 32130  	pcmpgtd	xmm1, xmm2
 32131  	pcmpeqd	xmm5, xmm2
 32132  	pxor	xmm5, xmm3
 32133  	cvtdq2ps	xmm5, xmm5
 32134  	pcmpeqd	xmm6, xmm2
 32135  	pxor	xmm6, xmm3
 32136  	cvtdq2ps	xmm6, xmm6
 32137  	blendvps	xmm5, xmm4, xmm0
 32138  	movdqa	xmm0, xmm1
 32139  	blendvps	xmm6, xmm4, xmm0
 32140  	movups	xmmword ptr [r8 + 4*rsi + 32], xmm5
 32141  	movups	xmmword ptr [r8 + 4*rsi + 48], xmm6
 32142  	add	rsi, 16                         # 16 elements consumed per unrolled iteration
 32143  	add	rdi, 2                          # two steps done; ZF set when the negative counter reaches 0
 32144  	jne	.LBB4_889
 32145  	jmp	.LBB4_1067                      # continuation is outside this view
 32146  .LBB4_945:                              # Vector path: doubles read from [rcx], 32-bit integer results stored to [r8]; result is +/-1 by sign bit, or 0 where x == 0.0
 32147  	mov	esi, eax                        # eax: presumably the element count, set before this view -- TODO confirm
 32148  	and	esi, -4                         # esi = count rounded down to a multiple of 4 (one vector step = 4 elements)
 32149  	lea	rdx, [rsi - 4]
 32150  	mov	r9, rdx
 32151  	shr	r9, 2
 32152  	add	r9, 1                           # r9 = (esi - 4) / 4 + 1 = number of 4-element steps
 32153  	test	rdx, rdx
 32154  	je	.LBB4_1076                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 32155  # %bb.946:
 32156  	mov	rdx, r9
 32157  	and	rdx, -2
 32158  	neg	rdx                             # rdx = -(steps rounded down to even); the loop adds 2 until it reaches 0
 32159  	xor	edi, edi                        # rdi = element index
 32160  	xorpd	xmm0, xmm0                      # xmm0 = 0.0 in both lanes (comparison operand)
 32161  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 32162  	movapd	xmm2, xmmword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0]
 32163  .LBB4_947:                              # =>This Inner Loop Header: Depth=1
 32164  	movupd	xmm3, xmmword ptr [rcx + 8*rdi]
 32165  	movupd	xmm4, xmmword ptr [rcx + 8*rdi + 16]
 32166  	movapd	xmm5, xmm3
 32167  	cmpeqpd	xmm5, xmm0                      # mask of lanes with x == 0.0
 32168  	shufps	xmm5, xmm5, 232                 # xmm5 = xmm5[0,2,2,3]
 32169  	movapd	xmm6, xmm4
 32170  	cmpeqpd	xmm6, xmm0
 32171  	shufps	xmm6, xmm6, 232                 # xmm6 = xmm6[0,2,2,3]
 32172  	andpd	xmm3, xmm1                      # isolate the sign bit of each input (-0.0 mask)
 32173  	orpd	xmm3, xmm2                      # sign | 1.0 => +1.0 or -1.0 carrying the input's sign
 32174  	andpd	xmm4, xmm1
 32175  	orpd	xmm4, xmm2
 32176  	cvttpd2dq	xmm3, xmm3              # two doubles -> two int32 in the low half
 32177  	cvttpd2dq	xmm4, xmm4
 32178  	andnps	xmm5, xmm3                      # clear the results whose input compared equal to 0.0
 32179  	andnps	xmm6, xmm4
 32180  	movlhps	xmm5, xmm6                      # xmm5 = xmm5[0],xmm6[0]
 32181  	movups	xmmword ptr [r8 + 4*rdi], xmm5  # store 4 int32 results
 32182  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 32] # second unrolled step: same computation on the next 4 inputs
 32183  	movupd	xmm4, xmmword ptr [rcx + 8*rdi + 48]
 32184  	movapd	xmm5, xmm3
 32185  	cmpeqpd	xmm5, xmm0
 32186  	shufps	xmm5, xmm5, 232                 # xmm5 = xmm5[0,2,2,3]
 32187  	movapd	xmm6, xmm4
 32188  	cmpeqpd	xmm6, xmm0
 32189  	shufps	xmm6, xmm6, 232                 # xmm6 = xmm6[0,2,2,3]
 32190  	andpd	xmm3, xmm1
 32191  	orpd	xmm3, xmm2
 32192  	andpd	xmm4, xmm1
 32193  	orpd	xmm4, xmm2
 32194  	cvttpd2dq	xmm3, xmm3
 32195  	andnps	xmm5, xmm3
 32196  	cvttpd2dq	xmm3, xmm4
 32197  	andnps	xmm6, xmm3
 32198  	movlhps	xmm5, xmm6                      # xmm5 = xmm5[0],xmm6[0]
 32199  	movups	xmmword ptr [r8 + 4*rdi + 16], xmm5
 32200  	add	rdi, 8                          # 8 elements consumed per unrolled iteration
 32201  	add	rdx, 2                          # two steps done; ZF set when the negative counter reaches 0
 32202  	jne	.LBB4_947
 32203  	jmp	.LBB4_1077                      # continuation is outside this view
 32204  .LBB4_953:                              # Vector path: 64-bit lanes read from [rcx], 32-bit results stored to [r8]; result is 1 where x != 0, else 0
 32205  	mov	edx, eax                        # eax: presumably the element count, set before this view -- TODO confirm
 32206  	and	edx, -4                         # edx = count rounded down to a multiple of 4 (one vector step = 4 elements)
 32207  	lea	rsi, [rdx - 4]
 32208  	mov	r9, rsi
 32209  	shr	r9, 2
 32210  	add	r9, 1                           # r9 = (edx - 4) / 4 + 1 = number of 4-element steps
 32211  	test	rsi, rsi
 32212  	je	.LBB4_1082                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 32213  # %bb.954:
 32214  	mov	rdi, r9
 32215  	and	rdi, -2
 32216  	neg	rdi                             # rdi = -(steps rounded down to even); the loop adds 2 until it reaches 0
 32217  	xor	esi, esi                        # rsi = element index
 32218  	pxor	xmm0, xmm0                      # xmm0 = all zeros (comparison operand)
 32219  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_16] # xmm1 = <1,1,u,u>
 32220  .LBB4_955:                              # =>This Inner Loop Header: Depth=1
 32221  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi]
 32222  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 16]
 32223  	pcmpeqq	xmm2, xmm0                      # qword mask of lanes with x == 0
 32224  	pshufd	xmm2, xmm2, 232                 # xmm2 = xmm2[0,2,2,3]
 32225  	pandn	xmm2, xmm1                      # (~mask) & 1: 1 for nonzero inputs, 0 otherwise (low two dwords)
 32226  	pcmpeqq	xmm3, xmm0
 32227  	pshufd	xmm3, xmm3, 232                 # xmm3 = xmm3[0,2,2,3]
 32228  	pandn	xmm3, xmm1
 32229  	punpcklqdq	xmm2, xmm3              # xmm2 = xmm2[0],xmm3[0]
 32230  	movdqu	xmmword ptr [r8 + 4*rsi], xmm2  # store 4 32-bit results
 32231  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi + 32] # second unrolled step: same sequence on the next 4 inputs
 32232  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 48]
 32233  	pcmpeqq	xmm2, xmm0
 32234  	pshufd	xmm2, xmm2, 232                 # xmm2 = xmm2[0,2,2,3]
 32235  	pandn	xmm2, xmm1
 32236  	pcmpeqq	xmm3, xmm0
 32237  	pshufd	xmm3, xmm3, 232                 # xmm3 = xmm3[0,2,2,3]
 32238  	pandn	xmm3, xmm1
 32239  	punpcklqdq	xmm2, xmm3              # xmm2 = xmm2[0],xmm3[0]
 32240  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm2
 32241  	add	rsi, 8                          # 8 elements consumed per unrolled iteration
 32242  	add	rdi, 2                          # two steps done; ZF set when the negative counter reaches 0
 32243  	jne	.LBB4_955
 32244  	jmp	.LBB4_1083                      # continuation is outside this view
 32245  .LBB4_956:                              # Vector path: 16-bit lanes read from [rcx], 32-bit results stored to [r8]; result is 1 where x != 0, else 0
 32246  	mov	edx, eax                        # eax: presumably the element count, set before this view -- TODO confirm
 32247  	and	edx, -8                         # edx = count rounded down to a multiple of 8 (one vector step = 8 elements)
 32248  	lea	rsi, [rdx - 8]
 32249  	mov	r9, rsi
 32250  	shr	r9, 3
 32251  	add	r9, 1                           # r9 = (edx - 8) / 8 + 1 = number of 8-element steps
 32252  	test	rsi, rsi
 32253  	je	.LBB4_1087                      # exactly one step: bypass the 2x-unrolled loop (target is outside this view)
 32254  # %bb.957:
 32255  	mov	rdi, r9
 32256  	and	rdi, -2
 32257  	neg	rdi                             # rdi = -(steps rounded down to even); the loop adds 2 until it reaches 0
 32258  	xor	esi, esi                        # rsi = element index
 32259  	pxor	xmm0, xmm0                      # xmm0 = all zeros (comparison operand)
 32260  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones (used to invert masks)
 32261  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1]
 32262  .LBB4_958:                              # =>This Inner Loop Header: Depth=1
 32263  	movq	xmm3, qword ptr [rcx + 2*rsi]   # xmm3 = mem[0],zero
 32264  	movq	xmm4, qword ptr [rcx + 2*rsi + 8] # xmm4 = mem[0],zero
 32265  	pcmpeqw	xmm3, xmm0                      # mask of word lanes with x == 0
 32266  	pxor	xmm3, xmm1                      # invert: all ones where x != 0
 32267  	pmovzxwd	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 32268  	pand	xmm3, xmm2                      # keep bit 0 only: 1 for nonzero inputs, 0 otherwise
 32269  	pcmpeqw	xmm4, xmm0
 32270  	pxor	xmm4, xmm1
 32271  	pmovzxwd	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 32272  	pand	xmm4, xmm2
 32273  	movdqu	xmmword ptr [r8 + 4*rsi], xmm3
 32274  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm4
 32275  	movq	xmm3, qword ptr [rcx + 2*rsi + 16] # xmm3 = mem[0],zero
 32276  	movq	xmm4, qword ptr [rcx + 2*rsi + 24] # xmm4 = mem[0],zero
 32277  	pcmpeqw	xmm3, xmm0                      # second unrolled step: same sequence on the next 8 inputs
 32278  	pxor	xmm3, xmm1
 32279  	pmovzxwd	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 32280  	pand	xmm3, xmm2
 32281  	pcmpeqw	xmm4, xmm0
 32282  	pxor	xmm4, xmm1
 32283  	pmovzxwd	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 32284  	pand	xmm4, xmm2
 32285  	movdqu	xmmword ptr [r8 + 4*rsi + 32], xmm3
 32286  	movdqu	xmmword ptr [r8 + 4*rsi + 48], xmm4
 32287  	add	rsi, 16                         # 16 elements consumed per unrolled iteration
 32288  	add	rdi, 2                          # two steps done; ZF set when the negative counter reaches 0
 32289  	jne	.LBB4_958
 32290  	jmp	.LBB4_1088                      # continuation is outside this view
 32291  .LBB4_959:
        	# Vector path: signed 16-bit elements are read from [rcx] and 32-bit results
        	# are written to [r8]; each result is 1 if x > 0, -1 if x < 0, 0 if x == 0.
        	# rdx = r10d rounded down to a multiple of 8 (r10d is presumably the element
        	# count -- confirm at the branch into this label).
 32292  	mov	edx, r10d
 32293  	and	edx, -8
        	# r9 = (rdx - 8) / 8 + 1 = number of 8-element groups.
 32294  	lea	rsi, [rdx - 8]
 32295  	mov	r9, rsi
 32296  	shr	r9, 3
 32297  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32298  	test	rsi, rsi
 32299  	je	.LBB4_1092
 32300  # %bb.960:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 32301  	mov	rdi, r9
 32302  	and	rdi, -2
 32303  	neg	rdi
 32304  	xor	esi, esi
        	# xmm2 = zero, xmm3 = all ones, xmm4 = dword 1s.
 32305  	pxor	xmm2, xmm2
 32306  	pcmpeqd	xmm3, xmm3
 32307  	movaps	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 32308  .LBB4_961:                              # =>This Inner Loop Header: Depth=1
        	# First group (two 4-element halves in xmm5/xmm6).
 32309  	movq	xmm5, qword ptr [rcx + 2*rsi]   # xmm5 = mem[0],zero
 32310  	movq	xmm6, qword ptr [rcx + 2*rsi + 8] # xmm6 = mem[0],zero
        	# xmm0/xmm1 = (x > 0) masks, sign-extended to dword lanes.
 32311  	movdqa	xmm0, xmm5
 32312  	pcmpgtw	xmm0, xmm2
 32313  	pmovsxwd	xmm0, xmm0
 32314  	movdqa	xmm1, xmm6
 32315  	pcmpgtw	xmm1, xmm2
 32316  	pmovsxwd	xmm1, xmm1
        	# xmm5/xmm6 = (x != 0) masks sign-extended to dwords, i.e. -1 or 0.
 32317  	pcmpeqw	xmm5, xmm2
 32318  	pxor	xmm5, xmm3
 32319  	pmovsxwd	xmm5, xmm5
 32320  	pcmpeqw	xmm6, xmm2
 32321  	pxor	xmm6, xmm3
 32322  	pmovsxwd	xmm6, xmm6
        	# Replace lanes where x > 0 with 1. blendvps reads its mask from xmm0,
        	# hence the copy of xmm1 into xmm0 before the second blend.
 32323  	blendvps	xmm5, xmm4, xmm0
 32324  	movdqa	xmm0, xmm1
 32325  	blendvps	xmm6, xmm4, xmm0
 32326  	movups	xmmword ptr [r8 + 4*rsi], xmm5
 32327  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm6
        	# Second group: same computation on the next 8 elements.
 32328  	movq	xmm5, qword ptr [rcx + 2*rsi + 16] # xmm5 = mem[0],zero
 32329  	movq	xmm6, qword ptr [rcx + 2*rsi + 24] # xmm6 = mem[0],zero
 32330  	movdqa	xmm0, xmm5
 32331  	pcmpgtw	xmm0, xmm2
 32332  	pmovsxwd	xmm0, xmm0
 32333  	movdqa	xmm1, xmm6
 32334  	pcmpgtw	xmm1, xmm2
 32335  	pmovsxwd	xmm1, xmm1
 32336  	pcmpeqw	xmm5, xmm2
 32337  	pxor	xmm5, xmm3
 32338  	pmovsxwd	xmm5, xmm5
 32339  	pcmpeqw	xmm6, xmm2
 32340  	pxor	xmm6, xmm3
 32341  	pmovsxwd	xmm6, xmm6
 32342  	blendvps	xmm5, xmm4, xmm0
 32343  	movdqa	xmm0, xmm1
 32344  	blendvps	xmm6, xmm4, xmm0
 32345  	movups	xmmword ptr [r8 + 4*rsi + 32], xmm5
 32346  	movups	xmmword ptr [r8 + 4*rsi + 48], xmm6
        	# Advance by 16 elements (two groups) and repeat until rdi reaches zero.
 32347  	add	rsi, 16
 32348  	add	rdi, 2
 32349  	jne	.LBB4_961
        	# Continuation label is outside this chunk.
 32350  	jmp	.LBB4_1093
 32351  .LBB4_962:
        	# Vector path: signed 64-bit elements are read from [rcx] and 32-bit results
        	# are written to [r8]; each result is 1 if x > 0, -1 if x < 0, 0 if x == 0.
        	# rdx = r10d rounded down to a multiple of 4 (r10d is presumably the element
        	# count -- confirm at the branch into this label).
 32352  	mov	edx, r10d
 32353  	and	edx, -4
        	# r9 = (rdx - 4) / 4 + 1 = number of 4-element groups.
 32354  	lea	rsi, [rdx - 4]
 32355  	mov	r9, rsi
 32356  	shr	r9, 2
 32357  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32358  	test	rsi, rsi
 32359  	je	.LBB4_1098
 32360  # %bb.963:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 32361  	mov	rdi, r9
 32362  	and	rdi, -2
 32363  	neg	rdi
 32364  	xor	esi, esi
        	# xmm2 = zero, xmm3 = all ones, xmm4 = dword 1s in the two low lanes.
 32365  	pxor	xmm2, xmm2
 32366  	pcmpeqd	xmm3, xmm3
 32367  	movaps	xmm4, xmmword ptr [rip + .LCPI4_16] # xmm4 = <1,1,u,u>
 32368  .LBB4_964:                              # =>This Inner Loop Header: Depth=1
        	# First group: two vectors of two qwords each.
 32369  	movdqu	xmm5, xmmword ptr [rcx + 8*rsi]
 32370  	movdqu	xmm6, xmmword ptr [rcx + 8*rsi + 16]
        	# xmm0/xmm1 = (x > 0) qword masks; pshufd moves the low dword of each
        	# qword into the two low dword lanes.
 32371  	movdqa	xmm0, xmm5
 32372  	pcmpgtq	xmm0, xmm2
 32373  	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
 32374  	movdqa	xmm1, xmm6
 32375  	pcmpgtq	xmm1, xmm2
 32376  	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
        	# xmm5/xmm6 = (x != 0) masks narrowed the same way, i.e. -1 or 0.
 32377  	pcmpeqq	xmm5, xmm2
 32378  	pshufd	xmm5, xmm5, 232                 # xmm5 = xmm5[0,2,2,3]
 32379  	pxor	xmm5, xmm3
 32380  	pcmpeqq	xmm6, xmm2
 32381  	pshufd	xmm6, xmm6, 232                 # xmm6 = xmm6[0,2,2,3]
 32382  	pxor	xmm6, xmm3
        	# Replace lanes where x > 0 with 1 (blendvps mask is implicitly xmm0),
        	# then join the two 2-dword halves and store 4 results.
 32383  	blendvps	xmm5, xmm4, xmm0
 32384  	movdqa	xmm0, xmm1
 32385  	blendvps	xmm6, xmm4, xmm0
 32386  	movlhps	xmm5, xmm6                      # xmm5 = xmm5[0],xmm6[0]
 32387  	movups	xmmword ptr [r8 + 4*rsi], xmm5
        	# Second group: same computation on the next 4 elements.
 32388  	movdqu	xmm5, xmmword ptr [rcx + 8*rsi + 32]
 32389  	movdqu	xmm6, xmmword ptr [rcx + 8*rsi + 48]
 32390  	movdqa	xmm0, xmm5
 32391  	pcmpgtq	xmm0, xmm2
 32392  	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
 32393  	movdqa	xmm1, xmm6
 32394  	pcmpgtq	xmm1, xmm2
 32395  	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
 32396  	pcmpeqq	xmm5, xmm2
 32397  	pshufd	xmm5, xmm5, 232                 # xmm5 = xmm5[0,2,2,3]
 32398  	pxor	xmm5, xmm3
 32399  	pcmpeqq	xmm6, xmm2
 32400  	pshufd	xmm6, xmm6, 232                 # xmm6 = xmm6[0,2,2,3]
 32401  	pxor	xmm6, xmm3
 32402  	blendvps	xmm5, xmm4, xmm0
 32403  	movdqa	xmm0, xmm1
 32404  	blendvps	xmm6, xmm4, xmm0
 32405  	movlhps	xmm5, xmm6                      # xmm5 = xmm5[0],xmm6[0]
 32406  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm5
        	# Advance by 8 elements (two groups) and repeat until rdi reaches zero.
 32407  	add	rsi, 8
 32408  	add	rdi, 2
 32409  	jne	.LBB4_964
        	# Continuation label is outside this chunk.
 32410  	jmp	.LBB4_1099
 32411  .LBB4_965:
        	# float32 elements are read from [rcx] and int32 results are written to
        	# [r8]: 0 when x == 0.0, otherwise -1 or 1 according to the sign bit of x.
        	# A vector loop handles 8 elements per iteration; a scalar loop finishes
        	# the remaining elements.
        	# rdx = eax rounded down to a multiple of 8; rax is compared against the
        	# index below, so it is presumably the element count -- confirm upstream.
 32412  	mov	edx, eax
 32413  	and	edx, -8
 32414  	xor	esi, esi
        	# xmm0 = 0.0 in every lane, xmm1 = dword 1s.
 32415  	xorps	xmm0, xmm0
 32416  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_8] # xmm1 = [1,1,1,1]
 32417  .LBB4_966:                              # =>This Inner Loop Header: Depth=1
 32418  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi]
 32419  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 16]
        	# (x >> 31) | 1 on the raw bits gives -1 when the sign bit is set, else 1.
 32420  	movdqa	xmm4, xmm2
 32421  	psrad	xmm4, 31
 32422  	por	xmm4, xmm1
 32423  	movdqa	xmm5, xmm3
 32424  	psrad	xmm5, 31
 32425  	por	xmm5, xmm1
        	# Round trip int32 -> float32 -> int32 of the +/-1 value.
 32426  	cvtdq2ps	xmm4, xmm4
 32427  	cvtdq2ps	xmm5, xmm5
 32428  	cvttps2dq	xmm4, xmm4
 32429  	cvttps2dq	xmm5, xmm5
        	# Keep the +/-1 only in lanes where x != 0.0; other lanes become 0.
 32430  	cmpneqps	xmm2, xmm0
 32431  	andps	xmm2, xmm4
 32432  	cmpneqps	xmm3, xmm0
 32433  	andps	xmm3, xmm5
 32434  	movups	xmmword ptr [r8 + 4*rsi], xmm2
 32435  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm3
 32436  	add	rsi, 8
 32437  	cmp	rdx, rsi
 32438  	jne	.LBB4_966
 32439  # %bb.967:
        	# All elements were covered by the vector loop: go to the label outside
        	# this chunk; otherwise fall into the scalar loop with rdx as the index.
 32440  	cmp	rdx, rax
 32441  	je	.LBB4_1655
 32442  .LBB4_968:
 32443  	xorps	xmm0, xmm0
 32444  	jmp	.LBB4_970
 32445  .LBB4_969:                              #   in Loop: Header=BB4_970 Depth=1
        	# Store the scalar result held in esi and advance to the next element.
 32446  	mov	dword ptr [r8 + 4*rdx], esi
 32447  	add	rdx, 1
 32448  	cmp	rax, rdx
 32449  	je	.LBB4_1655
 32450  .LBB4_970:                              # =>This Inner Loop Header: Depth=1
        	# esi = 0 is the result when x compares equal to 0.0.
        	# NOTE(review): ucomiss also sets ZF for unordered operands, so this je
        	# appears to be taken for NaN as well (result 0), whereas the vector loop's
        	# cmpneqps keeps +/-1 for NaN -- confirm whether NaN inputs matter here.
 32451  	movss	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 32452  	xor	esi, esi
 32453  	ucomiss	xmm0, xmm1
 32454  	je	.LBB4_969
 32455  # %bb.971:                              #   in Loop: Header=BB4_970 Depth=1
        	# esi = -(sign bit of x) | 1, i.e. -1 or 1, then the same
        	# int32 -> float32 -> int32 round trip as in the vector loop.
 32456  	movmskps	esi, xmm1
 32457  	and	esi, 1
 32458  	neg	esi
 32459  	or	esi, 1
 32460  	xorps	xmm1, xmm1
 32461  	cvtsi2ss	xmm1, esi
 32462  	cvttss2si	esi, xmm1
 32463  	jmp	.LBB4_969
 32464  .LBB4_496:
        	# Vector path: 32-bit elements are read from [rcx] and 32-bit results are
        	# written to [r8]; each result is 1 if the input element is nonzero, else 0.
        	# rdx = r10d rounded down to a multiple of 8 (r10d is presumably the element
        	# count -- confirm at the branch into this label).
 32465  	mov	edx, r10d
 32466  	and	edx, -8
        	# r9 = (rdx - 8) / 8 + 1 = number of 8-element groups.
 32467  	lea	rsi, [rdx - 8]
 32468  	mov	r9, rsi
 32469  	shr	r9, 3
 32470  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32471  	test	rsi, rsi
 32472  	je	.LBB4_1228
 32473  # %bb.497:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 32474  	mov	rdi, r9
 32475  	and	rdi, -2
 32476  	neg	rdi
 32477  	xor	esi, esi
        	# xmm0 = zero, xmm1 = dword 1s.
 32478  	pxor	xmm0, xmm0
 32479  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_8] # xmm1 = [1,1,1,1]
 32480  .LBB4_498:                              # =>This Inner Loop Header: Depth=1
        	# First group: pcmpeqd builds the (x == 0) mask, pandn computes
        	# ~mask & 1, leaving 1 in nonzero lanes and 0 elsewhere.
 32481  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi]
 32482  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 16]
 32483  	pcmpeqd	xmm2, xmm0
 32484  	pandn	xmm2, xmm1
 32485  	pcmpeqd	xmm3, xmm0
 32486  	pandn	xmm3, xmm1
 32487  	movdqu	xmmword ptr [r8 + 4*rsi], xmm2
 32488  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm3
        	# Second group: same computation on the next 8 elements.
 32489  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi + 32]
 32490  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 48]
 32491  	pcmpeqd	xmm2, xmm0
 32492  	pandn	xmm2, xmm1
 32493  	pcmpeqd	xmm3, xmm0
 32494  	pandn	xmm3, xmm1
 32495  	movdqu	xmmword ptr [r8 + 4*rsi + 32], xmm2
 32496  	movdqu	xmmword ptr [r8 + 4*rsi + 48], xmm3
        	# Advance by 16 elements (two groups) and repeat until rdi reaches zero.
 32497  	add	rsi, 16
 32498  	add	rdi, 2
 32499  	jne	.LBB4_498
        	# Continuation label is outside this chunk.
 32500  	jmp	.LBB4_1229
 32501  .LBB4_504:
        	# Vector path: signed 8-bit elements are read from [rcx] and 32-bit results
        	# are written to [r8]; each result is 1 if x > 0, -1 if x < 0, 0 if x == 0.
        	# rdx = r10d rounded down to a multiple of 8 (r10d is presumably the element
        	# count -- confirm at the branch into this label).
 32502  	mov	edx, r10d
 32503  	and	edx, -8
        	# r9 = (rdx - 8) / 8 + 1 = number of 8-element groups.
 32504  	lea	rsi, [rdx - 8]
 32505  	mov	r9, rsi
 32506  	shr	r9, 3
 32507  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32508  	test	rsi, rsi
 32509  	je	.LBB4_1236
 32510  # %bb.505:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 32511  	mov	rdi, r9
 32512  	and	rdi, -2
 32513  	neg	rdi
 32514  	xor	esi, esi
        	# xmm2 = zero, xmm3 = all ones, xmm4 = dword 1s.
 32515  	pxor	xmm2, xmm2
 32516  	pcmpeqd	xmm3, xmm3
 32517  	movaps	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 32518  .LBB4_506:                              # =>This Inner Loop Header: Depth=1
        	# First group: two 4-byte loads (4 elements each).
 32519  	movd	xmm5, dword ptr [rcx + rsi]     # xmm5 = mem[0],zero,zero,zero
 32520  	movd	xmm6, dword ptr [rcx + rsi + 4] # xmm6 = mem[0],zero,zero,zero
        	# xmm0/xmm1 = (x > 0) byte masks sign-extended to dword lanes.
 32521  	movdqa	xmm0, xmm5
 32522  	pcmpgtb	xmm0, xmm2
 32523  	pmovsxbd	xmm0, xmm0
 32524  	movdqa	xmm1, xmm6
 32525  	pcmpgtb	xmm1, xmm2
 32526  	pmovsxbd	xmm1, xmm1
        	# xmm5/xmm6 = (x != 0) masks sign-extended to dwords, i.e. -1 or 0.
 32527  	pcmpeqb	xmm5, xmm2
 32528  	pxor	xmm5, xmm3
 32529  	pmovsxbd	xmm5, xmm5
 32530  	pcmpeqb	xmm6, xmm2
 32531  	pxor	xmm6, xmm3
 32532  	pmovsxbd	xmm6, xmm6
        	# Replace lanes where x > 0 with 1 (blendvps mask is implicitly xmm0).
 32533  	blendvps	xmm5, xmm4, xmm0
 32534  	movdqa	xmm0, xmm1
 32535  	blendvps	xmm6, xmm4, xmm0
 32536  	movups	xmmword ptr [r8 + 4*rsi], xmm5
 32537  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm6
        	# Second group: same computation on the next 8 elements.
 32538  	movd	xmm5, dword ptr [rcx + rsi + 8] # xmm5 = mem[0],zero,zero,zero
 32539  	movd	xmm6, dword ptr [rcx + rsi + 12] # xmm6 = mem[0],zero,zero,zero
 32540  	movdqa	xmm0, xmm5
 32541  	pcmpgtb	xmm0, xmm2
 32542  	pmovsxbd	xmm0, xmm0
 32543  	movdqa	xmm1, xmm6
 32544  	pcmpgtb	xmm1, xmm2
 32545  	pmovsxbd	xmm1, xmm1
 32546  	pcmpeqb	xmm5, xmm2
 32547  	pxor	xmm5, xmm3
 32548  	pmovsxbd	xmm5, xmm5
 32549  	pcmpeqb	xmm6, xmm2
 32550  	pxor	xmm6, xmm3
 32551  	pmovsxbd	xmm6, xmm6
 32552  	blendvps	xmm5, xmm4, xmm0
 32553  	movdqa	xmm0, xmm1
 32554  	blendvps	xmm6, xmm4, xmm0
 32555  	movups	xmmword ptr [r8 + 4*rsi + 32], xmm5
 32556  	movups	xmmword ptr [r8 + 4*rsi + 48], xmm6
        	# Advance by 16 elements (two groups) and repeat until rdi reaches zero.
 32557  	add	rsi, 16
 32558  	add	rdi, 2
 32559  	jne	.LBB4_506
        	# Continuation label is outside this chunk.
 32560  	jmp	.LBB4_1237
 32561  .LBB4_524:
        	# Vector path: 8-bit elements are read from [rcx] and 32-bit results are
        	# written to [r8]; each result is 1 if the input element is nonzero, else 0.
        	# rdx = r10d rounded down to a multiple of 8 (r10d is presumably the element
        	# count -- confirm at the branch into this label).
 32562  	mov	edx, r10d
 32563  	and	edx, -8
        	# r9 = (rdx - 8) / 8 + 1 = number of 8-element groups.
 32564  	lea	rsi, [rdx - 8]
 32565  	mov	r9, rsi
 32566  	shr	r9, 3
 32567  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32568  	test	rsi, rsi
 32569  	je	.LBB4_1245
 32570  # %bb.525:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 32571  	mov	rdi, r9
 32572  	and	rdi, -2
 32573  	neg	rdi
 32574  	xor	esi, esi
        	# xmm0 = zero, xmm1 = all ones, xmm2 = dword 1s.
 32575  	pxor	xmm0, xmm0
 32576  	pcmpeqd	xmm1, xmm1
 32577  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1]
 32578  .LBB4_526:                              # =>This Inner Loop Header: Depth=1
        	# First group: pcmpeqb + pxor gives an (x != 0) byte mask, pmovzxbd
        	# widens it to dwords, pand reduces it to 0/1.
 32579  	movd	xmm3, dword ptr [rcx + rsi]     # xmm3 = mem[0],zero,zero,zero
 32580  	movd	xmm4, dword ptr [rcx + rsi + 4] # xmm4 = mem[0],zero,zero,zero
 32581  	pcmpeqb	xmm3, xmm0
 32582  	pxor	xmm3, xmm1
 32583  	pmovzxbd	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 32584  	pand	xmm3, xmm2
 32585  	pcmpeqb	xmm4, xmm0
 32586  	pxor	xmm4, xmm1
 32587  	pmovzxbd	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
 32588  	pand	xmm4, xmm2
 32589  	movdqu	xmmword ptr [r8 + 4*rsi], xmm3
 32590  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm4
        	# Second group: same computation on the next 8 elements.
 32591  	movd	xmm3, dword ptr [rcx + rsi + 8] # xmm3 = mem[0],zero,zero,zero
 32592  	movd	xmm4, dword ptr [rcx + rsi + 12] # xmm4 = mem[0],zero,zero,zero
 32593  	pcmpeqb	xmm3, xmm0
 32594  	pxor	xmm3, xmm1
 32595  	pmovzxbd	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 32596  	pand	xmm3, xmm2
 32597  	pcmpeqb	xmm4, xmm0
 32598  	pxor	xmm4, xmm1
 32599  	pmovzxbd	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
 32600  	pand	xmm4, xmm2
 32601  	movdqu	xmmword ptr [r8 + 4*rsi + 32], xmm3
 32602  	movdqu	xmmword ptr [r8 + 4*rsi + 48], xmm4
        	# Advance by 16 elements (two groups) and repeat until rdi reaches zero.
 32603  	add	rsi, 16
 32604  	add	rdi, 2
 32605  	jne	.LBB4_526
        	# Continuation label is outside this chunk.
 32606  	jmp	.LBB4_1246
 32607  .LBB4_529:
        	# Vector path: signed 32-bit elements are read from [rcx] and 32-bit results
        	# are written to [r8]; each result is 1 if x > 0, -1 if x < 0, 0 if x == 0.
        	# rdx = r11d rounded down to a multiple of 8 (r11d is presumably the element
        	# count -- confirm at the branch into this label).
 32608  	mov	edx, r11d
 32609  	and	edx, -8
        	# r9 = (rdx - 8) / 8 + 1 = number of 8-element groups.
 32610  	lea	rsi, [rdx - 8]
 32611  	mov	r9, rsi
 32612  	shr	r9, 3
 32613  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32614  	test	rsi, rsi
 32615  	je	.LBB4_1253
 32616  # %bb.530:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 32617  	mov	rdi, r9
 32618  	and	rdi, -2
 32619  	neg	rdi
 32620  	xor	esi, esi
        	# xmm2 = zero, xmm3 = all ones, xmm4 = dword 1s.
 32621  	pxor	xmm2, xmm2
 32622  	pcmpeqd	xmm3, xmm3
 32623  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 32624  .LBB4_531:                              # =>This Inner Loop Header: Depth=1
 32625  	movdqu	xmm5, xmmword ptr [rcx + 4*rsi]
 32626  	movdqu	xmm6, xmmword ptr [rcx + 4*rsi + 16]
        	# xmm0/xmm1 = (1 > x) masks, i.e. x <= 0; xmm5/xmm6 = (x != 0) as -1 or 0.
 32627  	movdqa	xmm0, xmm4
 32628  	pcmpgtd	xmm0, xmm5
 32629  	pcmpeqd	xmm5, xmm2
 32630  	pxor	xmm5, xmm3
 32631  	movdqa	xmm1, xmm4
 32632  	pcmpgtd	xmm1, xmm6
 32633  	pcmpeqd	xmm6, xmm2
 32634  	pxor	xmm6, xmm3
        	# Start from 1 in every lane and, where x <= 0, substitute the -1/0 value
        	# (blendvps mask is implicitly xmm0).
 32635  	movdqa	xmm7, xmm4
 32636  	blendvps	xmm7, xmm5, xmm0
 32637  	movdqa	xmm5, xmm4
 32638  	movdqa	xmm0, xmm1
 32639  	blendvps	xmm5, xmm6, xmm0
 32640  	movups	xmmword ptr [r8 + 4*rsi], xmm7
 32641  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm5
        	# Second group: same computation on the next 8 elements.
 32642  	movdqu	xmm5, xmmword ptr [rcx + 4*rsi + 32]
 32643  	movdqu	xmm6, xmmword ptr [rcx + 4*rsi + 48]
 32644  	movdqa	xmm0, xmm4
 32645  	pcmpgtd	xmm0, xmm5
 32646  	pcmpeqd	xmm5, xmm2
 32647  	pxor	xmm5, xmm3
 32648  	movdqa	xmm1, xmm4
 32649  	pcmpgtd	xmm1, xmm6
 32650  	pcmpeqd	xmm6, xmm2
 32651  	pxor	xmm6, xmm3
 32652  	movdqa	xmm7, xmm4
 32653  	blendvps	xmm7, xmm5, xmm0
 32654  	movdqa	xmm5, xmm4
 32655  	movdqa	xmm0, xmm1
 32656  	blendvps	xmm5, xmm6, xmm0
 32657  	movups	xmmword ptr [r8 + 4*rsi + 32], xmm7
 32658  	movups	xmmword ptr [r8 + 4*rsi + 48], xmm5
        	# Advance by 16 elements (two groups) and repeat until rdi reaches zero.
 32659  	add	rsi, 16
 32660  	add	rdi, 2
 32661  	jne	.LBB4_531
        	# Continuation label is outside this chunk.
 32662  	jmp	.LBB4_1254
 32663  .LBB4_544:
        	# Vector path: float64 elements are read from [rcx] and float64 results are
        	# written to [r8]; each result is 1.0 carrying the sign bit of x where
        	# x != 0.0, and +0.0 where x == 0.0.
        	# rdx = eax rounded down to a multiple of 4 (eax is presumably the element
        	# count -- confirm at the branch into this label).
 32664  	mov	edx, eax
 32665  	and	edx, -4
        	# r9 = (rdx - 4) / 4 + 1 = number of 4-element groups.
 32666  	lea	rsi, [rdx - 4]
 32667  	mov	r9, rsi
 32668  	shr	r9, 2
 32669  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32670  	test	rsi, rsi
 32671  	je	.LBB4_1262
 32672  # %bb.545:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 32673  	mov	rdi, r9
 32674  	and	rdi, -2
 32675  	neg	rdi
 32676  	xor	esi, esi
        	# xmm0 = 0.0, xmm1 = sign-bit mask (-0.0), xmm2 = 1.0.
 32677  	xorpd	xmm0, xmm0
 32678  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 32679  	movapd	xmm2, xmmword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0]
 32680  .LBB4_546:                              # =>This Inner Loop Header: Depth=1
 32681  	movupd	xmm3, xmmword ptr [rcx + 8*rsi]
 32682  	movupd	xmm4, xmmword ptr [rcx + 8*rsi + 16]
        	# xmm5/xmm6 = (x & sign mask) | 1.0, i.e. 1.0 with the sign bit of x.
 32683  	movapd	xmm5, xmm3
 32684  	andpd	xmm5, xmm1
 32685  	orpd	xmm5, xmm2
 32686  	movapd	xmm6, xmm4
 32687  	andpd	xmm6, xmm1
 32688  	orpd	xmm6, xmm2
        	# Keep that value only where x != 0.0; other lanes become all-zero bits.
 32689  	cmpneqpd	xmm3, xmm0
 32690  	andpd	xmm3, xmm5
 32691  	cmpneqpd	xmm4, xmm0
 32692  	andpd	xmm4, xmm6
 32693  	movupd	xmmword ptr [r8 + 8*rsi], xmm3
 32694  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm4
        	# Second group: same computation on the next 4 elements.
 32695  	movupd	xmm3, xmmword ptr [rcx + 8*rsi + 32]
 32696  	movupd	xmm4, xmmword ptr [rcx + 8*rsi + 48]
 32697  	movapd	xmm5, xmm3
 32698  	andpd	xmm5, xmm1
 32699  	orpd	xmm5, xmm2
 32700  	movapd	xmm6, xmm4
 32701  	andpd	xmm6, xmm1
 32702  	orpd	xmm6, xmm2
 32703  	cmpneqpd	xmm3, xmm0
 32704  	andpd	xmm3, xmm5
 32705  	cmpneqpd	xmm4, xmm0
 32706  	andpd	xmm4, xmm6
 32707  	movupd	xmmword ptr [r8 + 8*rsi + 32], xmm3
 32708  	movupd	xmmword ptr [r8 + 8*rsi + 48], xmm4
        	# Advance by 8 elements (two groups) and repeat until rdi reaches zero.
 32709  	add	rsi, 8
 32710  	add	rdi, 2
 32711  	jne	.LBB4_546
        	# Continuation label is outside this chunk.
 32712  	jmp	.LBB4_1263
 32713  .LBB4_625:
        	# Vector path: 32-bit elements are read from [rcx] and 8-bit results are
        	# written to [r8]; each result is 1 if the input element is nonzero, else 0.
        	# rdx = eax rounded down to a multiple of 8 (eax is presumably the element
        	# count -- confirm at the branch into this label).
 32714  	mov	edx, eax
 32715  	and	edx, -8
        	# r9 = (rdx - 8) / 8 + 1 = number of 8-element groups.
 32716  	lea	rsi, [rdx - 8]
 32717  	mov	r9, rsi
 32718  	shr	r9, 3
 32719  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32720  	test	rsi, rsi
 32721  	je	.LBB4_1271
 32722  # %bb.626:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 32723  	mov	rdi, r9
 32724  	and	rdi, -2
 32725  	neg	rdi
 32726  	xor	esi, esi
        	# xmm0 = zero, xmm1 = all ones, xmm2 = byte 1s in the four low bytes.
 32727  	pxor	xmm0, xmm0
 32728  	pcmpeqd	xmm1, xmm1
 32729  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_12] # xmm2 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u>
 32730  .LBB4_627:                              # =>This Inner Loop Header: Depth=1
        	# First group: build the (x != 0) dword mask, narrow it to bytes with
        	# packssdw/packsswb, reduce to 0/1 with pand, store 4 bytes per vector.
 32731  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi]
 32732  	movdqu	xmm4, xmmword ptr [rcx + 4*rsi + 16]
 32733  	pcmpeqd	xmm3, xmm0
 32734  	pxor	xmm3, xmm1
 32735  	packssdw	xmm3, xmm3
 32736  	packsswb	xmm3, xmm3
 32737  	pand	xmm3, xmm2
 32738  	pcmpeqd	xmm4, xmm0
 32739  	pxor	xmm4, xmm1
 32740  	packssdw	xmm4, xmm4
 32741  	packsswb	xmm4, xmm4
 32742  	pand	xmm4, xmm2
 32743  	movd	dword ptr [r8 + rsi], xmm3
 32744  	movd	dword ptr [r8 + rsi + 4], xmm4
        	# Second group: same computation on the next 8 elements.
 32745  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 32]
 32746  	movdqu	xmm4, xmmword ptr [rcx + 4*rsi + 48]
 32747  	pcmpeqd	xmm3, xmm0
 32748  	pxor	xmm3, xmm1
 32749  	packssdw	xmm3, xmm3
 32750  	packsswb	xmm3, xmm3
 32751  	pand	xmm3, xmm2
 32752  	pcmpeqd	xmm4, xmm0
 32753  	pxor	xmm4, xmm1
 32754  	packssdw	xmm4, xmm4
 32755  	packsswb	xmm4, xmm4
 32756  	pand	xmm4, xmm2
 32757  	movd	dword ptr [r8 + rsi + 8], xmm3
 32758  	movd	dword ptr [r8 + rsi + 12], xmm4
        	# Advance by 16 elements (two groups) and repeat until rdi reaches zero.
 32759  	add	rsi, 16
 32760  	add	rdi, 2
 32761  	jne	.LBB4_627
        	# Continuation label is outside this chunk.
 32762  	jmp	.LBB4_1272
 32763  .LBB4_630:
        	# Vector path: float64 elements are read from [rcx] and 8-bit results are
        	# written to [r8]; each result is 0 where x == 0.0, otherwise -1 or 1
        	# according to the sign bit of x.
        	# rdx = eax rounded down to a multiple of 4 (eax is presumably the element
        	# count -- confirm at the branch into this label).
 32764  	mov	edx, eax
 32765  	and	edx, -4
        	# r9 = (rdx - 4) / 4 + 1 = number of 4-element groups.
 32766  	lea	rsi, [rdx - 4]
 32767  	mov	r9, rsi
 32768  	shr	r9, 2
 32769  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32770  	test	rsi, rsi
 32771  	je	.LBB4_1279
 32772  # %bb.631:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 32773  	mov	rdi, r9
 32774  	and	rdi, -2
 32775  	neg	rdi
 32776  	xor	esi, esi
        	# xmm2 = 0.0, xmm3 = sign-bit mask (-0.0), xmm4 = 1.0,
        	# xmm5 = pshufb control selecting source bytes 0 and 4.
 32777  	xorpd	xmm2, xmm2
 32778  	movapd	xmm3, xmmword ptr [rip + .LCPI4_0] # xmm3 = [-0.0E+0,-0.0E+0]
 32779  	movapd	xmm4, xmmword ptr [rip + .LCPI4_1] # xmm4 = [1.0E+0,1.0E+0]
 32780  	movdqa	xmm5, xmmword ptr [rip + .LCPI4_7] # xmm5 = <0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 32781  .LBB4_632:                              # =>This Inner Loop Header: Depth=1
 32782  	movupd	xmm6, xmmword ptr [rcx + 8*rsi]
 32783  	movupd	xmm7, xmmword ptr [rcx + 8*rsi + 16]
        	# xmm0/xmm1 = (x == 0.0) qword masks narrowed down to byte masks.
 32784  	movapd	xmm0, xmm6
 32785  	cmpeqpd	xmm0, xmm2
 32786  	packssdw	xmm0, xmm0
 32787  	packssdw	xmm0, xmm0
 32788  	packsswb	xmm0, xmm0
 32789  	movapd	xmm1, xmm7
 32790  	cmpeqpd	xmm1, xmm2
 32791  	packssdw	xmm1, xmm1
 32792  	packssdw	xmm1, xmm1
 32793  	packsswb	xmm1, xmm1
        	# (x & sign mask) | 1.0 gives +/-1.0; cvttpd2dq converts it to int32 and
        	# pshufb gathers the low byte of each int32 into bytes 0 and 1.
 32794  	andpd	xmm6, xmm3
 32795  	orpd	xmm6, xmm4
 32796  	andpd	xmm7, xmm3
 32797  	orpd	xmm7, xmm4
 32798  	cvttpd2dq	xmm6, xmm6
 32799  	pshufb	xmm6, xmm5
 32800  	cvttpd2dq	xmm7, xmm7
 32801  	pshufb	xmm7, xmm5
        	# Force 0 where x == 0.0 (pblendvb mask is implicitly xmm0), then store
        	# 2 result bytes per vector.
 32802  	pblendvb	xmm6, xmm2, xmm0
 32803  	movdqa	xmm0, xmm1
 32804  	pblendvb	xmm7, xmm2, xmm0
 32805  	pextrw	word ptr [r8 + rsi], xmm6, 0
 32806  	pextrw	word ptr [r8 + rsi + 2], xmm7, 0
        	# Second group: same computation on the next 4 elements.
 32807  	movupd	xmm6, xmmword ptr [rcx + 8*rsi + 32]
 32808  	movupd	xmm7, xmmword ptr [rcx + 8*rsi + 48]
 32809  	movapd	xmm0, xmm6
 32810  	cmpeqpd	xmm0, xmm2
 32811  	packssdw	xmm0, xmm0
 32812  	packssdw	xmm0, xmm0
 32813  	packsswb	xmm0, xmm0
 32814  	movapd	xmm1, xmm7
 32815  	cmpeqpd	xmm1, xmm2
 32816  	packssdw	xmm1, xmm1
 32817  	packssdw	xmm1, xmm1
 32818  	packsswb	xmm1, xmm1
 32819  	andpd	xmm6, xmm3
 32820  	orpd	xmm6, xmm4
 32821  	andpd	xmm7, xmm3
 32822  	orpd	xmm7, xmm4
 32823  	cvttpd2dq	xmm6, xmm6
 32824  	pshufb	xmm6, xmm5
 32825  	cvttpd2dq	xmm7, xmm7
 32826  	pshufb	xmm7, xmm5
 32827  	pblendvb	xmm6, xmm2, xmm0
 32828  	movdqa	xmm0, xmm1
 32829  	pblendvb	xmm7, xmm2, xmm0
 32830  	pextrw	word ptr [r8 + rsi + 4], xmm6, 0
 32831  	pextrw	word ptr [r8 + rsi + 6], xmm7, 0
        	# Advance by 8 elements (two groups) and repeat until rdi reaches zero.
 32832  	add	rsi, 8
 32833  	add	rdi, 2
 32834  	jne	.LBB4_632
        	# Continuation label is outside this chunk.
 32835  	jmp	.LBB4_1280
 32836  .LBB4_635:
        	# Vector path: signed 8-bit elements are read from [rcx] and 8-bit results
        	# are written to [r8]; each result is 1 if x > 0, -1 if x < 0, 0 if x == 0.
        	# rsi = r10d rounded down to a multiple of 32 (r10d is presumably the
        	# element count -- confirm at the branch into this label).
 32837  	mov	esi, r10d
 32838  	and	esi, -32
        	# r9 = (rsi - 32) / 32 + 1 = number of 32-element groups.
 32839  	lea	rax, [rsi - 32]
 32840  	mov	r9, rax
 32841  	shr	r9, 5
 32842  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32843  	test	rax, rax
 32844  	je	.LBB4_1288
 32845  # %bb.636:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rax = element index.
 32846  	mov	rdi, r9
 32847  	and	rdi, -2
 32848  	neg	rdi
 32849  	xor	eax, eax
        	# xmm2 = zero, xmm3 = all ones, xmm4 = byte 1s.
 32850  	pxor	xmm2, xmm2
 32851  	pcmpeqd	xmm3, xmm3
 32852  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_22] # xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 32853  .LBB4_637:                              # =>This Inner Loop Header: Depth=1
 32854  	movdqu	xmm5, xmmword ptr [rcx + rax]
 32855  	movdqu	xmm6, xmmword ptr [rcx + rax + 16]
        	# xmm0/xmm1 = (1 > x) masks, i.e. x <= 0; xmm5/xmm6 = (x != 0) as -1 or 0.
 32856  	movdqa	xmm0, xmm4
 32857  	pcmpgtb	xmm0, xmm5
 32858  	pcmpeqb	xmm5, xmm2
 32859  	pxor	xmm5, xmm3
 32860  	movdqa	xmm1, xmm4
 32861  	pcmpgtb	xmm1, xmm6
 32862  	pcmpeqb	xmm6, xmm2
 32863  	pxor	xmm6, xmm3
        	# Start from 1 in every byte and, where x <= 0, substitute the -1/0 value
        	# (pblendvb mask is implicitly xmm0).
 32864  	movdqa	xmm7, xmm4
 32865  	pblendvb	xmm7, xmm5, xmm0
 32866  	movdqa	xmm5, xmm4
 32867  	movdqa	xmm0, xmm1
 32868  	pblendvb	xmm5, xmm6, xmm0
 32869  	movdqu	xmmword ptr [r8 + rax], xmm7
 32870  	movdqu	xmmword ptr [r8 + rax + 16], xmm5
        	# Second group: same computation on the next 32 elements.
 32871  	movdqu	xmm5, xmmword ptr [rcx + rax + 32]
 32872  	movdqu	xmm6, xmmword ptr [rcx + rax + 48]
 32873  	movdqa	xmm0, xmm4
 32874  	pcmpgtb	xmm0, xmm5
 32875  	pcmpeqb	xmm5, xmm2
 32876  	pxor	xmm5, xmm3
 32877  	movdqa	xmm1, xmm4
 32878  	pcmpgtb	xmm1, xmm6
 32879  	pcmpeqb	xmm6, xmm2
 32880  	pxor	xmm6, xmm3
 32881  	movdqa	xmm7, xmm4
 32882  	pblendvb	xmm7, xmm5, xmm0
 32883  	movdqa	xmm5, xmm4
 32884  	movdqa	xmm0, xmm1
 32885  	pblendvb	xmm5, xmm6, xmm0
 32886  	movdqu	xmmword ptr [r8 + rax + 32], xmm7
 32887  	movdqu	xmmword ptr [r8 + rax + 48], xmm5
        	# Advance by 64 elements (two groups) and repeat until rdi reaches zero.
 32888  	add	rax, 64
 32889  	add	rdi, 2
 32890  	jne	.LBB4_637
        	# Continuation label is outside this chunk.
 32891  	jmp	.LBB4_1289
 32892  .LBB4_640:
        	# Vector path: 64-bit elements are read from [rcx] and 8-bit results are
        	# written to [r8]; each result is 1 if the input element is nonzero, else 0.
        	# rdx = eax rounded down to a multiple of 4 (eax is presumably the element
        	# count -- confirm at the branch into this label).
 32893  	mov	edx, eax
 32894  	and	edx, -4
        	# r9 = (rdx - 4) / 4 + 1 = number of 4-element groups.
 32895  	lea	rsi, [rdx - 4]
 32896  	mov	r9, rsi
 32897  	shr	r9, 2
 32898  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32899  	test	rsi, rsi
 32900  	je	.LBB4_1297
 32901  # %bb.641:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 32902  	mov	rdi, r9
 32903  	and	rdi, -2
 32904  	neg	rdi
 32905  	xor	esi, esi
        	# xmm0 = zero, xmm1 = all ones, xmm2 = byte 1s in the two low bytes.
 32906  	pxor	xmm0, xmm0
 32907  	pcmpeqd	xmm1, xmm1
 32908  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_18] # xmm2 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 32909  .LBB4_642:                              # =>This Inner Loop Header: Depth=1
        	# First group: build the (x != 0) qword mask, narrow it to bytes with
        	# packssdw x2 + packsswb, reduce to 0/1 with pand, store 2 bytes per vector.
 32910  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi]
 32911  	movdqu	xmm4, xmmword ptr [rcx + 8*rsi + 16]
 32912  	pcmpeqq	xmm3, xmm0
 32913  	pxor	xmm3, xmm1
 32914  	packssdw	xmm3, xmm3
 32915  	packssdw	xmm3, xmm3
 32916  	packsswb	xmm3, xmm3
 32917  	pand	xmm3, xmm2
 32918  	pcmpeqq	xmm4, xmm0
 32919  	pxor	xmm4, xmm1
 32920  	packssdw	xmm4, xmm4
 32921  	packssdw	xmm4, xmm4
 32922  	packsswb	xmm4, xmm4
 32923  	pextrw	word ptr [r8 + rsi], xmm3, 0
 32924  	pand	xmm4, xmm2
 32925  	pextrw	word ptr [r8 + rsi + 2], xmm4, 0
        	# Second group: same computation on the next 4 elements.
 32926  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 32]
 32927  	movdqu	xmm4, xmmword ptr [rcx + 8*rsi + 48]
 32928  	pcmpeqq	xmm3, xmm0
 32929  	pxor	xmm3, xmm1
 32930  	packssdw	xmm3, xmm3
 32931  	packssdw	xmm3, xmm3
 32932  	packsswb	xmm3, xmm3
 32933  	pand	xmm3, xmm2
 32934  	pcmpeqq	xmm4, xmm0
 32935  	pxor	xmm4, xmm1
 32936  	packssdw	xmm4, xmm4
 32937  	packssdw	xmm4, xmm4
 32938  	packsswb	xmm4, xmm4
 32939  	pextrw	word ptr [r8 + rsi + 4], xmm3, 0
 32940  	pand	xmm4, xmm2
 32941  	pextrw	word ptr [r8 + rsi + 6], xmm4, 0
        	# Advance by 8 elements (two groups) and repeat until rdi reaches zero.
 32942  	add	rsi, 8
 32943  	add	rdi, 2
 32944  	jne	.LBB4_642
        	# Continuation label is outside this chunk.
 32945  	jmp	.LBB4_1298
 32946  .LBB4_645:
        	# Vector path: 16-bit elements are read from [rcx] and 8-bit results are
        	# written to [r8]; each result is 1 if the input element is nonzero, else 0.
        	# rdx = eax rounded down to a multiple of 16 (eax is presumably the element
        	# count -- confirm at the branch into this label).
 32947  	mov	edx, eax
 32948  	and	edx, -16
        	# r9 = (rdx - 16) / 16 + 1 = number of 16-element groups.
 32949  	lea	rsi, [rdx - 16]
 32950  	mov	r9, rsi
 32951  	shr	r9, 4
 32952  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32953  	test	rsi, rsi
 32954  	je	.LBB4_1305
 32955  # %bb.646:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 32956  	mov	rdi, r9
 32957  	and	rdi, -2
 32958  	neg	rdi
 32959  	xor	esi, esi
        	# xmm0 = zero, xmm1 = all ones, xmm2 = byte 1s in the eight low bytes.
 32960  	pxor	xmm0, xmm0
 32961  	pcmpeqd	xmm1, xmm1
 32962  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_21] # xmm2 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u>
 32963  .LBB4_647:                              # =>This Inner Loop Header: Depth=1
        	# First group: build the (x != 0) word mask, narrow it to bytes with
        	# packsswb, reduce to 0/1 with pand, then join the two 8-byte halves
        	# and store 16 result bytes.
 32964  	movdqu	xmm3, xmmword ptr [rcx + 2*rsi]
 32965  	movdqu	xmm4, xmmword ptr [rcx + 2*rsi + 16]
 32966  	pcmpeqw	xmm3, xmm0
 32967  	pxor	xmm3, xmm1
 32968  	packsswb	xmm3, xmm3
 32969  	pand	xmm3, xmm2
 32970  	pcmpeqw	xmm4, xmm0
 32971  	pxor	xmm4, xmm1
 32972  	packsswb	xmm4, xmm4
 32973  	pand	xmm4, xmm2
 32974  	punpcklqdq	xmm3, xmm4              # xmm3 = xmm3[0],xmm4[0]
 32975  	movdqu	xmmword ptr [r8 + rsi], xmm3
        	# Second group: same computation on the next 16 elements.
 32976  	movdqu	xmm3, xmmword ptr [rcx + 2*rsi + 32]
 32977  	movdqu	xmm4, xmmword ptr [rcx + 2*rsi + 48]
 32978  	pcmpeqw	xmm3, xmm0
 32979  	pxor	xmm3, xmm1
 32980  	packsswb	xmm3, xmm3
 32981  	pand	xmm3, xmm2
 32982  	pcmpeqw	xmm4, xmm0
 32983  	pxor	xmm4, xmm1
 32984  	packsswb	xmm4, xmm4
 32985  	pand	xmm4, xmm2
 32986  	punpcklqdq	xmm3, xmm4              # xmm3 = xmm3[0],xmm4[0]
 32987  	movdqu	xmmword ptr [r8 + rsi + 16], xmm3
        	# Advance by 32 elements (two groups) and repeat until rdi reaches zero.
 32988  	add	rsi, 32
 32989  	add	rdi, 2
 32990  	jne	.LBB4_647
        	# Continuation label is outside this chunk.
 32991  	jmp	.LBB4_1306
 32992  .LBB4_650:
        	# Vector path: signed 16-bit elements are read from [rcx] and 8-bit results
        	# are written to [r8]; each result is 1 if x > 0, -1 if x < 0, 0 if x == 0.
        	# rsi = r10d rounded down to a multiple of 16 (r10d is presumably the
        	# element count -- confirm at the branch into this label).
 32993  	mov	esi, r10d
 32994  	and	esi, -16
        	# r9 = (rsi - 16) / 16 + 1 = number of 16-element groups.
 32995  	lea	rax, [rsi - 16]
 32996  	mov	r9, rax
 32997  	shr	r9, 4
 32998  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 32999  	test	rax, rax
 33000  	je	.LBB4_1313
 33001  # %bb.651:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rax = element index.
 33002  	mov	rdi, r9
 33003  	and	rdi, -2
 33004  	neg	rdi
 33005  	xor	eax, eax
        	# xmm2 = zero, xmm3 = all ones, xmm4 = byte 1s in the eight low bytes.
 33006  	pxor	xmm2, xmm2
 33007  	pcmpeqd	xmm3, xmm3
 33008  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_21] # xmm4 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u>
 33009  .LBB4_652:                              # =>This Inner Loop Header: Depth=1
 33010  	movdqu	xmm5, xmmword ptr [rcx + 2*rax]
 33011  	movdqu	xmm6, xmmword ptr [rcx + 2*rax + 16]
        	# xmm0/xmm1 = (x > 0) word masks narrowed to bytes.
 33012  	movdqa	xmm0, xmm5
 33013  	pcmpgtw	xmm0, xmm2
 33014  	packsswb	xmm0, xmm0
 33015  	movdqa	xmm1, xmm6
 33016  	pcmpgtw	xmm1, xmm2
 33017  	packsswb	xmm1, xmm1
        	# xmm5/xmm6 = (x != 0) masks narrowed to bytes, i.e. -1 or 0.
 33018  	pcmpeqw	xmm5, xmm2
 33019  	pxor	xmm5, xmm3
 33020  	packsswb	xmm5, xmm5
 33021  	pcmpeqw	xmm6, xmm2
 33022  	pxor	xmm6, xmm3
 33023  	packsswb	xmm6, xmm6
        	# Replace bytes where x > 0 with 1 (pblendvb mask is implicitly xmm0),
        	# then join the two 8-byte halves and store 16 result bytes.
 33024  	pblendvb	xmm5, xmm4, xmm0
 33025  	movdqa	xmm0, xmm1
 33026  	pblendvb	xmm6, xmm4, xmm0
 33027  	punpcklqdq	xmm5, xmm6              # xmm5 = xmm5[0],xmm6[0]
 33028  	movdqu	xmmword ptr [r8 + rax], xmm5
        	# Second group: same computation on the next 16 elements.
 33029  	movdqu	xmm5, xmmword ptr [rcx + 2*rax + 32]
 33030  	movdqu	xmm6, xmmword ptr [rcx + 2*rax + 48]
 33031  	movdqa	xmm0, xmm5
 33032  	pcmpgtw	xmm0, xmm2
 33033  	packsswb	xmm0, xmm0
 33034  	movdqa	xmm1, xmm6
 33035  	pcmpgtw	xmm1, xmm2
 33036  	packsswb	xmm1, xmm1
 33037  	pcmpeqw	xmm5, xmm2
 33038  	pxor	xmm5, xmm3
 33039  	packsswb	xmm5, xmm5
 33040  	pcmpeqw	xmm6, xmm2
 33041  	pxor	xmm6, xmm3
 33042  	packsswb	xmm6, xmm6
 33043  	pblendvb	xmm5, xmm4, xmm0
 33044  	movdqa	xmm0, xmm1
 33045  	pblendvb	xmm6, xmm4, xmm0
 33046  	punpcklqdq	xmm5, xmm6              # xmm5 = xmm5[0],xmm6[0]
 33047  	movdqu	xmmword ptr [r8 + rax + 16], xmm5
        	# Advance by 32 elements (two groups) and repeat until rdi reaches zero.
 33048  	add	rax, 32
 33049  	add	rdi, 2
 33050  	jne	.LBB4_652
        	# Continuation label is outside this chunk.
 33051  	jmp	.LBB4_1314
 33052  .LBB4_655:
        	# Vector path: signed 64-bit elements are read from [rcx] and 8-bit results
        	# are written to [r8]; each result is 1 if x > 0, -1 if x < 0, 0 if x == 0.
        	# rsi = r10d rounded down to a multiple of 4 (r10d is presumably the
        	# element count -- confirm at the branch into this label).
 33053  	mov	esi, r10d
 33054  	and	esi, -4
        	# r9 = (rsi - 4) / 4 + 1 = number of 4-element groups.
 33055  	lea	rax, [rsi - 4]
 33056  	mov	r9, rax
 33057  	shr	r9, 2
 33058  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 33059  	test	rax, rax
 33060  	je	.LBB4_1322
 33061  # %bb.656:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rax = element index.
 33062  	mov	rdi, r9
 33063  	and	rdi, -2
 33064  	neg	rdi
 33065  	xor	eax, eax
        	# xmm2 = zero, xmm3 = all ones, xmm4 = byte 1s in the two low bytes.
 33066  	pxor	xmm2, xmm2
 33067  	pcmpeqd	xmm3, xmm3
 33068  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_18] # xmm4 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 33069  .LBB4_657:                              # =>This Inner Loop Header: Depth=1
 33070  	movdqu	xmm5, xmmword ptr [rcx + 8*rax]
 33071  	movdqu	xmm6, xmmword ptr [rcx + 8*rax + 16]
        	# xmm0/xmm1 = (x > 0) qword masks narrowed to bytes (packssdw x2, packsswb).
 33072  	movdqa	xmm0, xmm5
 33073  	pcmpgtq	xmm0, xmm2
 33074  	packssdw	xmm0, xmm0
 33075  	packssdw	xmm0, xmm0
 33076  	packsswb	xmm0, xmm0
 33077  	movdqa	xmm1, xmm6
 33078  	pcmpgtq	xmm1, xmm2
 33079  	packssdw	xmm1, xmm1
 33080  	packssdw	xmm1, xmm1
 33081  	packsswb	xmm1, xmm1
        	# xmm5/xmm6 = (x != 0) masks narrowed to bytes, i.e. -1 or 0.
 33082  	pcmpeqq	xmm5, xmm2
 33083  	pxor	xmm5, xmm3
 33084  	packssdw	xmm5, xmm5
 33085  	packssdw	xmm5, xmm5
 33086  	packsswb	xmm5, xmm5
 33087  	pcmpeqq	xmm6, xmm2
 33088  	pxor	xmm6, xmm3
 33089  	packssdw	xmm6, xmm6
 33090  	packssdw	xmm6, xmm6
 33091  	packsswb	xmm6, xmm6
        	# Replace bytes where x > 0 with 1 (pblendvb mask is implicitly xmm0),
        	# then store 2 result bytes per vector.
 33092  	pblendvb	xmm5, xmm4, xmm0
 33093  	movdqa	xmm0, xmm1
 33094  	pblendvb	xmm6, xmm4, xmm0
 33095  	pextrw	word ptr [r8 + rax], xmm5, 0
 33096  	pextrw	word ptr [r8 + rax + 2], xmm6, 0
        	# Second group: same computation on the next 4 elements.
 33097  	movdqu	xmm5, xmmword ptr [rcx + 8*rax + 32]
 33098  	movdqu	xmm6, xmmword ptr [rcx + 8*rax + 48]
 33099  	movdqa	xmm0, xmm5
 33100  	pcmpgtq	xmm0, xmm2
 33101  	packssdw	xmm0, xmm0
 33102  	packssdw	xmm0, xmm0
 33103  	packsswb	xmm0, xmm0
 33104  	movdqa	xmm1, xmm6
 33105  	pcmpgtq	xmm1, xmm2
 33106  	packssdw	xmm1, xmm1
 33107  	packssdw	xmm1, xmm1
 33108  	packsswb	xmm1, xmm1
 33109  	pcmpeqq	xmm5, xmm2
 33110  	pxor	xmm5, xmm3
 33111  	packssdw	xmm5, xmm5
 33112  	packssdw	xmm5, xmm5
 33113  	packsswb	xmm5, xmm5
 33114  	pcmpeqq	xmm6, xmm2
 33115  	pxor	xmm6, xmm3
 33116  	packssdw	xmm6, xmm6
 33117  	packssdw	xmm6, xmm6
 33118  	packsswb	xmm6, xmm6
 33119  	pblendvb	xmm5, xmm4, xmm0
 33120  	movdqa	xmm0, xmm1
 33121  	pblendvb	xmm6, xmm4, xmm0
 33122  	pextrw	word ptr [r8 + rax + 4], xmm5, 0
 33123  	pextrw	word ptr [r8 + rax + 6], xmm6, 0
        	# Advance by 8 elements (two groups) and repeat until rdi reaches zero.
 33124  	add	rax, 8
 33125  	add	rdi, 2
 33126  	jne	.LBB4_657
        	# Continuation label is outside this chunk.
 33127  	jmp	.LBB4_1323
 33128  .LBB4_660:
        	# Vector path: float32 elements are read from [rcx] and 8-bit results are
        	# written to [r8]; each result is 0 where x == 0.0, otherwise 1 when the
        	# sign bit of x is clear and -1 when it is set.
        	# rdx = r10d rounded down to a multiple of 8 (r10d is presumably the element
        	# count -- confirm at the branch into this label).
 33129  	mov	edx, r10d
 33130  	and	edx, -8
        	# r9 = (rdx - 8) / 8 + 1 = number of 8-element groups.
 33131  	lea	rsi, [rdx - 8]
 33132  	mov	r9, rsi
 33133  	shr	r9, 3
 33134  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 33135  	test	rsi, rsi
 33136  	je	.LBB4_1331
 33137  # %bb.661:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 33138  	mov	rdi, r9
 33139  	and	rdi, -2
 33140  	neg	rdi
 33141  	xor	esi, esi
        	# xmm4 = zero, xmm8 = all ones (-1 per dword), xmm6 = byte 1s in the
        	# four low bytes.
 33142  	xorps	xmm4, xmm4
 33143  	pcmpeqd	xmm8, xmm8
 33144  	movdqa	xmm6, xmmword ptr [rip + .LCPI4_12] # xmm6 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u>
 33145  .LBB4_662:                              # =>This Inner Loop Header: Depth=1
 33146  	movups	xmm0, xmmword ptr [rcx + 4*rsi]
 33147  	movups	xmm1, xmmword ptr [rcx + 4*rsi + 16]
        	# xmm2/xmm3 = (x == 0.0) dword masks narrowed to bytes.
 33148  	movaps	xmm2, xmm0
 33149  	cmpeqps	xmm2, xmm4
 33150  	packssdw	xmm2, xmm2
 33151  	packsswb	xmm2, xmm2
 33152  	movaps	xmm3, xmm1
 33153  	cmpeqps	xmm3, xmm4
 33154  	packssdw	xmm3, xmm3
 33155  	packsswb	xmm3, xmm3
        	# Integer compare of the raw bits against -1: true when the sign bit is
        	# clear. The masks are narrowed to bytes.
 33156  	pcmpgtd	xmm0, xmm8
 33157  	packssdw	xmm0, xmm0
 33158  	packsswb	xmm0, xmm0
 33159  	pcmpgtd	xmm1, xmm8
 33160  	packssdw	xmm1, xmm1
 33161  	packsswb	xmm1, xmm1
        	# Start from -1 in every byte, select 1 where the sign bit is clear, then
        	# select 0 where x == 0.0 (pblendvb mask is implicitly xmm0 each time).
 33162  	pcmpeqd	xmm7, xmm7
 33163  	pblendvb	xmm7, xmm6, xmm0
 33164  	pcmpeqd	xmm5, xmm5
 33165  	movdqa	xmm0, xmm1
 33166  	pblendvb	xmm5, xmm6, xmm0
 33167  	movdqa	xmm0, xmm2
 33168  	pblendvb	xmm7, xmm4, xmm0
 33169  	movdqa	xmm0, xmm3
 33170  	pblendvb	xmm5, xmm4, xmm0
 33171  	movd	dword ptr [r8 + rsi], xmm7
 33172  	movd	dword ptr [r8 + rsi + 4], xmm5
        	# Second group: same computation on the next 8 elements (the roles of
        	# xmm5 and xmm7 are swapped relative to the first group).
 33173  	movups	xmm0, xmmword ptr [rcx + 4*rsi + 32]
 33174  	movups	xmm1, xmmword ptr [rcx + 4*rsi + 48]
 33175  	movaps	xmm2, xmm0
 33176  	cmpeqps	xmm2, xmm4
 33177  	packssdw	xmm2, xmm2
 33178  	packsswb	xmm2, xmm2
 33179  	movaps	xmm3, xmm1
 33180  	cmpeqps	xmm3, xmm4
 33181  	packssdw	xmm3, xmm3
 33182  	packsswb	xmm3, xmm3
 33183  	pcmpgtd	xmm0, xmm8
 33184  	packssdw	xmm0, xmm0
 33185  	packsswb	xmm0, xmm0
 33186  	pcmpgtd	xmm1, xmm8
 33187  	packssdw	xmm1, xmm1
 33188  	pcmpeqd	xmm5, xmm5
 33189  	pblendvb	xmm5, xmm6, xmm0
 33190  	packsswb	xmm1, xmm1
 33191  	pcmpeqd	xmm7, xmm7
 33192  	movdqa	xmm0, xmm1
 33193  	pblendvb	xmm7, xmm6, xmm0
 33194  	movdqa	xmm0, xmm2
 33195  	pblendvb	xmm5, xmm4, xmm0
 33196  	movdqa	xmm0, xmm3
 33197  	pblendvb	xmm7, xmm4, xmm0
 33198  	movd	dword ptr [r8 + rsi + 8], xmm5
 33199  	movd	dword ptr [r8 + rsi + 12], xmm7
        	# Advance by 16 elements (two groups) and repeat until rdi reaches zero.
 33200  	add	rsi, 16
 33201  	add	rdi, 2
 33202  	jne	.LBB4_662
        	# Continuation label is outside this chunk.
 33203  	jmp	.LBB4_1332
 33204  .LBB4_665:
        	# Vector path: 8-bit elements are read from [rcx] and 8-bit results are
        	# written to [r8]; each result is 1 if the input element is nonzero, else 0.
        	# rdx = eax rounded down to a multiple of 32 (eax is presumably the element
        	# count -- confirm at the branch into this label).
 33205  	mov	edx, eax
 33206  	and	edx, -32
        	# r9 = (rdx - 32) / 32 + 1 = number of 32-element groups.
 33207  	lea	rsi, [rdx - 32]
 33208  	mov	r9, rsi
 33209  	shr	r9, 5
 33210  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 33211  	test	rsi, rsi
 33212  	je	.LBB4_1340
 33213  # %bb.666:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rsi = element index.
 33214  	mov	rdi, r9
 33215  	and	rdi, -2
 33216  	neg	rdi
 33217  	xor	esi, esi
        	# xmm0 = zero, xmm1 = byte 1s.
 33218  	pxor	xmm0, xmm0
 33219  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_22] # xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 33220  .LBB4_667:                              # =>This Inner Loop Header: Depth=1
        	# First group: pcmpeqb builds the (x == 0) mask, pandn computes
        	# ~mask & 1, leaving 1 in nonzero bytes and 0 elsewhere.
 33221  	movdqu	xmm2, xmmword ptr [rcx + rsi]
 33222  	movdqu	xmm3, xmmword ptr [rcx + rsi + 16]
 33223  	pcmpeqb	xmm2, xmm0
 33224  	pandn	xmm2, xmm1
 33225  	pcmpeqb	xmm3, xmm0
 33226  	pandn	xmm3, xmm1
 33227  	movdqu	xmmword ptr [r8 + rsi], xmm2
 33228  	movdqu	xmmword ptr [r8 + rsi + 16], xmm3
        	# Second group: same computation on the next 32 elements.
 33229  	movdqu	xmm2, xmmword ptr [rcx + rsi + 32]
 33230  	movdqu	xmm3, xmmword ptr [rcx + rsi + 48]
 33231  	pcmpeqb	xmm2, xmm0
 33232  	pandn	xmm2, xmm1
 33233  	pcmpeqb	xmm3, xmm0
 33234  	pandn	xmm3, xmm1
 33235  	movdqu	xmmword ptr [r8 + rsi + 32], xmm2
 33236  	movdqu	xmmword ptr [r8 + rsi + 48], xmm3
        	# Advance by 64 elements (two groups) and repeat until rdi reaches zero.
 33237  	add	rsi, 64
 33238  	add	rdi, 2
 33239  	jne	.LBB4_667
        	# Continuation label is outside this chunk.
 33240  	jmp	.LBB4_1341
 33241  .LBB4_670:
        	# Vector path: signed 32-bit elements are read from [rcx] and 8-bit results
        	# are written to [r8]; each result is 1 if x > 0, -1 if x < 0, 0 if x == 0.
        	# rsi = r10d rounded down to a multiple of 8 (r10d is presumably the
        	# element count -- confirm at the branch into this label).
 33242  	mov	esi, r10d
 33243  	and	esi, -8
        	# r9 = (rsi - 8) / 8 + 1 = number of 8-element groups.
 33244  	lea	rax, [rsi - 8]
 33245  	mov	r9, rax
 33246  	shr	r9, 3
 33247  	add	r9, 1
        	# Exactly one group: bypass the unrolled loop (target is outside this chunk).
 33248  	test	rax, rax
 33249  	je	.LBB4_1348
 33250  # %bb.671:
        	# rdi = -(r9 & ~1): negated group count for the 2x-unrolled loop.
        	# rax = element index.
 33251  	mov	rdi, r9
 33252  	and	rdi, -2
 33253  	neg	rdi
 33254  	xor	eax, eax
        	# xmm2 = zero, xmm3 = all ones, xmm4 = byte 1s in the four low bytes.
 33255  	pxor	xmm2, xmm2
 33256  	pcmpeqd	xmm3, xmm3
 33257  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_12] # xmm4 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u>
 33258  .LBB4_672:                              # =>This Inner Loop Header: Depth=1
 33259  	movdqu	xmm5, xmmword ptr [rcx + 4*rax]
 33260  	movdqu	xmm6, xmmword ptr [rcx + 4*rax + 16]
        	# xmm0/xmm1 = (x > 0) dword masks narrowed to bytes.
 33261  	movdqa	xmm0, xmm5
 33262  	pcmpgtd	xmm0, xmm2
 33263  	packssdw	xmm0, xmm0
 33264  	packsswb	xmm0, xmm0
 33265  	movdqa	xmm1, xmm6
 33266  	pcmpgtd	xmm1, xmm2
 33267  	packssdw	xmm1, xmm1
 33268  	packsswb	xmm1, xmm1
        	# xmm5/xmm6 = (x != 0) masks narrowed to bytes, i.e. -1 or 0.
 33269  	pcmpeqd	xmm5, xmm2
 33270  	pxor	xmm5, xmm3
 33271  	packssdw	xmm5, xmm5
 33272  	packsswb	xmm5, xmm5
 33273  	pcmpeqd	xmm6, xmm2
 33274  	pxor	xmm6, xmm3
 33275  	packssdw	xmm6, xmm6
 33276  	packsswb	xmm6, xmm6
        	# Replace bytes where x > 0 with 1 (pblendvb mask is implicitly xmm0),
        	# then store 4 result bytes per vector.
 33277  	pblendvb	xmm5, xmm4, xmm0
 33278  	movdqa	xmm0, xmm1
 33279  	pblendvb	xmm6, xmm4, xmm0
 33280  	movd	dword ptr [r8 + rax], xmm5
 33281  	movd	dword ptr [r8 + rax + 4], xmm6
        	# Second group: same computation on the next 8 elements.
 33282  	movdqu	xmm5, xmmword ptr [rcx + 4*rax + 32]
 33283  	movdqu	xmm6, xmmword ptr [rcx + 4*rax + 48]
 33284  	movdqa	xmm0, xmm5
 33285  	pcmpgtd	xmm0, xmm2
 33286  	packssdw	xmm0, xmm0
 33287  	packsswb	xmm0, xmm0
 33288  	movdqa	xmm1, xmm6
 33289  	pcmpgtd	xmm1, xmm2
 33290  	packssdw	xmm1, xmm1
 33291  	packsswb	xmm1, xmm1
 33292  	pcmpeqd	xmm5, xmm2
 33293  	pxor	xmm5, xmm3
 33294  	packssdw	xmm5, xmm5
 33295  	packsswb	xmm5, xmm5
 33296  	pcmpeqd	xmm6, xmm2
 33297  	pxor	xmm6, xmm3
 33298  	packssdw	xmm6, xmm6
 33299  	packsswb	xmm6, xmm6
 33300  	pblendvb	xmm5, xmm4, xmm0
 33301  	movdqa	xmm0, xmm1
 33302  	pblendvb	xmm6, xmm4, xmm0
 33303  	movd	dword ptr [r8 + rax + 8], xmm5
 33304  	movd	dword ptr [r8 + rax + 12], xmm6
        	# Advance by 16 elements (two groups) and repeat until rdi reaches zero.
 33305  	add	rax, 16
 33306  	add	rdi, 2
 33307  	jne	.LBB4_672
        	# Continuation label is outside this chunk.
 33308  	jmp	.LBB4_1349
 33309  .LBB4_681:                              # int8 -> int64 sign loop
        # For each signed byte read from [rcx + rsi] a qword is written to
        # [r8 + 8*rsi]: 1 if the byte is > 0, -1 if it is < 0, 0 if it is zero.
        # r10d (presumably the element count; set outside this view) is rounded
        # down to a multiple of 4 into edx. r9 = number of 4-element groups and
        # rdi = -(r9 & -2) drives the 2x-unrolled loop. With a single group
        # (rsi == 0) the loop is skipped through .LBB4_1357 (not visible here).
 33310  	mov	edx, r10d
 33311  	and	edx, -4
 33312  	lea	rsi, [rdx - 4]
 33313  	mov	r9, rsi
 33314  	shr	r9, 2
 33315  	add	r9, 1                           # r9 = (rdx - 4) / 4 + 1
 33316  	test	rsi, rsi
 33317  	je	.LBB4_1357
 33318  # %bb.682:
 33319  	mov	rdi, r9
 33320  	and	rdi, -2
 33321  	neg	rdi                             # negative count of group pairs
 33322  	xor	esi, esi                        # rsi = element index, starts at 0
 33323  	pxor	xmm2, xmm2                      # xmm2 = all zero
 33324  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 33325  	movapd	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 33326  .LBB4_683:                              # =>This Inner Loop Header: Depth=1
 33327  	movzx	eax, word ptr [rcx + rsi]       # two input bytes
 33328  	movd	xmm5, eax
 33329  	movzx	eax, word ptr [rcx + rsi + 2]   # next two input bytes
 33330  	movd	xmm6, eax
 33331  	movdqa	xmm0, xmm5
 33332  	pcmpgtb	xmm0, xmm2                      # byte mask: value > 0 (signed)
 33333  	pmovsxbq	xmm0, xmm0              # widen the two low mask bytes to qwords
 33334  	movdqa	xmm1, xmm6
 33335  	pcmpgtb	xmm1, xmm2
 33336  	pmovsxbq	xmm1, xmm1
 33337  	pcmpeqb	xmm5, xmm2
 33338  	pxor	xmm5, xmm3                      # -1 where byte != 0, 0 where byte == 0
 33339  	pmovsxbq	xmm5, xmm5              # sign-extend to qword -1 / 0
 33340  	pcmpeqb	xmm6, xmm2
 33341  	pxor	xmm6, xmm3
 33342  	pmovsxbq	xmm6, xmm6
 33343  	blendvpd	xmm5, xmm4, xmm0        # take 1 where value > 0, else keep -1 / 0
 33344  	movdqa	xmm0, xmm1                      # blendvpd reads its mask from xmm0
 33345  	blendvpd	xmm6, xmm4, xmm0
 33346  	movupd	xmmword ptr [r8 + 8*rsi], xmm5
 33347  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm6
 33348  	movzx	eax, word ptr [rcx + rsi + 4]   # second unrolled half
 33349  	movd	xmm5, eax
 33350  	movzx	eax, word ptr [rcx + rsi + 6]
 33351  	movd	xmm6, eax
 33352  	movdqa	xmm0, xmm5
 33353  	pcmpgtb	xmm0, xmm2
 33354  	pmovsxbq	xmm0, xmm0
 33355  	movdqa	xmm1, xmm6
 33356  	pcmpgtb	xmm1, xmm2
 33357  	pmovsxbq	xmm1, xmm1
 33358  	pcmpeqb	xmm5, xmm2
 33359  	pxor	xmm5, xmm3
 33360  	pmovsxbq	xmm5, xmm5
 33361  	pcmpeqb	xmm6, xmm2
 33362  	pxor	xmm6, xmm3
 33363  	pmovsxbq	xmm6, xmm6
 33364  	blendvpd	xmm5, xmm4, xmm0
 33365  	movdqa	xmm0, xmm1
 33366  	blendvpd	xmm6, xmm4, xmm0
 33367  	movupd	xmmword ptr [r8 + 8*rsi + 32], xmm5
 33368  	movupd	xmmword ptr [r8 + 8*rsi + 48], xmm6
 33369  	add	rsi, 8                          # 8 elements handled per iteration
 33370  	add	rdi, 2
 33371  	jne	.LBB4_683
 33372  	jmp	.LBB4_1358
 33373  .LBB4_686:                              # qword loop: out[i] = (in[i] != 0) ? 1 : 0
        # Reads 64-bit elements from [rcx + 8*rsi] and stores 64-bit results at
        # [r8 + 8*rsi]. r10d (presumably the element count; set outside this
        # view) is rounded down to a multiple of 4 into edx.
        # r9 = number of 4-element groups; rdi = -(r9 & -2) drives the
        # 2x-unrolled loop. With a single group (rsi == 0) the loop is skipped
        # through .LBB4_1366 (not visible here).
 33374  	mov	edx, r10d
 33375  	and	edx, -4
 33376  	lea	rsi, [rdx - 4]
 33377  	mov	r9, rsi
 33378  	shr	r9, 2
 33379  	add	r9, 1                           # r9 = (rdx - 4) / 4 + 1
 33380  	test	rsi, rsi
 33381  	je	.LBB4_1366
 33382  # %bb.687:
 33383  	mov	rdi, r9
 33384  	and	rdi, -2
 33385  	neg	rdi                             # negative count of group pairs
 33386  	xor	esi, esi                        # rsi = element index, starts at 0
 33387  	pxor	xmm0, xmm0                      # xmm0 = all zero, compare operand
 33388  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_15] # xmm1 = [1,1]
 33389  .LBB4_688:                              # =>This Inner Loop Header: Depth=1
 33390  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi]
 33391  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 16]
 33392  	pcmpeqq	xmm2, xmm0                      # all ones where the qword is zero
 33393  	pandn	xmm2, xmm1                      # (~mask) & 1 -> 1 where the qword is nonzero
 33394  	pcmpeqq	xmm3, xmm0
 33395  	pandn	xmm3, xmm1
 33396  	movdqu	xmmword ptr [r8 + 8*rsi], xmm2
 33397  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm3
 33398  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi + 32] # second unrolled half
 33399  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 48]
 33400  	pcmpeqq	xmm2, xmm0
 33401  	pandn	xmm2, xmm1
 33402  	pcmpeqq	xmm3, xmm0
 33403  	pandn	xmm3, xmm1
 33404  	movdqu	xmmword ptr [r8 + 8*rsi + 32], xmm2
 33405  	movdqu	xmmword ptr [r8 + 8*rsi + 48], xmm3
 33406  	add	rsi, 8                          # 8 elements handled per iteration
 33407  	add	rdi, 2
 33408  	jne	.LBB4_688
 33409  	jmp	.LBB4_1367
 33410  .LBB4_697:                              # int64 sign loop
        # For each signed qword read from [rcx + 8*rsi] a qword is written to
        # [r8 + 8*rsi]: 1 if the value is > 0, -1 if it is < 0, 0 if it is zero.
        # r11d (presumably the element count; set outside this view) is rounded
        # down to a multiple of 4 into edx. r9 = number of 4-element groups and
        # rdi = -(r9 & -2) drives the 2x-unrolled loop. With a single group
        # (rsi == 0) the loop is skipped through .LBB4_1374 (not visible here).
 33411  	mov	edx, r11d
 33412  	and	edx, -4
 33413  	lea	rsi, [rdx - 4]
 33414  	mov	r9, rsi
 33415  	shr	r9, 2
 33416  	add	r9, 1                           # r9 = (rdx - 4) / 4 + 1
 33417  	test	rsi, rsi
 33418  	je	.LBB4_1374
 33419  # %bb.698:
 33420  	mov	rdi, r9
 33421  	and	rdi, -2
 33422  	neg	rdi                             # negative count of group pairs
 33423  	xor	esi, esi                        # rsi = element index, starts at 0
 33424  	pxor	xmm2, xmm2                      # xmm2 = all zero
 33425  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 33426  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 33427  .LBB4_699:                              # =>This Inner Loop Header: Depth=1
 33428  	movdqu	xmm5, xmmword ptr [rcx + 8*rsi]
 33429  	movdqu	xmm6, xmmword ptr [rcx + 8*rsi + 16]
 33430  	movdqa	xmm0, xmm4
 33431  	pcmpgtq	xmm0, xmm5                      # mask: 1 > value, i.e. value <= 0 (signed)
 33432  	pcmpeqq	xmm5, xmm2
 33433  	pxor	xmm5, xmm3                      # -1 where value != 0, 0 where value == 0
 33434  	movdqa	xmm1, xmm4
 33435  	pcmpgtq	xmm1, xmm6
 33436  	pcmpeqq	xmm6, xmm2
 33437  	pxor	xmm6, xmm3
 33438  	movdqa	xmm7, xmm4                      # start from 1 (the value > 0 result)
 33439  	blendvpd	xmm7, xmm5, xmm0        # where value <= 0 take -1 / 0 instead
 33440  	movdqa	xmm5, xmm4
 33441  	movdqa	xmm0, xmm1                      # blendvpd reads its mask from xmm0
 33442  	blendvpd	xmm5, xmm6, xmm0
 33443  	movupd	xmmword ptr [r8 + 8*rsi], xmm7
 33444  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm5
 33445  	movdqu	xmm5, xmmword ptr [rcx + 8*rsi + 32] # second unrolled half
 33446  	movdqu	xmm6, xmmword ptr [rcx + 8*rsi + 48]
 33447  	movdqa	xmm0, xmm4
 33448  	pcmpgtq	xmm0, xmm5
 33449  	pcmpeqq	xmm5, xmm2
 33450  	pxor	xmm5, xmm3
 33451  	movdqa	xmm1, xmm4
 33452  	pcmpgtq	xmm1, xmm6
 33453  	pcmpeqq	xmm6, xmm2
 33454  	pxor	xmm6, xmm3
 33455  	movdqa	xmm7, xmm4
 33456  	blendvpd	xmm7, xmm5, xmm0
 33457  	movdqa	xmm5, xmm4
 33458  	movdqa	xmm0, xmm1
 33459  	blendvpd	xmm5, xmm6, xmm0
 33460  	movupd	xmmword ptr [r8 + 8*rsi + 32], xmm7
 33461  	movupd	xmmword ptr [r8 + 8*rsi + 48], xmm5
 33462  	add	rsi, 8                          # 8 elements handled per iteration
 33463  	add	rdi, 2
 33464  	jne	.LBB4_699
 33465  	jmp	.LBB4_1375
 33466  .LBB4_710:                              # byte -> qword loop: out[i] = (in[i] != 0) ? 1 : 0
        # Reads bytes from [rcx + rsi] and stores one 64-bit result per byte at
        # [r8 + 8*rsi]. r10d (presumably the element count; set outside this
        # view) is rounded down to a multiple of 4 into edx.
        # r9 = number of 4-element groups; rdi = -(r9 & -2) drives the
        # 2x-unrolled loop. With a single group (rsi == 0) the loop is skipped
        # through .LBB4_1383 (not visible here).
 33467  	mov	edx, r10d
 33468  	and	edx, -4
 33469  	lea	rsi, [rdx - 4]
 33470  	mov	r9, rsi
 33471  	shr	r9, 2
 33472  	add	r9, 1                           # r9 = (rdx - 4) / 4 + 1
 33473  	test	rsi, rsi
 33474  	je	.LBB4_1383
 33475  # %bb.711:
 33476  	mov	rdi, r9
 33477  	and	rdi, -2
 33478  	neg	rdi                             # negative count of group pairs
 33479  	xor	esi, esi                        # rsi = element index, starts at 0
 33480  	pxor	xmm0, xmm0                      # xmm0 = all zero, compare operand
 33481  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones (used to invert masks)
 33482  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_15] # xmm2 = [1,1]
 33483  .LBB4_712:                              # =>This Inner Loop Header: Depth=1
 33484  	movzx	eax, word ptr [rcx + rsi]       # two input bytes
 33485  	movd	xmm3, eax
 33486  	movzx	eax, word ptr [rcx + rsi + 2]   # next two input bytes
 33487  	movd	xmm4, eax
 33488  	pcmpeqb	xmm3, xmm0
 33489  	pxor	xmm3, xmm1                      # 0xFF where byte != 0, 0 where byte == 0
 33490  	pmovzxbq	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
 33491  	pand	xmm3, xmm2                      # keep only bit 0 -> 1 / 0 per qword
 33492  	pcmpeqb	xmm4, xmm0
 33493  	pxor	xmm4, xmm1
 33494  	pmovzxbq	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 33495  	pand	xmm4, xmm2
 33496  	movdqu	xmmword ptr [r8 + 8*rsi], xmm3
 33497  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm4
 33498  	movzx	eax, word ptr [rcx + rsi + 4]   # second unrolled half
 33499  	movd	xmm3, eax
 33500  	movzx	eax, word ptr [rcx + rsi + 6]
 33501  	movd	xmm4, eax
 33502  	pcmpeqb	xmm3, xmm0
 33503  	pxor	xmm3, xmm1
 33504  	pmovzxbq	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
 33505  	pand	xmm3, xmm2
 33506  	pcmpeqb	xmm4, xmm0
 33507  	pxor	xmm4, xmm1
 33508  	pmovzxbq	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 33509  	pand	xmm4, xmm2
 33510  	movdqu	xmmword ptr [r8 + 8*rsi + 32], xmm3
 33511  	movdqu	xmmword ptr [r8 + 8*rsi + 48], xmm4
 33512  	add	rsi, 8                          # 8 elements handled per iteration
 33513  	add	rdi, 2
 33514  	jne	.LBB4_712
 33515  	jmp	.LBB4_1384
 33516  .LBB4_730:                              # int8 -> int16 sign loop
        # For each signed byte read from [rcx + rsi] a 16-bit word is written to
        # [r8 + 2*rsi]: 1 if the byte is > 0, -1 if it is < 0, 0 if it is zero.
        # r10d (presumably the element count; set outside this view) is rounded
        # down to a multiple of 16 into edx. r9 = number of 16-element groups and
        # rdi = -(r9 & -2) drives the 2x-unrolled loop. With a single group
        # (rsi == 0) the loop is skipped through .LBB4_1391 (not visible here).
 33517  	mov	edx, r10d
 33518  	and	edx, -16
 33519  	lea	rsi, [rdx - 16]
 33520  	mov	r9, rsi
 33521  	shr	r9, 4
 33522  	add	r9, 1                           # r9 = (rdx - 16) / 16 + 1
 33523  	test	rsi, rsi
 33524  	je	.LBB4_1391
 33525  # %bb.731:
 33526  	mov	rdi, r9
 33527  	and	rdi, -2
 33528  	neg	rdi                             # negative count of group pairs
 33529  	xor	esi, esi                        # rsi = element index, starts at 0
 33530  	pxor	xmm2, xmm2                      # xmm2 = all zero
 33531  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 33532  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1]
 33533  .LBB4_732:                              # =>This Inner Loop Header: Depth=1
 33534  	movq	xmm5, qword ptr [rcx + rsi]     # xmm5 = mem[0],zero
 33535  	movq	xmm6, qword ptr [rcx + rsi + 8] # xmm6 = mem[0],zero
 33536  	movdqa	xmm0, xmm5
 33537  	pcmpgtb	xmm0, xmm2                      # byte mask: value > 0 (signed)
 33538  	pmovsxbw	xmm0, xmm0              # widen the eight low mask bytes to words
 33539  	movdqa	xmm1, xmm6
 33540  	pcmpgtb	xmm1, xmm2
 33541  	pmovsxbw	xmm1, xmm1
 33542  	pcmpeqb	xmm5, xmm2
 33543  	pxor	xmm5, xmm3                      # -1 where byte != 0, 0 where byte == 0
 33544  	pmovsxbw	xmm5, xmm5              # sign-extend to word -1 / 0
 33545  	pcmpeqb	xmm6, xmm2
 33546  	pxor	xmm6, xmm3
 33547  	pmovsxbw	xmm6, xmm6
 33548  	pblendvb	xmm5, xmm4, xmm0        # take 1 where value > 0, else keep -1 / 0
 33549  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 33550  	pblendvb	xmm6, xmm4, xmm0
 33551  	movdqu	xmmword ptr [r8 + 2*rsi], xmm5
 33552  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm6
 33553  	movq	xmm5, qword ptr [rcx + rsi + 16] # xmm5 = mem[0],zero (second unrolled half)
 33554  	movq	xmm6, qword ptr [rcx + rsi + 24] # xmm6 = mem[0],zero
 33555  	movdqa	xmm0, xmm5
 33556  	pcmpgtb	xmm0, xmm2
 33557  	pmovsxbw	xmm0, xmm0
 33558  	movdqa	xmm1, xmm6
 33559  	pcmpgtb	xmm1, xmm2
 33560  	pmovsxbw	xmm1, xmm1
 33561  	pcmpeqb	xmm5, xmm2
 33562  	pxor	xmm5, xmm3
 33563  	pmovsxbw	xmm5, xmm5
 33564  	pcmpeqb	xmm6, xmm2
 33565  	pxor	xmm6, xmm3
 33566  	pmovsxbw	xmm6, xmm6
 33567  	pblendvb	xmm5, xmm4, xmm0
 33568  	movdqa	xmm0, xmm1
 33569  	pblendvb	xmm6, xmm4, xmm0
 33570  	movdqu	xmmword ptr [r8 + 2*rsi + 32], xmm5
 33571  	movdqu	xmmword ptr [r8 + 2*rsi + 48], xmm6
 33572  	add	rsi, 32                         # 32 elements handled per iteration
 33573  	add	rdi, 2
 33574  	jne	.LBB4_732
 33575  	jmp	.LBB4_1392
 33576  .LBB4_735:                              # int8 -> int16 sign loop
        # For each signed byte read from [rcx + rsi] a 16-bit word is written to
        # [r8 + 2*rsi]: 1 if the byte is > 0, -1 if it is < 0, 0 if it is zero.
        # r10d (presumably the element count; set outside this view) is rounded
        # down to a multiple of 16 into edx. r9 = number of 16-element groups and
        # rdi = -(r9 & -2) drives the 2x-unrolled loop. With a single group
        # (rsi == 0) the loop is skipped through .LBB4_1400 (not visible here).
 33577  	mov	edx, r10d
 33578  	and	edx, -16
 33579  	lea	rsi, [rdx - 16]
 33580  	mov	r9, rsi
 33581  	shr	r9, 4
 33582  	add	r9, 1                           # r9 = (rdx - 16) / 16 + 1
 33583  	test	rsi, rsi
 33584  	je	.LBB4_1400
 33585  # %bb.736:
 33586  	mov	rdi, r9
 33587  	and	rdi, -2
 33588  	neg	rdi                             # negative count of group pairs
 33589  	xor	esi, esi                        # rsi = element index, starts at 0
 33590  	pxor	xmm2, xmm2                      # xmm2 = all zero
 33591  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 33592  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1]
 33593  .LBB4_737:                              # =>This Inner Loop Header: Depth=1
 33594  	movq	xmm5, qword ptr [rcx + rsi]     # xmm5 = mem[0],zero
 33595  	movq	xmm6, qword ptr [rcx + rsi + 8] # xmm6 = mem[0],zero
 33596  	movdqa	xmm0, xmm5
 33597  	pcmpgtb	xmm0, xmm2                      # byte mask: value > 0 (signed)
 33598  	pmovsxbw	xmm0, xmm0              # widen the eight low mask bytes to words
 33599  	movdqa	xmm1, xmm6
 33600  	pcmpgtb	xmm1, xmm2
 33601  	pmovsxbw	xmm1, xmm1
 33602  	pcmpeqb	xmm5, xmm2
 33603  	pxor	xmm5, xmm3                      # -1 where byte != 0, 0 where byte == 0
 33604  	pmovsxbw	xmm5, xmm5              # sign-extend to word -1 / 0
 33605  	pcmpeqb	xmm6, xmm2
 33606  	pxor	xmm6, xmm3
 33607  	pmovsxbw	xmm6, xmm6
 33608  	pblendvb	xmm5, xmm4, xmm0        # take 1 where value > 0, else keep -1 / 0
 33609  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 33610  	pblendvb	xmm6, xmm4, xmm0
 33611  	movdqu	xmmword ptr [r8 + 2*rsi], xmm5
 33612  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm6
 33613  	movq	xmm5, qword ptr [rcx + rsi + 16] # xmm5 = mem[0],zero (second unrolled half)
 33614  	movq	xmm6, qword ptr [rcx + rsi + 24] # xmm6 = mem[0],zero
 33615  	movdqa	xmm0, xmm5
 33616  	pcmpgtb	xmm0, xmm2
 33617  	pmovsxbw	xmm0, xmm0
 33618  	movdqa	xmm1, xmm6
 33619  	pcmpgtb	xmm1, xmm2
 33620  	pmovsxbw	xmm1, xmm1
 33621  	pcmpeqb	xmm5, xmm2
 33622  	pxor	xmm5, xmm3
 33623  	pmovsxbw	xmm5, xmm5
 33624  	pcmpeqb	xmm6, xmm2
 33625  	pxor	xmm6, xmm3
 33626  	pmovsxbw	xmm6, xmm6
 33627  	pblendvb	xmm5, xmm4, xmm0
 33628  	movdqa	xmm0, xmm1
 33629  	pblendvb	xmm6, xmm4, xmm0
 33630  	movdqu	xmmword ptr [r8 + 2*rsi + 32], xmm5
 33631  	movdqu	xmmword ptr [r8 + 2*rsi + 48], xmm6
 33632  	add	rsi, 32                         # 32 elements handled per iteration
 33633  	add	rdi, 2
 33634  	jne	.LBB4_737
 33635  	jmp	.LBB4_1401
 33636  .LBB4_746:                              # word loop: out[i] = (in[i] != 0) ? 1 : 0
        # Reads 16-bit elements from [rcx + 2*rsi] and stores 16-bit results at
        # [r8 + 2*rsi]. r10d (presumably the element count; set outside this
        # view) is rounded down to a multiple of 16 into edx.
        # r9 = number of 16-element groups; rdi = -(r9 & -2) drives the
        # 2x-unrolled loop. With a single group (rsi == 0) the loop is skipped
        # through .LBB4_1409 (not visible here).
 33637  	mov	edx, r10d
 33638  	and	edx, -16
 33639  	lea	rsi, [rdx - 16]
 33640  	mov	r9, rsi
 33641  	shr	r9, 4
 33642  	add	r9, 1                           # r9 = (rdx - 16) / 16 + 1
 33643  	test	rsi, rsi
 33644  	je	.LBB4_1409
 33645  # %bb.747:
 33646  	mov	rdi, r9
 33647  	and	rdi, -2
 33648  	neg	rdi                             # negative count of group pairs
 33649  	xor	esi, esi                        # rsi = element index, starts at 0
 33650  	pxor	xmm0, xmm0                      # xmm0 = all zero, compare operand
 33651  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_20] # xmm1 = [1,1,1,1,1,1,1,1]
 33652  .LBB4_748:                              # =>This Inner Loop Header: Depth=1
 33653  	movdqu	xmm2, xmmword ptr [rcx + 2*rsi]
 33654  	movdqu	xmm3, xmmword ptr [rcx + 2*rsi + 16]
 33655  	pcmpeqw	xmm2, xmm0                      # all ones where the word is zero
 33656  	pandn	xmm2, xmm1                      # (~mask) & 1 -> 1 where the word is nonzero
 33657  	pcmpeqw	xmm3, xmm0
 33658  	pandn	xmm3, xmm1
 33659  	movdqu	xmmword ptr [r8 + 2*rsi], xmm2
 33660  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm3
 33661  	movdqu	xmm2, xmmword ptr [rcx + 2*rsi + 32] # second unrolled half
 33662  	movdqu	xmm3, xmmword ptr [rcx + 2*rsi + 48]
 33663  	pcmpeqw	xmm2, xmm0
 33664  	pandn	xmm2, xmm1
 33665  	pcmpeqw	xmm3, xmm0
 33666  	pandn	xmm3, xmm1
 33667  	movdqu	xmmword ptr [r8 + 2*rsi + 32], xmm2
 33668  	movdqu	xmmword ptr [r8 + 2*rsi + 48], xmm3
 33669  	add	rsi, 32                         # 32 elements handled per iteration
 33670  	add	rdi, 2
 33671  	jne	.LBB4_748
 33672  	jmp	.LBB4_1410
 33673  .LBB4_751:                              # word loop: out[i] = (in[i] != 0) ? 1 : 0
        # Reads 16-bit elements from [rcx + 2*rsi] and stores 16-bit results at
        # [r8 + 2*rsi]. r10d (presumably the element count; set outside this
        # view) is rounded down to a multiple of 16 into edx.
        # r9 = number of 16-element groups; rdi = -(r9 & -2) drives the
        # 2x-unrolled loop. With a single group (rsi == 0) the loop is skipped
        # through .LBB4_1417 (not visible here).
 33674  	mov	edx, r10d
 33675  	and	edx, -16
 33676  	lea	rsi, [rdx - 16]
 33677  	mov	r9, rsi
 33678  	shr	r9, 4
 33679  	add	r9, 1                           # r9 = (rdx - 16) / 16 + 1
 33680  	test	rsi, rsi
 33681  	je	.LBB4_1417
 33682  # %bb.752:
 33683  	mov	rdi, r9
 33684  	and	rdi, -2
 33685  	neg	rdi                             # negative count of group pairs
 33686  	xor	esi, esi                        # rsi = element index, starts at 0
 33687  	pxor	xmm0, xmm0                      # xmm0 = all zero, compare operand
 33688  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_20] # xmm1 = [1,1,1,1,1,1,1,1]
 33689  .LBB4_753:                              # =>This Inner Loop Header: Depth=1
 33690  	movdqu	xmm2, xmmword ptr [rcx + 2*rsi]
 33691  	movdqu	xmm3, xmmword ptr [rcx + 2*rsi + 16]
 33692  	pcmpeqw	xmm2, xmm0                      # all ones where the word is zero
 33693  	pandn	xmm2, xmm1                      # (~mask) & 1 -> 1 where the word is nonzero
 33694  	pcmpeqw	xmm3, xmm0
 33695  	pandn	xmm3, xmm1
 33696  	movdqu	xmmword ptr [r8 + 2*rsi], xmm2
 33697  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm3
 33698  	movdqu	xmm2, xmmword ptr [rcx + 2*rsi + 32] # second unrolled half
 33699  	movdqu	xmm3, xmmword ptr [rcx + 2*rsi + 48]
 33700  	pcmpeqw	xmm2, xmm0
 33701  	pandn	xmm2, xmm1
 33702  	pcmpeqw	xmm3, xmm0
 33703  	pandn	xmm3, xmm1
 33704  	movdqu	xmmword ptr [r8 + 2*rsi + 32], xmm2
 33705  	movdqu	xmmword ptr [r8 + 2*rsi + 48], xmm3
 33706  	add	rsi, 32                         # 32 elements handled per iteration
 33707  	add	rdi, 2
 33708  	jne	.LBB4_753
 33709  	jmp	.LBB4_1418
 33710  .LBB4_756:                              # int16 sign loop
        # For each signed word read from [rcx + 2*rsi] a word is written to
        # [r8 + 2*rsi]: 1 if the value is > 0, -1 if it is < 0, 0 if it is zero.
        # r11d (presumably the element count; set outside this view) is rounded
        # down to a multiple of 16 into edx. r9 = number of 16-element groups and
        # rdi = -(r9 & -2) drives the 2x-unrolled loop. With a single group
        # (rsi == 0) the loop is skipped through .LBB4_1425 (not visible here).
 33711  	mov	edx, r11d
 33712  	and	edx, -16
 33713  	lea	rsi, [rdx - 16]
 33714  	mov	r9, rsi
 33715  	shr	r9, 4
 33716  	add	r9, 1                           # r9 = (rdx - 16) / 16 + 1
 33717  	test	rsi, rsi
 33718  	je	.LBB4_1425
 33719  # %bb.757:
 33720  	mov	rdi, r9
 33721  	and	rdi, -2
 33722  	neg	rdi                             # negative count of group pairs
 33723  	xor	esi, esi                        # rsi = element index, starts at 0
 33724  	pxor	xmm2, xmm2                      # xmm2 = all zero
 33725  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 33726  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1]
 33727  .LBB4_758:                              # =>This Inner Loop Header: Depth=1
 33728  	movdqu	xmm5, xmmword ptr [rcx + 2*rsi]
 33729  	movdqu	xmm6, xmmword ptr [rcx + 2*rsi + 16]
 33730  	movdqa	xmm0, xmm4
 33731  	pcmpgtw	xmm0, xmm5                      # mask: 1 > value, i.e. value <= 0 (signed)
 33732  	pcmpeqw	xmm5, xmm2
 33733  	pxor	xmm5, xmm3                      # -1 where value != 0, 0 where value == 0
 33734  	movdqa	xmm1, xmm4
 33735  	pcmpgtw	xmm1, xmm6
 33736  	pcmpeqw	xmm6, xmm2
 33737  	pxor	xmm6, xmm3
 33738  	movdqa	xmm7, xmm4                      # start from 1 (the value > 0 result)
 33739  	pblendvb	xmm7, xmm5, xmm0        # where value <= 0 take -1 / 0 instead
 33740  	movdqa	xmm5, xmm4
 33741  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 33742  	pblendvb	xmm5, xmm6, xmm0
 33743  	movdqu	xmmword ptr [r8 + 2*rsi], xmm7
 33744  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm5
 33745  	movdqu	xmm5, xmmword ptr [rcx + 2*rsi + 32] # second unrolled half
 33746  	movdqu	xmm6, xmmword ptr [rcx + 2*rsi + 48]
 33747  	movdqa	xmm0, xmm4
 33748  	pcmpgtw	xmm0, xmm5
 33749  	pcmpeqw	xmm5, xmm2
 33750  	pxor	xmm5, xmm3
 33751  	movdqa	xmm1, xmm4
 33752  	pcmpgtw	xmm1, xmm6
 33753  	pcmpeqw	xmm6, xmm2
 33754  	pxor	xmm6, xmm3
 33755  	movdqa	xmm7, xmm4
 33756  	pblendvb	xmm7, xmm5, xmm0
 33757  	movdqa	xmm5, xmm4
 33758  	movdqa	xmm0, xmm1
 33759  	pblendvb	xmm5, xmm6, xmm0
 33760  	movdqu	xmmword ptr [r8 + 2*rsi + 32], xmm7
 33761  	movdqu	xmmword ptr [r8 + 2*rsi + 48], xmm5
 33762  	add	rsi, 32                         # 32 elements handled per iteration
 33763  	add	rdi, 2
 33764  	jne	.LBB4_758
 33765  	jmp	.LBB4_1426
 33766  .LBB4_761:                              # int16 sign loop
        # For each signed word read from [rcx + 2*rsi] a word is written to
        # [r8 + 2*rsi]: 1 if the value is > 0, -1 if it is < 0, 0 if it is zero.
        # r11d (presumably the element count; set outside this view) is rounded
        # down to a multiple of 16 into edx. r9 = number of 16-element groups and
        # rdi = -(r9 & -2) drives the 2x-unrolled loop. With a single group
        # (rsi == 0) the loop is skipped through .LBB4_1434 (not visible here).
 33767  	mov	edx, r11d
 33768  	and	edx, -16
 33769  	lea	rsi, [rdx - 16]
 33770  	mov	r9, rsi
 33771  	shr	r9, 4
 33772  	add	r9, 1                           # r9 = (rdx - 16) / 16 + 1
 33773  	test	rsi, rsi
 33774  	je	.LBB4_1434
 33775  # %bb.762:
 33776  	mov	rdi, r9
 33777  	and	rdi, -2
 33778  	neg	rdi                             # negative count of group pairs
 33779  	xor	esi, esi                        # rsi = element index, starts at 0
 33780  	pxor	xmm2, xmm2                      # xmm2 = all zero
 33781  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 33782  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1]
 33783  .LBB4_763:                              # =>This Inner Loop Header: Depth=1
 33784  	movdqu	xmm5, xmmword ptr [rcx + 2*rsi]
 33785  	movdqu	xmm6, xmmword ptr [rcx + 2*rsi + 16]
 33786  	movdqa	xmm0, xmm4
 33787  	pcmpgtw	xmm0, xmm5                      # mask: 1 > value, i.e. value <= 0 (signed)
 33788  	pcmpeqw	xmm5, xmm2
 33789  	pxor	xmm5, xmm3                      # -1 where value != 0, 0 where value == 0
 33790  	movdqa	xmm1, xmm4
 33791  	pcmpgtw	xmm1, xmm6
 33792  	pcmpeqw	xmm6, xmm2
 33793  	pxor	xmm6, xmm3
 33794  	movdqa	xmm7, xmm4                      # start from 1 (the value > 0 result)
 33795  	pblendvb	xmm7, xmm5, xmm0        # where value <= 0 take -1 / 0 instead
 33796  	movdqa	xmm5, xmm4
 33797  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 33798  	pblendvb	xmm5, xmm6, xmm0
 33799  	movdqu	xmmword ptr [r8 + 2*rsi], xmm7
 33800  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm5
 33801  	movdqu	xmm5, xmmword ptr [rcx + 2*rsi + 32] # second unrolled half
 33802  	movdqu	xmm6, xmmword ptr [rcx + 2*rsi + 48]
 33803  	movdqa	xmm0, xmm4
 33804  	pcmpgtw	xmm0, xmm5
 33805  	pcmpeqw	xmm5, xmm2
 33806  	pxor	xmm5, xmm3
 33807  	movdqa	xmm1, xmm4
 33808  	pcmpgtw	xmm1, xmm6
 33809  	pcmpeqw	xmm6, xmm2
 33810  	pxor	xmm6, xmm3
 33811  	movdqa	xmm7, xmm4
 33812  	pblendvb	xmm7, xmm5, xmm0
 33813  	movdqa	xmm5, xmm4
 33814  	movdqa	xmm0, xmm1
 33815  	pblendvb	xmm5, xmm6, xmm0
 33816  	movdqu	xmmword ptr [r8 + 2*rsi + 32], xmm7
 33817  	movdqu	xmmword ptr [r8 + 2*rsi + 48], xmm5
 33818  	add	rsi, 32                         # 32 elements handled per iteration
 33819  	add	rdi, 2
 33820  	jne	.LBB4_763
 33821  	jmp	.LBB4_1435
 33822  .LBB4_778:                              # byte -> word loop: out[i] = (in[i] != 0) ? 1 : 0
        # Reads bytes from [rcx + rsi] and stores one 16-bit result per byte at
        # [r8 + 2*rsi]. r10d (presumably the element count; set outside this
        # view) is rounded down to a multiple of 16 into edx.
        # r9 = number of 16-element groups; rdi = -(r9 & -2) drives the
        # 2x-unrolled loop. With a single group (rsi == 0) the loop is skipped
        # through .LBB4_1443 (not visible here).
 33823  	mov	edx, r10d
 33824  	and	edx, -16
 33825  	lea	rsi, [rdx - 16]
 33826  	mov	r9, rsi
 33827  	shr	r9, 4
 33828  	add	r9, 1                           # r9 = (rdx - 16) / 16 + 1
 33829  	test	rsi, rsi
 33830  	je	.LBB4_1443
 33831  # %bb.779:
 33832  	mov	rdi, r9
 33833  	and	rdi, -2
 33834  	neg	rdi                             # negative count of group pairs
 33835  	xor	esi, esi                        # rsi = element index, starts at 0
 33836  	pxor	xmm0, xmm0                      # xmm0 = all zero, compare operand
 33837  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones (used to invert masks)
 33838  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_20] # xmm2 = [1,1,1,1,1,1,1,1]
 33839  .LBB4_780:                              # =>This Inner Loop Header: Depth=1
 33840  	movq	xmm3, qword ptr [rcx + rsi]     # xmm3 = mem[0],zero
 33841  	movq	xmm4, qword ptr [rcx + rsi + 8] # xmm4 = mem[0],zero
 33842  	pcmpeqb	xmm3, xmm0
 33843  	pxor	xmm3, xmm1                      # 0xFF where byte != 0, 0 where byte == 0
 33844  	pmovzxbw	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 33845  	pand	xmm3, xmm2                      # keep only bit 0 -> 1 / 0 per word
 33846  	pcmpeqb	xmm4, xmm0
 33847  	pxor	xmm4, xmm1
 33848  	pmovzxbw	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
 33849  	pand	xmm4, xmm2
 33850  	movdqu	xmmword ptr [r8 + 2*rsi], xmm3
 33851  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm4
 33852  	movq	xmm3, qword ptr [rcx + rsi + 16] # xmm3 = mem[0],zero (second unrolled half)
 33853  	movq	xmm4, qword ptr [rcx + rsi + 24] # xmm4 = mem[0],zero
 33854  	pcmpeqb	xmm3, xmm0
 33855  	pxor	xmm3, xmm1
 33856  	pmovzxbw	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 33857  	pand	xmm3, xmm2
 33858  	pcmpeqb	xmm4, xmm0
 33859  	pxor	xmm4, xmm1
 33860  	pmovzxbw	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
 33861  	pand	xmm4, xmm2
 33862  	movdqu	xmmword ptr [r8 + 2*rsi + 32], xmm3
 33863  	movdqu	xmmword ptr [r8 + 2*rsi + 48], xmm4
 33864  	add	rsi, 32                         # 32 elements handled per iteration
 33865  	add	rdi, 2
 33866  	jne	.LBB4_780
 33867  	jmp	.LBB4_1444
 33868  .LBB4_783:                              # byte -> word loop: out[i] = (in[i] != 0) ? 1 : 0
        # Reads bytes from [rcx + rsi] and stores one 16-bit result per byte at
        # [r8 + 2*rsi]. r10d (presumably the element count; set outside this
        # view) is rounded down to a multiple of 16 into edx.
        # r9 = number of 16-element groups; rdi = -(r9 & -2) drives the
        # 2x-unrolled loop. With a single group (rsi == 0) the loop is skipped
        # through .LBB4_1451 (not visible here).
 33869  	mov	edx, r10d
 33870  	and	edx, -16
 33871  	lea	rsi, [rdx - 16]
 33872  	mov	r9, rsi
 33873  	shr	r9, 4
 33874  	add	r9, 1                           # r9 = (rdx - 16) / 16 + 1
 33875  	test	rsi, rsi
 33876  	je	.LBB4_1451
 33877  # %bb.784:
 33878  	mov	rdi, r9
 33879  	and	rdi, -2
 33880  	neg	rdi                             # negative count of group pairs
 33881  	xor	esi, esi                        # rsi = element index, starts at 0
 33882  	pxor	xmm0, xmm0                      # xmm0 = all zero, compare operand
 33883  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones (used to invert masks)
 33884  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_20] # xmm2 = [1,1,1,1,1,1,1,1]
 33885  .LBB4_785:                              # =>This Inner Loop Header: Depth=1
 33886  	movq	xmm3, qword ptr [rcx + rsi]     # xmm3 = mem[0],zero
 33887  	movq	xmm4, qword ptr [rcx + rsi + 8] # xmm4 = mem[0],zero
 33888  	pcmpeqb	xmm3, xmm0
 33889  	pxor	xmm3, xmm1                      # 0xFF where byte != 0, 0 where byte == 0
 33890  	pmovzxbw	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 33891  	pand	xmm3, xmm2                      # keep only bit 0 -> 1 / 0 per word
 33892  	pcmpeqb	xmm4, xmm0
 33893  	pxor	xmm4, xmm1
 33894  	pmovzxbw	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
 33895  	pand	xmm4, xmm2
 33896  	movdqu	xmmword ptr [r8 + 2*rsi], xmm3
 33897  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm4
 33898  	movq	xmm3, qword ptr [rcx + rsi + 16] # xmm3 = mem[0],zero (second unrolled half)
 33899  	movq	xmm4, qword ptr [rcx + rsi + 24] # xmm4 = mem[0],zero
 33900  	pcmpeqb	xmm3, xmm0
 33901  	pxor	xmm3, xmm1
 33902  	pmovzxbw	xmm3, xmm3                      # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 33903  	pand	xmm3, xmm2
 33904  	pcmpeqb	xmm4, xmm0
 33905  	pxor	xmm4, xmm1
 33906  	pmovzxbw	xmm4, xmm4                      # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
 33907  	pand	xmm4, xmm2
 33908  	movdqu	xmmword ptr [r8 + 2*rsi + 32], xmm3
 33909  	movdqu	xmmword ptr [r8 + 2*rsi + 48], xmm4
 33910  	add	rsi, 32                         # 32 elements handled per iteration
 33911  	add	rdi, 2
 33912  	jne	.LBB4_785
 33913  	jmp	.LBB4_1452
 33914  .LBB4_806:                              # int8 -> int64 sign loop
        # For each signed byte read from [rcx + rsi] a qword is written to
        # [r8 + 8*rsi]: 1 if the byte is > 0, -1 if it is < 0, 0 if it is zero.
        # r10d (presumably the element count; set outside this view) is rounded
        # down to a multiple of 4 into edx. r9 = number of 4-element groups and
        # rdi = -(r9 & -2) drives the 2x-unrolled loop. With a single group
        # (rsi == 0) the loop is skipped through .LBB4_1459 (not visible here).
 33915  	mov	edx, r10d
 33916  	and	edx, -4
 33917  	lea	rsi, [rdx - 4]
 33918  	mov	r9, rsi
 33919  	shr	r9, 2
 33920  	add	r9, 1                           # r9 = (rdx - 4) / 4 + 1
 33921  	test	rsi, rsi
 33922  	je	.LBB4_1459
 33923  # %bb.807:
 33924  	mov	rdi, r9
 33925  	and	rdi, -2
 33926  	neg	rdi                             # negative count of group pairs
 33927  	xor	esi, esi                        # rsi = element index, starts at 0
 33928  	pxor	xmm2, xmm2                      # xmm2 = all zero
 33929  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 33930  	movapd	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 33931  .LBB4_808:                              # =>This Inner Loop Header: Depth=1
 33932  	movzx	eax, word ptr [rcx + rsi]       # two input bytes
 33933  	movd	xmm5, eax
 33934  	movzx	eax, word ptr [rcx + rsi + 2]   # next two input bytes
 33935  	movd	xmm6, eax
 33936  	movdqa	xmm0, xmm5
 33937  	pcmpgtb	xmm0, xmm2                      # byte mask: value > 0 (signed)
 33938  	pmovsxbq	xmm0, xmm0              # widen the two low mask bytes to qwords
 33939  	movdqa	xmm1, xmm6
 33940  	pcmpgtb	xmm1, xmm2
 33941  	pmovsxbq	xmm1, xmm1
 33942  	pcmpeqb	xmm5, xmm2
 33943  	pxor	xmm5, xmm3                      # -1 where byte != 0, 0 where byte == 0
 33944  	pmovsxbq	xmm5, xmm5              # sign-extend to qword -1 / 0
 33945  	pcmpeqb	xmm6, xmm2
 33946  	pxor	xmm6, xmm3
 33947  	pmovsxbq	xmm6, xmm6
 33948  	blendvpd	xmm5, xmm4, xmm0        # take 1 where value > 0, else keep -1 / 0
 33949  	movdqa	xmm0, xmm1                      # blendvpd reads its mask from xmm0
 33950  	blendvpd	xmm6, xmm4, xmm0
 33951  	movupd	xmmword ptr [r8 + 8*rsi], xmm5
 33952  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm6
 33953  	movzx	eax, word ptr [rcx + rsi + 4]   # second unrolled half
 33954  	movd	xmm5, eax
 33955  	movzx	eax, word ptr [rcx + rsi + 6]
 33956  	movd	xmm6, eax
 33957  	movdqa	xmm0, xmm5
 33958  	pcmpgtb	xmm0, xmm2
 33959  	pmovsxbq	xmm0, xmm0
 33960  	movdqa	xmm1, xmm6
 33961  	pcmpgtb	xmm1, xmm2
 33962  	pmovsxbq	xmm1, xmm1
 33963  	pcmpeqb	xmm5, xmm2
 33964  	pxor	xmm5, xmm3
 33965  	pmovsxbq	xmm5, xmm5
 33966  	pcmpeqb	xmm6, xmm2
 33967  	pxor	xmm6, xmm3
 33968  	pmovsxbq	xmm6, xmm6
 33969  	blendvpd	xmm5, xmm4, xmm0
 33970  	movdqa	xmm0, xmm1
 33971  	blendvpd	xmm6, xmm4, xmm0
 33972  	movupd	xmmword ptr [r8 + 8*rsi + 32], xmm5
 33973  	movupd	xmmword ptr [r8 + 8*rsi + 48], xmm6
 33974  	add	rsi, 8                          # 8 elements handled per iteration
 33975  	add	rdi, 2
 33976  	jne	.LBB4_808
 33977  	jmp	.LBB4_1460
 33978  .LBB4_811:                              # int8 -> float32 sign loop
        # For each signed byte read from [rcx + rsi] a float is written to
        # [r8 + 4*rsi]: 1.0 if the byte is > 0, -1.0 if it is < 0, 0.0 if zero.
        # eax (presumably the element count; set outside this view) is rounded
        # down to a multiple of 8 into edx. r9 = number of 8-element groups and
        # rdi = -(r9 & -2) drives the 2x-unrolled loop. With a single group
        # (rsi == 0) the loop is skipped through .LBB4_1468 (not visible here).
 33979  	mov	edx, eax
 33980  	and	edx, -8
 33981  	lea	rsi, [rdx - 8]
 33982  	mov	r9, rsi
 33983  	shr	r9, 3
 33984  	add	r9, 1                           # r9 = (rdx - 8) / 8 + 1
 33985  	test	rsi, rsi
 33986  	je	.LBB4_1468
 33987  # %bb.812:
 33988  	mov	rdi, r9
 33989  	and	rdi, -2
 33990  	neg	rdi                             # negative count of group pairs
 33991  	xor	esi, esi                        # rsi = element index, starts at 0
 33992  	pxor	xmm2, xmm2                      # xmm2 = all zero
 33993  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 33994  	movaps	xmm4, xmmword ptr [rip + .LCPI4_19] # xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 33995  .LBB4_813:                              # =>This Inner Loop Header: Depth=1
 33996  	movd	xmm5, dword ptr [rcx + rsi]     # xmm5 = mem[0],zero,zero,zero
 33997  	movd	xmm6, dword ptr [rcx + rsi + 4] # xmm6 = mem[0],zero,zero,zero
 33998  	movdqa	xmm0, xmm5
 33999  	pcmpgtb	xmm0, xmm2                      # byte mask: value > 0 (signed)
 34000  	pmovsxbd	xmm0, xmm0              # widen the four low mask bytes to dwords
 34001  	movdqa	xmm1, xmm6
 34002  	pcmpgtb	xmm1, xmm2
 34003  	pmovsxbd	xmm1, xmm1
 34004  	pcmpeqb	xmm5, xmm2
 34005  	pxor	xmm5, xmm3                      # -1 where byte != 0, 0 where byte == 0
 34006  	pmovsxbd	xmm5, xmm5              # sign-extend to dword -1 / 0
 34007  	cvtdq2ps	xmm5, xmm5              # convert to float: -1.0 / 0.0
 34008  	pcmpeqb	xmm6, xmm2
 34009  	pxor	xmm6, xmm3
 34010  	pmovsxbd	xmm6, xmm6
 34011  	cvtdq2ps	xmm6, xmm6
 34012  	blendvps	xmm5, xmm4, xmm0        # take 1.0 where value > 0, else keep -1.0 / 0.0
 34013  	movdqa	xmm0, xmm1                      # blendvps reads its mask from xmm0
 34014  	blendvps	xmm6, xmm4, xmm0
 34015  	movups	xmmword ptr [r8 + 4*rsi], xmm5
 34016  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm6
 34017  	movd	xmm5, dword ptr [rcx + rsi + 8] # xmm5 = mem[0],zero,zero,zero (second unrolled half)
 34018  	movd	xmm6, dword ptr [rcx + rsi + 12] # xmm6 = mem[0],zero,zero,zero
 34019  	movdqa	xmm0, xmm5
 34020  	pcmpgtb	xmm0, xmm2
 34021  	pmovsxbd	xmm0, xmm0
 34022  	movdqa	xmm1, xmm6
 34023  	pcmpgtb	xmm1, xmm2
 34024  	pmovsxbd	xmm1, xmm1
 34025  	pcmpeqb	xmm5, xmm2
 34026  	pxor	xmm5, xmm3
 34027  	pmovsxbd	xmm5, xmm5
 34028  	cvtdq2ps	xmm5, xmm5
 34029  	pcmpeqb	xmm6, xmm2
 34030  	pxor	xmm6, xmm3
 34031  	pmovsxbd	xmm6, xmm6
 34032  	cvtdq2ps	xmm6, xmm6
 34033  	blendvps	xmm5, xmm4, xmm0
 34034  	movdqa	xmm0, xmm1
 34035  	blendvps	xmm6, xmm4, xmm0
 34036  	movups	xmmword ptr [r8 + 4*rsi + 32], xmm5
 34037  	movups	xmmword ptr [r8 + 4*rsi + 48], xmm6
 34038  	add	rsi, 16                         # 16 elements handled per iteration
 34039  	add	rdi, 2
 34040  	jne	.LBB4_813
 34041  	jmp	.LBB4_1469
 34042  .LBB4_816:                              # qword loop: out[i] = (in[i] != 0) ? 1 : 0
        # Reads 64-bit elements from [rcx + 8*rsi] and stores 64-bit results at
        # [r8 + 8*rsi]. r10d (presumably the element count; set outside this
        # view) is rounded down to a multiple of 4 into edx.
        # r9 = number of 4-element groups; rdi = -(r9 & -2) drives the
        # 2x-unrolled loop. With a single group (rsi == 0) the loop is skipped
        # through .LBB4_1490 (not visible here).
 34043  	mov	edx, r10d
 34044  	and	edx, -4
 34045  	lea	rsi, [rdx - 4]
 34046  	mov	r9, rsi
 34047  	shr	r9, 2
 34048  	add	r9, 1                           # r9 = (rdx - 4) / 4 + 1
 34049  	test	rsi, rsi
 34050  	je	.LBB4_1490
 34051  # %bb.817:
 34052  	mov	rdi, r9
 34053  	and	rdi, -2
 34054  	neg	rdi                             # negative count of group pairs
 34055  	xor	esi, esi                        # rsi = element index, starts at 0
 34056  	pxor	xmm0, xmm0                      # xmm0 = all zero, compare operand
 34057  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_15] # xmm1 = [1,1]
 34058  .LBB4_818:                              # =>This Inner Loop Header: Depth=1
 34059  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi]
 34060  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 16]
 34061  	pcmpeqq	xmm2, xmm0                      # all ones where the qword is zero
 34062  	pandn	xmm2, xmm1                      # (~mask) & 1 -> 1 where the qword is nonzero
 34063  	pcmpeqq	xmm3, xmm0
 34064  	pandn	xmm3, xmm1
 34065  	movdqu	xmmword ptr [r8 + 8*rsi], xmm2
 34066  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm3
 34067  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi + 32] # second unrolled half
 34068  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 48]
 34069  	pcmpeqq	xmm2, xmm0
 34070  	pandn	xmm2, xmm1
 34071  	pcmpeqq	xmm3, xmm0
 34072  	pandn	xmm3, xmm1
 34073  	movdqu	xmmword ptr [r8 + 8*rsi + 32], xmm2
 34074  	movdqu	xmmword ptr [r8 + 8*rsi + 48], xmm3
 34075  	add	rsi, 8                          # 8 elements handled per iteration
 34076  	add	rdi, 2
 34077  	jne	.LBB4_818
 34078  	jmp	.LBB4_1491
 34079  .LBB4_843:                              # int64 sign loop
        # For each signed qword read from [rcx + 8*rsi] a qword is written to
        # [r8 + 8*rsi]: 1 if the value is > 0, -1 if it is < 0, 0 if it is zero.
        # r11d (presumably the element count; set outside this view) is rounded
        # down to a multiple of 4 into edx. r9 = number of 4-element groups and
        # rdi = -(r9 & -2) drives the 2x-unrolled loop. With a single group
        # (rsi == 0) the loop is skipped through .LBB4_1498 (not visible here).
 34080  	mov	edx, r11d
 34081  	and	edx, -4
 34082  	lea	rsi, [rdx - 4]
 34083  	mov	r9, rsi
 34084  	shr	r9, 2
 34085  	add	r9, 1                           # r9 = (rdx - 4) / 4 + 1
 34086  	test	rsi, rsi
 34087  	je	.LBB4_1498
 34088  # %bb.844:
 34089  	mov	rdi, r9
 34090  	and	rdi, -2
 34091  	neg	rdi                             # negative count of group pairs
 34092  	xor	esi, esi                        # rsi = element index, starts at 0
 34093  	pxor	xmm2, xmm2                      # xmm2 = all zero
 34094  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones (used to invert masks)
 34095  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 34096  .LBB4_845:                              # =>This Inner Loop Header: Depth=1
 34097  	movdqu	xmm5, xmmword ptr [rcx + 8*rsi]
 34098  	movdqu	xmm6, xmmword ptr [rcx + 8*rsi + 16]
 34099  	movdqa	xmm0, xmm4
 34100  	pcmpgtq	xmm0, xmm5                      # mask: 1 > value, i.e. value <= 0 (signed)
 34101  	pcmpeqq	xmm5, xmm2
 34102  	pxor	xmm5, xmm3                      # -1 where value != 0, 0 where value == 0
 34103  	movdqa	xmm1, xmm4
 34104  	pcmpgtq	xmm1, xmm6
 34105  	pcmpeqq	xmm6, xmm2
 34106  	pxor	xmm6, xmm3
 34107  	movdqa	xmm7, xmm4                      # start from 1 (the value > 0 result)
 34108  	blendvpd	xmm7, xmm5, xmm0        # where value <= 0 take -1 / 0 instead
 34109  	movdqa	xmm5, xmm4
 34110  	movdqa	xmm0, xmm1                      # blendvpd reads its mask from xmm0
 34111  	blendvpd	xmm5, xmm6, xmm0
 34112  	movupd	xmmword ptr [r8 + 8*rsi], xmm7
 34113  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm5
 34114  	movdqu	xmm5, xmmword ptr [rcx + 8*rsi + 32] # second unrolled half
 34115  	movdqu	xmm6, xmmword ptr [rcx + 8*rsi + 48]
 34116  	movdqa	xmm0, xmm4
 34117  	pcmpgtq	xmm0, xmm5
 34118  	pcmpeqq	xmm5, xmm2
 34119  	pxor	xmm5, xmm3
 34120  	movdqa	xmm1, xmm4
 34121  	pcmpgtq	xmm1, xmm6
 34122  	pcmpeqq	xmm6, xmm2
 34123  	pxor	xmm6, xmm3
 34124  	movdqa	xmm7, xmm4
 34125  	blendvpd	xmm7, xmm5, xmm0
 34126  	movdqa	xmm5, xmm4
 34127  	movdqa	xmm0, xmm1
 34128  	blendvpd	xmm5, xmm6, xmm0
 34129  	movupd	xmmword ptr [r8 + 8*rsi + 32], xmm7
 34130  	movupd	xmmword ptr [r8 + 8*rsi + 48], xmm5
 34131  	add	rsi, 8                          # 8 elements handled per iteration
 34132  	add	rdi, 2
 34133  	jne	.LBB4_845
 34134  	jmp	.LBB4_1499
 34135  .LBB4_989:                              # scalar float select-and-store
        # Stores one 32-bit float at [r8 + 4*rax]. The value is chosen from
        # constants using the flags left by a comparison that happens before this
        # block is entered (outside this view); none of the instructions here
        # modify flags before the jle. The values of .LCPI4_14 and .LCPI4_5 are
        # defined elsewhere in the file and are not visible here.
 34136  	movss	xmm0, dword ptr [rip + .LCPI4_14] # xmm0 = mem[0],zero,zero,zero
 34137  .LBB4_990:                              # also a jump target; presumably reached with xmm0 already loaded -- TODO confirm at the jump sites
 34138  	jle	.LBB4_992                       # keep the current xmm0 when the earlier compare was "less or equal"
 34139  # %bb.991:
 34140  	movss	xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero
 34141  .LBB4_992:
 34142  	movss	dword ptr [r8 + 4*rax], xmm0    # write the selected value to element rax
 34143  	jmp	.LBB4_1655
 34144  .LBB4_866:                              # float32 sign loop
        # For each float read from [rcx + 4*rsi] a float is written to
        # [r8 + 4*rsi]: +1.0 or -1.0 according to the input's sign bit when the
        # input compares not-equal to 0.0, otherwise all-zero bits (+0.0).
        # cmpneqps is also true for unordered operands, so NaN inputs produce
        # +/-1.0 here rather than NaN.
        # eax (presumably the element count; set outside this view) is rounded
        # down to a multiple of 8 into edx; the loop runs until rsi reaches rdx.
 34145  	mov	edx, eax
 34146  	and	edx, -8
 34147  	xor	esi, esi                        # rsi = element index, starts at 0
 34148  	xorps	xmm0, xmm0                      # xmm0 = 0.0 in every lane
 34149  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_8] # xmm1 = [1,1,1,1]
 34150  .LBB4_867:                              # =>This Inner Loop Header: Depth=1
 34151  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi]
 34152  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 16]
 34153  	movdqa	xmm4, xmm2
 34154  	psrad	xmm4, 31                        # all ones if the sign bit is set, else 0
 34155  	por	xmm4, xmm1                      # integer -1 or +1
 34156  	movdqa	xmm5, xmm3
 34157  	psrad	xmm5, 31
 34158  	por	xmm5, xmm1
 34159  	cvtdq2ps	xmm4, xmm4              # float -1.0 or +1.0
 34160  	cvtdq2ps	xmm5, xmm5
 34161  	cmpneqps	xmm2, xmm0              # lane mask: input != 0.0 (true for NaN as well)
 34162  	andps	xmm2, xmm4                      # +/-1.0 where nonzero, +0.0 otherwise
 34163  	cmpneqps	xmm3, xmm0
 34164  	andps	xmm3, xmm5
 34165  	movups	xmmword ptr [r8 + 4*rsi], xmm2
 34166  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm3
 34167  	add	rsi, 8                          # 8 elements handled per iteration
 34168  	cmp	rdx, rsi
 34169  	jne	.LBB4_867
 34170  # %bb.868:
 34171  	cmp	rdx, rax                        # was the whole count covered by the vector loop?
 34172  	je	.LBB4_1655
 34173  	jmp	.LBB4_869                       # otherwise continue at .LBB4_869 (not visible here; presumably the remainder)
 34174  .LBB4_876:                              # byte -> qword loop: out[i] = (in[i] != 0) ? 1 : 0
        # Reads bytes from [rcx + rsi] and stores one 64-bit result per byte at
        # [r8 + 8*rsi]. r10d (presumably the element count; set outside this
        # view) is rounded down to a multiple of 4 into edx.
        # r9 = number of 4-element groups; rdi = -(r9 & -2) drives the
        # 2x-unrolled loop. With a single group (rsi == 0) the loop is skipped
        # through .LBB4_1507 (not visible here).
 34175  	mov	edx, r10d
 34176  	and	edx, -4
 34177  	lea	rsi, [rdx - 4]
 34178  	mov	r9, rsi
 34179  	shr	r9, 2
 34180  	add	r9, 1                           # r9 = (rdx - 4) / 4 + 1
 34181  	test	rsi, rsi
 34182  	je	.LBB4_1507
 34183  # %bb.877:
 34184  	mov	rdi, r9
 34185  	and	rdi, -2
 34186  	neg	rdi                             # negative count of group pairs
 34187  	xor	esi, esi                        # rsi = element index, starts at 0
 34188  	pxor	xmm0, xmm0                      # xmm0 = all zero, compare operand
 34189  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones (used to invert masks)
 34190  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_15] # xmm2 = [1,1]
 34191  .LBB4_878:                              # =>This Inner Loop Header: Depth=1
 34192  	movzx	eax, word ptr [rcx + rsi]       # two input bytes
 34193  	movd	xmm3, eax
 34194  	movzx	eax, word ptr [rcx + rsi + 2]   # next two input bytes
 34195  	movd	xmm4, eax
 34196  	pcmpeqb	xmm3, xmm0
 34197  	pxor	xmm3, xmm1                      # 0xFF where byte != 0, 0 where byte == 0
 34198  	pmovzxbq	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
 34199  	pand	xmm3, xmm2                      # keep only bit 0 -> 1 / 0 per qword
 34200  	pcmpeqb	xmm4, xmm0
 34201  	pxor	xmm4, xmm1
 34202  	pmovzxbq	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 34203  	pand	xmm4, xmm2
 34204  	movdqu	xmmword ptr [r8 + 8*rsi], xmm3
 34205  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm4
 34206  	movzx	eax, word ptr [rcx + rsi + 4]   # second unrolled half
 34207  	movd	xmm3, eax
 34208  	movzx	eax, word ptr [rcx + rsi + 6]
 34209  	movd	xmm4, eax
 34210  	pcmpeqb	xmm3, xmm0
 34211  	pxor	xmm3, xmm1
 34212  	pmovzxbq	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
 34213  	pand	xmm3, xmm2
 34214  	pcmpeqb	xmm4, xmm0
 34215  	pxor	xmm4, xmm1
 34216  	pmovzxbq	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 34217  	pand	xmm4, xmm2
 34218  	movdqu	xmmword ptr [r8 + 8*rsi + 32], xmm3
 34219  	movdqu	xmmword ptr [r8 + 8*rsi + 48], xmm4
 34220  	add	rsi, 8                          # 8 elements handled per iteration
 34221  	add	rdi, 2
 34222  	jne	.LBB4_878
 34223  	jmp	.LBB4_1508
# Vector path: 8-bit source elements at [rcx] -> 32-bit float results at [r8].
# Each result is 1.0 when the source byte is non-zero, otherwise 0.0
# (the 0/1 integer lanes are converted with cvtdq2ps).
# NOTE(review): presumably the unsigned-input case of a sign-style kernel - confirm against the dispatch code.
34224  .LBB4_881:
	# rdx = eax rounded down to a multiple of 8; r9 = rdx / 8 = number of 8-element groups
34225  	mov	edx, eax
34226  	and	edx, -8
34227  	lea	rsi, [rdx - 8]
34228  	mov	r9, rsi
34229  	shr	r9, 3
34230  	add	r9, 1
	# rsi = rdx - 8 is zero when there is exactly one group: bypass the 2x-unrolled loop
34231  	test	rsi, rsi
34232  	je	.LBB4_1515
34233  # %bb.882:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34234  	mov	rdi, r9
34235  	and	rdi, -2
34236  	neg	rdi
	# rsi = element index; xmm0 = zero, xmm1 = all-ones, xmm2 = per-lane constant 1
34237  	xor	esi, esi
34238  	pxor	xmm0, xmm0
34239  	pcmpeqd	xmm1, xmm1
34240  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1]
34241  .LBB4_883:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rsi .. rsi+7, four bytes per vector
34242  	movd	xmm3, dword ptr [rcx + rsi]     # xmm3 = mem[0],zero,zero,zero
34243  	movd	xmm4, dword ptr [rcx + rsi + 4] # xmm4 = mem[0],zero,zero,zero
	# "byte != 0" mask -> widen to dwords -> keep bit 0 -> convert integer 0/1 to float
34244  	pcmpeqb	xmm3, xmm0
34245  	pxor	xmm3, xmm1
34246  	pmovzxbd	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
34247  	pand	xmm3, xmm2
34248  	cvtdq2ps	xmm3, xmm3
34249  	pcmpeqb	xmm4, xmm0
34250  	pxor	xmm4, xmm1
34251  	pmovzxbd	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
34252  	pand	xmm4, xmm2
34253  	cvtdq2ps	xmm4, xmm4
34254  	movups	xmmword ptr [r8 + 4*rsi], xmm3
34255  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm4
	# second half: elements rsi+8 .. rsi+15, same computation
34256  	movd	xmm3, dword ptr [rcx + rsi + 8] # xmm3 = mem[0],zero,zero,zero
34257  	movd	xmm4, dword ptr [rcx + rsi + 12] # xmm4 = mem[0],zero,zero,zero
34258  	pcmpeqb	xmm3, xmm0
34259  	pxor	xmm3, xmm1
34260  	pmovzxbd	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
34261  	pand	xmm3, xmm2
34262  	cvtdq2ps	xmm3, xmm3
34263  	pcmpeqb	xmm4, xmm0
34264  	pxor	xmm4, xmm1
34265  	pmovzxbd	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
34266  	pand	xmm4, xmm2
34267  	cvtdq2ps	xmm4, xmm4
34268  	movups	xmmword ptr [r8 + 4*rsi + 32], xmm3
34269  	movups	xmmword ptr [r8 + 4*rsi + 48], xmm4
	# 16 elements consumed per iteration
34270  	add	rsi, 16
34271  	add	rdi, 2
34272  	jne	.LBB4_883
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34273  	jmp	.LBB4_1516
# Vector path: 32-bit integer source elements at [rcx] -> 8-bit results at [r8].
# Each result is 1 when the source dword is non-zero (integer compare, pcmpeqd), otherwise 0.
# NOTE(review): presumably the unsigned-input case of a sign-style kernel - confirm against the dispatch code.
34274  .LBB4_892:
	# rdx = eax rounded down to a multiple of 8; r9 = rdx / 8 = number of 8-element groups
34275  	mov	edx, eax
34276  	and	edx, -8
34277  	lea	rsi, [rdx - 8]
34278  	mov	r9, rsi
34279  	shr	r9, 3
34280  	add	r9, 1
	# rsi = rdx - 8 is zero when there is exactly one group: bypass the 2x-unrolled loop
34281  	test	rsi, rsi
34282  	je	.LBB4_1535
34283  # %bb.893:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34284  	mov	rdi, r9
34285  	and	rdi, -2
34286  	neg	rdi
	# rsi = element index; xmm0 = zero, xmm1 = all-ones, xmm2 = byte constant 1 in the low four lanes
34287  	xor	esi, esi
34288  	pxor	xmm0, xmm0
34289  	pcmpeqd	xmm1, xmm1
34290  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_12] # xmm2 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u>
34291  .LBB4_894:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rsi .. rsi+7, four dwords per vector
34292  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi]
34293  	movdqu	xmm4, xmmword ptr [rcx + 4*rsi + 16]
	# "dword != 0" mask, narrowed dword -> word -> byte with signed saturation, then reduced to 0/1
34294  	pcmpeqd	xmm3, xmm0
34295  	pxor	xmm3, xmm1
34296  	packssdw	xmm3, xmm3
34297  	packsswb	xmm3, xmm3
34298  	pand	xmm3, xmm2
34299  	pcmpeqd	xmm4, xmm0
34300  	pxor	xmm4, xmm1
34301  	packssdw	xmm4, xmm4
34302  	packsswb	xmm4, xmm4
34303  	pand	xmm4, xmm2
	# only the low 4 result bytes of each vector are stored
34304  	movd	dword ptr [r8 + rsi], xmm3
34305  	movd	dword ptr [r8 + rsi + 4], xmm4
	# second half: elements rsi+8 .. rsi+15, same computation
34306  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 32]
34307  	movdqu	xmm4, xmmword ptr [rcx + 4*rsi + 48]
34308  	pcmpeqd	xmm3, xmm0
34309  	pxor	xmm3, xmm1
34310  	packssdw	xmm3, xmm3
34311  	packsswb	xmm3, xmm3
34312  	pand	xmm3, xmm2
34313  	pcmpeqd	xmm4, xmm0
34314  	pxor	xmm4, xmm1
34315  	packssdw	xmm4, xmm4
34316  	packsswb	xmm4, xmm4
34317  	pand	xmm4, xmm2
34318  	movd	dword ptr [r8 + rsi + 8], xmm3
34319  	movd	dword ptr [r8 + rsi + 12], xmm4
	# 16 elements consumed per iteration
34320  	add	rsi, 16
34321  	add	rdi, 2
34322  	jne	.LBB4_894
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34323  	jmp	.LBB4_1536
# Vector path: 64-bit double source elements at [rcx] -> 8-bit results at [r8].
# Each result is 0 when the double compares equal to 0.0; otherwise it is the low byte of
# the integer conversion of (1.0 carrying the sign bit of the input), i.e. +1 or -1.
# NOTE(review): looks like the float64 -> int8 case of a sign kernel - confirm against the dispatch code.
34324  .LBB4_897:
	# rdx = eax rounded down to a multiple of 4; r9 = rdx / 4 = number of 4-element groups
34325  	mov	edx, eax
34326  	and	edx, -4
34327  	lea	rsi, [rdx - 4]
34328  	mov	r9, rsi
34329  	shr	r9, 2
34330  	add	r9, 1
	# rsi = rdx - 4 is zero when there is exactly one group: bypass the 2x-unrolled loop
34331  	test	rsi, rsi
34332  	je	.LBB4_1543
34333  # %bb.898:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34334  	mov	rdi, r9
34335  	and	rdi, -2
34336  	neg	rdi
	# rsi = element index; xmm2 = 0.0, xmm3 = sign-bit mask, xmm4 = 1.0, xmm5 = pshufb control picking bytes 0 and 4
34337  	xor	esi, esi
34338  	xorpd	xmm2, xmm2
34339  	movapd	xmm3, xmmword ptr [rip + .LCPI4_0] # xmm3 = [-0.0E+0,-0.0E+0]
34340  	movapd	xmm4, xmmword ptr [rip + .LCPI4_1] # xmm4 = [1.0E+0,1.0E+0]
34341  	movdqa	xmm5, xmmword ptr [rip + .LCPI4_7] # xmm5 = <0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
34342  .LBB4_899:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rsi .. rsi+3, two doubles per vector
34343  	movupd	xmm6, xmmword ptr [rcx + 8*rsi]
34344  	movupd	xmm7, xmmword ptr [rcx + 8*rsi + 16]
	# xmm0 / xmm1 = "x == 0.0" masks narrowed to one byte per element (blend selectors)
34345  	movapd	xmm0, xmm6
34346  	cmpeqpd	xmm0, xmm2
34347  	packssdw	xmm0, xmm0
34348  	packssdw	xmm0, xmm0
34349  	packsswb	xmm0, xmm0
34350  	movapd	xmm1, xmm7
34351  	cmpeqpd	xmm1, xmm2
34352  	packssdw	xmm1, xmm1
34353  	packssdw	xmm1, xmm1
34354  	packsswb	xmm1, xmm1
	# keep only the sign bit of x and OR in 1.0 -> +/-1.0
34355  	andpd	xmm6, xmm3
34356  	orpd	xmm6, xmm4
34357  	andpd	xmm7, xmm3
34358  	orpd	xmm7, xmm4
	# convert to int32 and gather the low byte of each of the two results into bytes 0..1
34359  	cvttpd2dq	xmm6, xmm6
34360  	pshufb	xmm6, xmm5
34361  	cvttpd2dq	xmm7, xmm7
34362  	pshufb	xmm7, xmm5
	# pblendvb uses xmm0 as its implicit selector: lanes where x == 0.0 take zero from xmm2
34363  	pblendvb	xmm6, xmm2, xmm0
34364  	movdqa	xmm0, xmm1
34365  	pblendvb	xmm7, xmm2, xmm0
	# each pextrw stores two result bytes
34366  	pextrw	word ptr [r8 + rsi], xmm6, 0
34367  	pextrw	word ptr [r8 + rsi + 2], xmm7, 0
	# second half: elements rsi+4 .. rsi+7, same computation
34368  	movupd	xmm6, xmmword ptr [rcx + 8*rsi + 32]
34369  	movupd	xmm7, xmmword ptr [rcx + 8*rsi + 48]
34370  	movapd	xmm0, xmm6
34371  	cmpeqpd	xmm0, xmm2
34372  	packssdw	xmm0, xmm0
34373  	packssdw	xmm0, xmm0
34374  	packsswb	xmm0, xmm0
34375  	movapd	xmm1, xmm7
34376  	cmpeqpd	xmm1, xmm2
34377  	packssdw	xmm1, xmm1
34378  	packssdw	xmm1, xmm1
34379  	packsswb	xmm1, xmm1
34380  	andpd	xmm6, xmm3
34381  	orpd	xmm6, xmm4
34382  	andpd	xmm7, xmm3
34383  	orpd	xmm7, xmm4
34384  	cvttpd2dq	xmm6, xmm6
34385  	pshufb	xmm6, xmm5
34386  	cvttpd2dq	xmm7, xmm7
34387  	pshufb	xmm7, xmm5
34388  	pblendvb	xmm6, xmm2, xmm0
34389  	movdqa	xmm0, xmm1
34390  	pblendvb	xmm7, xmm2, xmm0
34391  	pextrw	word ptr [r8 + rsi + 4], xmm6, 0
34392  	pextrw	word ptr [r8 + rsi + 6], xmm7, 0
	# 8 elements consumed per iteration
34393  	add	rsi, 8
34394  	add	rdi, 2
34395  	jne	.LBB4_899
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34396  	jmp	.LBB4_1544
# Vector path: signed 8-bit source elements at [rcx] -> 8-bit results at [r8].
# Each result is 1 when the byte is > 0, 0 when it is 0, and 0xFF (-1) when it is < 0
# (signed compare via pcmpgtb).
# NOTE(review): looks like the int8 case of a sign kernel - confirm against the dispatch code.
34397  .LBB4_902:
	# rsi = r10d rounded down to a multiple of 32; r9 = rsi / 32 = number of 32-element groups
34398  	mov	esi, r10d
34399  	and	esi, -32
34400  	lea	rax, [rsi - 32]
34401  	mov	r9, rax
34402  	shr	r9, 5
34403  	add	r9, 1
	# rax = rsi - 32 is zero when there is exactly one group: bypass the 2x-unrolled loop
34404  	test	rax, rax
34405  	je	.LBB4_1552
34406  # %bb.903:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34407  	mov	rdi, r9
34408  	and	rdi, -2
34409  	neg	rdi
	# rax = element index; xmm2 = zero, xmm3 = all-ones, xmm4 = byte constant 1 in every lane
34410  	xor	eax, eax
34411  	pxor	xmm2, xmm2
34412  	pcmpeqd	xmm3, xmm3
34413  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_22] # xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
34414  .LBB4_904:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rax .. rax+31, sixteen bytes per vector
34415  	movdqu	xmm5, xmmword ptr [rcx + rax]
34416  	movdqu	xmm6, xmmword ptr [rcx + rax + 16]
	# xmm0 / xmm1 = "1 > x" (x <= 0) selectors; xmm5 / xmm6 become "x != 0" masks (0xFF or 0)
34417  	movdqa	xmm0, xmm4
34418  	pcmpgtb	xmm0, xmm5
34419  	pcmpeqb	xmm5, xmm2
34420  	pxor	xmm5, xmm3
34421  	movdqa	xmm1, xmm4
34422  	pcmpgtb	xmm1, xmm6
34423  	pcmpeqb	xmm6, xmm2
34424  	pxor	xmm6, xmm3
	# start from 1 everywhere; where x <= 0 (implicit selector xmm0) take the "x != 0" mask instead
34425  	movdqa	xmm7, xmm4
34426  	pblendvb	xmm7, xmm5, xmm0
34427  	movdqa	xmm5, xmm4
34428  	movdqa	xmm0, xmm1
34429  	pblendvb	xmm5, xmm6, xmm0
34430  	movdqu	xmmword ptr [r8 + rax], xmm7
34431  	movdqu	xmmword ptr [r8 + rax + 16], xmm5
	# second half: elements rax+32 .. rax+63, same computation
34432  	movdqu	xmm5, xmmword ptr [rcx + rax + 32]
34433  	movdqu	xmm6, xmmword ptr [rcx + rax + 48]
34434  	movdqa	xmm0, xmm4
34435  	pcmpgtb	xmm0, xmm5
34436  	pcmpeqb	xmm5, xmm2
34437  	pxor	xmm5, xmm3
34438  	movdqa	xmm1, xmm4
34439  	pcmpgtb	xmm1, xmm6
34440  	pcmpeqb	xmm6, xmm2
34441  	pxor	xmm6, xmm3
34442  	movdqa	xmm7, xmm4
34443  	pblendvb	xmm7, xmm5, xmm0
34444  	movdqa	xmm5, xmm4
34445  	movdqa	xmm0, xmm1
34446  	pblendvb	xmm5, xmm6, xmm0
34447  	movdqu	xmmword ptr [r8 + rax + 32], xmm7
34448  	movdqu	xmmword ptr [r8 + rax + 48], xmm5
	# 64 elements consumed per iteration
34449  	add	rax, 64
34450  	add	rdi, 2
34451  	jne	.LBB4_904
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34452  	jmp	.LBB4_1553
# Vector path: 64-bit integer source elements at [rcx] -> 8-bit results at [r8].
# Each result is 1 when the source qword is non-zero (pcmpeqq), otherwise 0.
# NOTE(review): presumably the unsigned-input case of a sign-style kernel - confirm against the dispatch code.
34453  .LBB4_907:
	# rdx = eax rounded down to a multiple of 4; r9 = rdx / 4 = number of 4-element groups
34454  	mov	edx, eax
34455  	and	edx, -4
34456  	lea	rsi, [rdx - 4]
34457  	mov	r9, rsi
34458  	shr	r9, 2
34459  	add	r9, 1
	# rsi = rdx - 4 is zero when there is exactly one group: bypass the 2x-unrolled loop
34460  	test	rsi, rsi
34461  	je	.LBB4_1561
34462  # %bb.908:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34463  	mov	rdi, r9
34464  	and	rdi, -2
34465  	neg	rdi
	# rsi = element index; xmm0 = zero, xmm1 = all-ones, xmm2 = byte constant 1 in the low two lanes
34466  	xor	esi, esi
34467  	pxor	xmm0, xmm0
34468  	pcmpeqd	xmm1, xmm1
34469  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_18] # xmm2 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
34470  .LBB4_909:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rsi .. rsi+3, two qwords per vector
34471  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi]
34472  	movdqu	xmm4, xmmword ptr [rcx + 8*rsi + 16]
	# "qword != 0" mask, narrowed qword -> byte by three saturating packs, then reduced to 0/1
34473  	pcmpeqq	xmm3, xmm0
34474  	pxor	xmm3, xmm1
34475  	packssdw	xmm3, xmm3
34476  	packssdw	xmm3, xmm3
34477  	packsswb	xmm3, xmm3
34478  	pand	xmm3, xmm2
34479  	pcmpeqq	xmm4, xmm0
34480  	pxor	xmm4, xmm1
34481  	packssdw	xmm4, xmm4
34482  	packssdw	xmm4, xmm4
34483  	packsswb	xmm4, xmm4
	# each pextrw stores two result bytes
34484  	pextrw	word ptr [r8 + rsi], xmm3, 0
34485  	pand	xmm4, xmm2
34486  	pextrw	word ptr [r8 + rsi + 2], xmm4, 0
	# second half: elements rsi+4 .. rsi+7, same computation
34487  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 32]
34488  	movdqu	xmm4, xmmword ptr [rcx + 8*rsi + 48]
34489  	pcmpeqq	xmm3, xmm0
34490  	pxor	xmm3, xmm1
34491  	packssdw	xmm3, xmm3
34492  	packssdw	xmm3, xmm3
34493  	packsswb	xmm3, xmm3
34494  	pand	xmm3, xmm2
34495  	pcmpeqq	xmm4, xmm0
34496  	pxor	xmm4, xmm1
34497  	packssdw	xmm4, xmm4
34498  	packssdw	xmm4, xmm4
34499  	packsswb	xmm4, xmm4
34500  	pextrw	word ptr [r8 + rsi + 4], xmm3, 0
34501  	pand	xmm4, xmm2
34502  	pextrw	word ptr [r8 + rsi + 6], xmm4, 0
	# 8 elements consumed per iteration
34503  	add	rsi, 8
34504  	add	rdi, 2
34505  	jne	.LBB4_909
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34506  	jmp	.LBB4_1562
# Vector path: 16-bit source elements at [rcx] -> 8-bit results at [r8].
# Each result is 1 when the source word is non-zero (pcmpeqw), otherwise 0.
# NOTE(review): presumably the unsigned-input case of a sign-style kernel - confirm against the dispatch code.
34507  .LBB4_912:
	# rdx = eax rounded down to a multiple of 16; r9 = rdx / 16 = number of 16-element groups
34508  	mov	edx, eax
34509  	and	edx, -16
34510  	lea	rsi, [rdx - 16]
34511  	mov	r9, rsi
34512  	shr	r9, 4
34513  	add	r9, 1
	# rsi = rdx - 16 is zero when there is exactly one group: bypass the 2x-unrolled loop
34514  	test	rsi, rsi
34515  	je	.LBB4_1569
34516  # %bb.913:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34517  	mov	rdi, r9
34518  	and	rdi, -2
34519  	neg	rdi
	# rsi = element index; xmm0 = zero, xmm1 = all-ones, xmm2 = byte constant 1 in the low eight lanes
34520  	xor	esi, esi
34521  	pxor	xmm0, xmm0
34522  	pcmpeqd	xmm1, xmm1
34523  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_21] # xmm2 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u>
34524  .LBB4_914:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rsi .. rsi+15, eight words per vector
34525  	movdqu	xmm3, xmmword ptr [rcx + 2*rsi]
34526  	movdqu	xmm4, xmmword ptr [rcx + 2*rsi + 16]
	# "word != 0" mask, narrowed to bytes, reduced to 0/1
34527  	pcmpeqw	xmm3, xmm0
34528  	pxor	xmm3, xmm1
34529  	packsswb	xmm3, xmm3
34530  	pand	xmm3, xmm2
34531  	pcmpeqw	xmm4, xmm0
34532  	pxor	xmm4, xmm1
34533  	packsswb	xmm4, xmm4
34534  	pand	xmm4, xmm2
	# join the two 8-byte halves into one 16-byte store
34535  	punpcklqdq	xmm3, xmm4              # xmm3 = xmm3[0],xmm4[0]
34536  	movdqu	xmmword ptr [r8 + rsi], xmm3
	# second half: elements rsi+16 .. rsi+31, same computation
34537  	movdqu	xmm3, xmmword ptr [rcx + 2*rsi + 32]
34538  	movdqu	xmm4, xmmword ptr [rcx + 2*rsi + 48]
34539  	pcmpeqw	xmm3, xmm0
34540  	pxor	xmm3, xmm1
34541  	packsswb	xmm3, xmm3
34542  	pand	xmm3, xmm2
34543  	pcmpeqw	xmm4, xmm0
34544  	pxor	xmm4, xmm1
34545  	packsswb	xmm4, xmm4
34546  	pand	xmm4, xmm2
34547  	punpcklqdq	xmm3, xmm4              # xmm3 = xmm3[0],xmm4[0]
34548  	movdqu	xmmword ptr [r8 + rsi + 16], xmm3
	# 32 elements consumed per iteration
34549  	add	rsi, 32
34550  	add	rdi, 2
34551  	jne	.LBB4_914
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34552  	jmp	.LBB4_1570
# Vector path: signed 16-bit source elements at [rcx] -> 8-bit results at [r8].
# Each result is 1 when the word is > 0 (signed pcmpgtw), 0 when it is 0, and 0xFF (-1) otherwise.
# NOTE(review): looks like the int16 -> int8 case of a sign kernel - confirm against the dispatch code.
34553  .LBB4_917:
	# rsi = r10d rounded down to a multiple of 16; r9 = rsi / 16 = number of 16-element groups
34554  	mov	esi, r10d
34555  	and	esi, -16
34556  	lea	rax, [rsi - 16]
34557  	mov	r9, rax
34558  	shr	r9, 4
34559  	add	r9, 1
	# rax = rsi - 16 is zero when there is exactly one group: bypass the 2x-unrolled loop
34560  	test	rax, rax
34561  	je	.LBB4_1577
34562  # %bb.918:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34563  	mov	rdi, r9
34564  	and	rdi, -2
34565  	neg	rdi
	# rax = element index; xmm2 = zero, xmm3 = all-ones, xmm4 = byte constant 1 in the low eight lanes
34566  	xor	eax, eax
34567  	pxor	xmm2, xmm2
34568  	pcmpeqd	xmm3, xmm3
34569  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_21] # xmm4 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u>
34570  .LBB4_919:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rax .. rax+15, eight words per vector
34571  	movdqu	xmm5, xmmword ptr [rcx + 2*rax]
34572  	movdqu	xmm6, xmmword ptr [rcx + 2*rax + 16]
	# xmm0 / xmm1 = "x > 0" selectors narrowed to bytes
34573  	movdqa	xmm0, xmm5
34574  	pcmpgtw	xmm0, xmm2
34575  	packsswb	xmm0, xmm0
34576  	movdqa	xmm1, xmm6
34577  	pcmpgtw	xmm1, xmm2
34578  	packsswb	xmm1, xmm1
	# xmm5 / xmm6 = "x != 0" masks narrowed to bytes (0xFF or 0)
34579  	pcmpeqw	xmm5, xmm2
34580  	pxor	xmm5, xmm3
34581  	packsswb	xmm5, xmm5
34582  	pcmpeqw	xmm6, xmm2
34583  	pxor	xmm6, xmm3
34584  	packsswb	xmm6, xmm6
	# where x > 0 (implicit selector xmm0) replace the mask byte with 1
34585  	pblendvb	xmm5, xmm4, xmm0
34586  	movdqa	xmm0, xmm1
34587  	pblendvb	xmm6, xmm4, xmm0
34588  	punpcklqdq	xmm5, xmm6              # xmm5 = xmm5[0],xmm6[0]
34589  	movdqu	xmmword ptr [r8 + rax], xmm5
	# second half: elements rax+16 .. rax+31, same computation
34590  	movdqu	xmm5, xmmword ptr [rcx + 2*rax + 32]
34591  	movdqu	xmm6, xmmword ptr [rcx + 2*rax + 48]
34592  	movdqa	xmm0, xmm5
34593  	pcmpgtw	xmm0, xmm2
34594  	packsswb	xmm0, xmm0
34595  	movdqa	xmm1, xmm6
34596  	pcmpgtw	xmm1, xmm2
34597  	packsswb	xmm1, xmm1
34598  	pcmpeqw	xmm5, xmm2
34599  	pxor	xmm5, xmm3
34600  	packsswb	xmm5, xmm5
34601  	pcmpeqw	xmm6, xmm2
34602  	pxor	xmm6, xmm3
34603  	packsswb	xmm6, xmm6
34604  	pblendvb	xmm5, xmm4, xmm0
34605  	movdqa	xmm0, xmm1
34606  	pblendvb	xmm6, xmm4, xmm0
34607  	punpcklqdq	xmm5, xmm6              # xmm5 = xmm5[0],xmm6[0]
34608  	movdqu	xmmword ptr [r8 + rax + 16], xmm5
	# 32 elements consumed per iteration
34609  	add	rax, 32
34610  	add	rdi, 2
34611  	jne	.LBB4_919
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34612  	jmp	.LBB4_1578
# Vector path: signed 64-bit source elements at [rcx] -> 8-bit results at [r8].
# Each result is 1 when the qword is > 0 (signed pcmpgtq), 0 when it is 0, and 0xFF (-1) otherwise.
# NOTE(review): looks like the int64 -> int8 case of a sign kernel - confirm against the dispatch code.
34613  .LBB4_922:
	# rsi = r10d rounded down to a multiple of 4; r9 = rsi / 4 = number of 4-element groups
34614  	mov	esi, r10d
34615  	and	esi, -4
34616  	lea	rax, [rsi - 4]
34617  	mov	r9, rax
34618  	shr	r9, 2
34619  	add	r9, 1
	# rax = rsi - 4 is zero when there is exactly one group: bypass the 2x-unrolled loop
34620  	test	rax, rax
34621  	je	.LBB4_1586
34622  # %bb.923:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34623  	mov	rdi, r9
34624  	and	rdi, -2
34625  	neg	rdi
	# rax = element index; xmm2 = zero, xmm3 = all-ones, xmm4 = byte constant 1 in the low two lanes
34626  	xor	eax, eax
34627  	pxor	xmm2, xmm2
34628  	pcmpeqd	xmm3, xmm3
34629  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_18] # xmm4 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
34630  .LBB4_924:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rax .. rax+3, two qwords per vector
34631  	movdqu	xmm5, xmmword ptr [rcx + 8*rax]
34632  	movdqu	xmm6, xmmword ptr [rcx + 8*rax + 16]
	# xmm0 / xmm1 = "x > 0" selectors narrowed qword -> byte
34633  	movdqa	xmm0, xmm5
34634  	pcmpgtq	xmm0, xmm2
34635  	packssdw	xmm0, xmm0
34636  	packssdw	xmm0, xmm0
34637  	packsswb	xmm0, xmm0
34638  	movdqa	xmm1, xmm6
34639  	pcmpgtq	xmm1, xmm2
34640  	packssdw	xmm1, xmm1
34641  	packssdw	xmm1, xmm1
34642  	packsswb	xmm1, xmm1
	# xmm5 / xmm6 = "x != 0" masks narrowed qword -> byte (0xFF or 0)
34643  	pcmpeqq	xmm5, xmm2
34644  	pxor	xmm5, xmm3
34645  	packssdw	xmm5, xmm5
34646  	packssdw	xmm5, xmm5
34647  	packsswb	xmm5, xmm5
34648  	pcmpeqq	xmm6, xmm2
34649  	pxor	xmm6, xmm3
34650  	packssdw	xmm6, xmm6
34651  	packssdw	xmm6, xmm6
34652  	packsswb	xmm6, xmm6
	# where x > 0 (implicit selector xmm0) replace the mask byte with 1
34653  	pblendvb	xmm5, xmm4, xmm0
34654  	movdqa	xmm0, xmm1
34655  	pblendvb	xmm6, xmm4, xmm0
	# each pextrw stores two result bytes
34656  	pextrw	word ptr [r8 + rax], xmm5, 0
34657  	pextrw	word ptr [r8 + rax + 2], xmm6, 0
	# second half: elements rax+4 .. rax+7, same computation
34658  	movdqu	xmm5, xmmword ptr [rcx + 8*rax + 32]
34659  	movdqu	xmm6, xmmword ptr [rcx + 8*rax + 48]
34660  	movdqa	xmm0, xmm5
34661  	pcmpgtq	xmm0, xmm2
34662  	packssdw	xmm0, xmm0
34663  	packssdw	xmm0, xmm0
34664  	packsswb	xmm0, xmm0
34665  	movdqa	xmm1, xmm6
34666  	pcmpgtq	xmm1, xmm2
34667  	packssdw	xmm1, xmm1
34668  	packssdw	xmm1, xmm1
34669  	packsswb	xmm1, xmm1
34670  	pcmpeqq	xmm5, xmm2
34671  	pxor	xmm5, xmm3
34672  	packssdw	xmm5, xmm5
34673  	packssdw	xmm5, xmm5
34674  	packsswb	xmm5, xmm5
34675  	pcmpeqq	xmm6, xmm2
34676  	pxor	xmm6, xmm3
34677  	packssdw	xmm6, xmm6
34678  	packssdw	xmm6, xmm6
34679  	packsswb	xmm6, xmm6
34680  	pblendvb	xmm5, xmm4, xmm0
34681  	movdqa	xmm0, xmm1
34682  	pblendvb	xmm6, xmm4, xmm0
34683  	pextrw	word ptr [r8 + rax + 4], xmm5, 0
34684  	pextrw	word ptr [r8 + rax + 6], xmm6, 0
	# 8 elements consumed per iteration
34685  	add	rax, 8
34686  	add	rdi, 2
34687  	jne	.LBB4_924
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34688  	jmp	.LBB4_1587
# Vector path: 32-bit float source elements at [rcx] -> 8-bit results at [r8].
# Each result is 0 when the float compares equal to 0.0 (cmpeqps); otherwise 1 when the
# sign bit is clear and 0xFF (-1) when the sign bit is set.
# NOTE(review): looks like the float32 -> int8 case of a sign kernel - confirm against the dispatch code.
34689  .LBB4_927:
	# rdx = r10d rounded down to a multiple of 8; r9 = rdx / 8 = number of 8-element groups
34690  	mov	edx, r10d
34691  	and	edx, -8
34692  	lea	rsi, [rdx - 8]
34693  	mov	r9, rsi
34694  	shr	r9, 3
34695  	add	r9, 1
	# rsi = rdx - 8 is zero when there is exactly one group: bypass the 2x-unrolled loop
34696  	test	rsi, rsi
34697  	je	.LBB4_1595
34698  # %bb.928:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34699  	mov	rdi, r9
34700  	and	rdi, -2
34701  	neg	rdi
	# rsi = element index; xmm4 = zero, xmm8 = all-ones (-1 per dword), xmm6 = byte constant 1 in the low four lanes
34702  	xor	esi, esi
34703  	xorps	xmm4, xmm4
34704  	pcmpeqd	xmm8, xmm8
34705  	movdqa	xmm6, xmmword ptr [rip + .LCPI4_12] # xmm6 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u>
34706  .LBB4_929:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rsi .. rsi+7, four floats per vector
34707  	movups	xmm0, xmmword ptr [rcx + 4*rsi]
34708  	movups	xmm1, xmmword ptr [rcx + 4*rsi + 16]
	# xmm2 / xmm3 = "x == 0.0" selectors narrowed to bytes
34709  	movaps	xmm2, xmm0
34710  	cmpeqps	xmm2, xmm4
34711  	packssdw	xmm2, xmm2
34712  	packsswb	xmm2, xmm2
34713  	movaps	xmm3, xmm1
34714  	cmpeqps	xmm3, xmm4
34715  	packssdw	xmm3, xmm3
34716  	packsswb	xmm3, xmm3
	# integer compare of the raw bits against -1: true exactly when the sign bit is clear
34717  	pcmpgtd	xmm0, xmm8
34718  	packssdw	xmm0, xmm0
34719  	packsswb	xmm0, xmm0
34720  	pcmpgtd	xmm1, xmm8
34721  	packssdw	xmm1, xmm1
34722  	packsswb	xmm1, xmm1
	# start from 0xFF (-1); where the sign bit is clear (implicit selector xmm0) take 1
34723  	pcmpeqd	xmm7, xmm7
34724  	pblendvb	xmm7, xmm6, xmm0
34725  	pcmpeqd	xmm5, xmm5
34726  	movdqa	xmm0, xmm1
34727  	pblendvb	xmm5, xmm6, xmm0
	# finally force 0 where x == 0.0
34728  	movdqa	xmm0, xmm2
34729  	pblendvb	xmm7, xmm4, xmm0
34730  	movdqa	xmm0, xmm3
34731  	pblendvb	xmm5, xmm4, xmm0
	# only the low 4 result bytes of each vector are stored
34732  	movd	dword ptr [r8 + rsi], xmm7
34733  	movd	dword ptr [r8 + rsi + 4], xmm5
	# second half: elements rsi+8 .. rsi+15, same computation (result registers xmm5 / xmm7 swap roles)
34734  	movups	xmm0, xmmword ptr [rcx + 4*rsi + 32]
34735  	movups	xmm1, xmmword ptr [rcx + 4*rsi + 48]
34736  	movaps	xmm2, xmm0
34737  	cmpeqps	xmm2, xmm4
34738  	packssdw	xmm2, xmm2
34739  	packsswb	xmm2, xmm2
34740  	movaps	xmm3, xmm1
34741  	cmpeqps	xmm3, xmm4
34742  	packssdw	xmm3, xmm3
34743  	packsswb	xmm3, xmm3
34744  	pcmpgtd	xmm0, xmm8
34745  	packssdw	xmm0, xmm0
34746  	packsswb	xmm0, xmm0
34747  	pcmpgtd	xmm1, xmm8
34748  	packssdw	xmm1, xmm1
34749  	pcmpeqd	xmm5, xmm5
34750  	pblendvb	xmm5, xmm6, xmm0
34751  	packsswb	xmm1, xmm1
34752  	pcmpeqd	xmm7, xmm7
34753  	movdqa	xmm0, xmm1
34754  	pblendvb	xmm7, xmm6, xmm0
34755  	movdqa	xmm0, xmm2
34756  	pblendvb	xmm5, xmm4, xmm0
34757  	movdqa	xmm0, xmm3
34758  	pblendvb	xmm7, xmm4, xmm0
34759  	movd	dword ptr [r8 + rsi + 8], xmm5
34760  	movd	dword ptr [r8 + rsi + 12], xmm7
	# 16 elements consumed per iteration
34761  	add	rsi, 16
34762  	add	rdi, 2
34763  	jne	.LBB4_929
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34764  	jmp	.LBB4_1596
# Vector path: 8-bit source elements at [rcx] -> 8-bit results at [r8].
# Each result is 1 when the source byte is non-zero, otherwise 0.
# NOTE(review): presumably the unsigned-input case of a sign-style kernel - confirm against the dispatch code.
34765  .LBB4_932:
	# rdx = eax rounded down to a multiple of 32; r9 = rdx / 32 = number of 32-element groups
34766  	mov	edx, eax
34767  	and	edx, -32
34768  	lea	rsi, [rdx - 32]
34769  	mov	r9, rsi
34770  	shr	r9, 5
34771  	add	r9, 1
	# rsi = rdx - 32 is zero when there is exactly one group: bypass the 2x-unrolled loop
34772  	test	rsi, rsi
34773  	je	.LBB4_1604
34774  # %bb.933:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34775  	mov	rdi, r9
34776  	and	rdi, -2
34777  	neg	rdi
	# rsi = element index; xmm0 = zero, xmm1 = byte constant 1 in every lane
34778  	xor	esi, esi
34779  	pxor	xmm0, xmm0
34780  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_22] # xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
34781  .LBB4_934:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rsi .. rsi+31
34782  	movdqu	xmm2, xmmword ptr [rcx + rsi]
34783  	movdqu	xmm3, xmmword ptr [rcx + rsi + 16]
	# pcmpeqb gives the "== 0" mask; pandn computes (~mask) & 1, i.e. 1 for non-zero bytes
34784  	pcmpeqb	xmm2, xmm0
34785  	pandn	xmm2, xmm1
34786  	pcmpeqb	xmm3, xmm0
34787  	pandn	xmm3, xmm1
34788  	movdqu	xmmword ptr [r8 + rsi], xmm2
34789  	movdqu	xmmword ptr [r8 + rsi + 16], xmm3
	# second half: elements rsi+32 .. rsi+63, same computation
34790  	movdqu	xmm2, xmmword ptr [rcx + rsi + 32]
34791  	movdqu	xmm3, xmmword ptr [rcx + rsi + 48]
34792  	pcmpeqb	xmm2, xmm0
34793  	pandn	xmm2, xmm1
34794  	pcmpeqb	xmm3, xmm0
34795  	pandn	xmm3, xmm1
34796  	movdqu	xmmword ptr [r8 + rsi + 32], xmm2
34797  	movdqu	xmmword ptr [r8 + rsi + 48], xmm3
	# 64 elements consumed per iteration
34798  	add	rsi, 64
34799  	add	rdi, 2
34800  	jne	.LBB4_934
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34801  	jmp	.LBB4_1605
# Vector path: signed 32-bit source elements at [rcx] -> 8-bit results at [r8].
# Each result is 1 when the dword is > 0 (signed pcmpgtd), 0 when it is 0, and 0xFF (-1) otherwise.
# NOTE(review): looks like the int32 -> int8 case of a sign kernel - confirm against the dispatch code.
34802  .LBB4_937:
	# rsi = r10d rounded down to a multiple of 8; r9 = rsi / 8 = number of 8-element groups
34803  	mov	esi, r10d
34804  	and	esi, -8
34805  	lea	rax, [rsi - 8]
34806  	mov	r9, rax
34807  	shr	r9, 3
34808  	add	r9, 1
	# rax = rsi - 8 is zero when there is exactly one group: bypass the 2x-unrolled loop
34809  	test	rax, rax
34810  	je	.LBB4_1612
34811  # %bb.938:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34812  	mov	rdi, r9
34813  	and	rdi, -2
34814  	neg	rdi
	# rax = element index; xmm2 = zero, xmm3 = all-ones, xmm4 = byte constant 1 in the low four lanes
34815  	xor	eax, eax
34816  	pxor	xmm2, xmm2
34817  	pcmpeqd	xmm3, xmm3
34818  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_12] # xmm4 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u>
34819  .LBB4_939:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rax .. rax+7, four dwords per vector
34820  	movdqu	xmm5, xmmword ptr [rcx + 4*rax]
34821  	movdqu	xmm6, xmmword ptr [rcx + 4*rax + 16]
	# xmm0 / xmm1 = "x > 0" selectors narrowed to bytes
34822  	movdqa	xmm0, xmm5
34823  	pcmpgtd	xmm0, xmm2
34824  	packssdw	xmm0, xmm0
34825  	packsswb	xmm0, xmm0
34826  	movdqa	xmm1, xmm6
34827  	pcmpgtd	xmm1, xmm2
34828  	packssdw	xmm1, xmm1
34829  	packsswb	xmm1, xmm1
	# xmm5 / xmm6 = "x != 0" masks narrowed to bytes (0xFF or 0)
34830  	pcmpeqd	xmm5, xmm2
34831  	pxor	xmm5, xmm3
34832  	packssdw	xmm5, xmm5
34833  	packsswb	xmm5, xmm5
34834  	pcmpeqd	xmm6, xmm2
34835  	pxor	xmm6, xmm3
34836  	packssdw	xmm6, xmm6
34837  	packsswb	xmm6, xmm6
	# where x > 0 (implicit selector xmm0) replace the mask byte with 1
34838  	pblendvb	xmm5, xmm4, xmm0
34839  	movdqa	xmm0, xmm1
34840  	pblendvb	xmm6, xmm4, xmm0
	# only the low 4 result bytes of each vector are stored
34841  	movd	dword ptr [r8 + rax], xmm5
34842  	movd	dword ptr [r8 + rax + 4], xmm6
	# second half: elements rax+8 .. rax+15, same computation
34843  	movdqu	xmm5, xmmword ptr [rcx + 4*rax + 32]
34844  	movdqu	xmm6, xmmword ptr [rcx + 4*rax + 48]
34845  	movdqa	xmm0, xmm5
34846  	pcmpgtd	xmm0, xmm2
34847  	packssdw	xmm0, xmm0
34848  	packsswb	xmm0, xmm0
34849  	movdqa	xmm1, xmm6
34850  	pcmpgtd	xmm1, xmm2
34851  	packssdw	xmm1, xmm1
34852  	packsswb	xmm1, xmm1
34853  	pcmpeqd	xmm5, xmm2
34854  	pxor	xmm5, xmm3
34855  	packssdw	xmm5, xmm5
34856  	packsswb	xmm5, xmm5
34857  	pcmpeqd	xmm6, xmm2
34858  	pxor	xmm6, xmm3
34859  	packssdw	xmm6, xmm6
34860  	packsswb	xmm6, xmm6
34861  	pblendvb	xmm5, xmm4, xmm0
34862  	movdqa	xmm0, xmm1
34863  	pblendvb	xmm6, xmm4, xmm0
34864  	movd	dword ptr [r8 + rax + 8], xmm5
34865  	movd	dword ptr [r8 + rax + 12], xmm6
	# 16 elements consumed per iteration
34866  	add	rax, 16
34867  	add	rdi, 2
34868  	jne	.LBB4_939
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34869  	jmp	.LBB4_1613
# Vector path: 32-bit integer source elements at [rcx] -> 32-bit results at [r8].
# Each result is 1 when the source dword is non-zero, otherwise 0.
# NOTE(review): presumably the unsigned-input case of a sign-style kernel - confirm against the dispatch code.
34870  .LBB4_942:
	# rdx = r10d rounded down to a multiple of 8; r9 = rdx / 8 = number of 8-element groups
34871  	mov	edx, r10d
34872  	and	edx, -8
34873  	lea	rsi, [rdx - 8]
34874  	mov	r9, rsi
34875  	shr	r9, 3
34876  	add	r9, 1
	# rsi = rdx - 8 is zero when there is exactly one group: bypass the 2x-unrolled loop
34877  	test	rsi, rsi
34878  	je	.LBB4_1621
34879  # %bb.943:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34880  	mov	rdi, r9
34881  	and	rdi, -2
34882  	neg	rdi
	# rsi = element index; xmm0 = zero, xmm1 = dword constant 1 in every lane
34883  	xor	esi, esi
34884  	pxor	xmm0, xmm0
34885  	movdqa	xmm1, xmmword ptr [rip + .LCPI4_8] # xmm1 = [1,1,1,1]
34886  .LBB4_944:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rsi .. rsi+7
34887  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi]
34888  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 16]
	# pcmpeqd gives the "== 0" mask; pandn computes (~mask) & 1, i.e. 1 for non-zero dwords
34889  	pcmpeqd	xmm2, xmm0
34890  	pandn	xmm2, xmm1
34891  	pcmpeqd	xmm3, xmm0
34892  	pandn	xmm3, xmm1
34893  	movdqu	xmmword ptr [r8 + 4*rsi], xmm2
34894  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm3
	# second half: elements rsi+8 .. rsi+15, same computation
34895  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi + 32]
34896  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 48]
34897  	pcmpeqd	xmm2, xmm0
34898  	pandn	xmm2, xmm1
34899  	pcmpeqd	xmm3, xmm0
34900  	pandn	xmm3, xmm1
34901  	movdqu	xmmword ptr [r8 + 4*rsi + 32], xmm2
34902  	movdqu	xmmword ptr [r8 + 4*rsi + 48], xmm3
	# 16 elements consumed per iteration
34903  	add	rsi, 16
34904  	add	rdi, 2
34905  	jne	.LBB4_944
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34906  	jmp	.LBB4_1622
# Vector path: signed 8-bit source elements at [rcx] -> 32-bit results at [r8].
# Each result is 1 when the byte is > 0 (signed pcmpgtb), 0 when it is 0, and 0xFFFFFFFF (-1) otherwise.
# NOTE(review): looks like the int8 -> 32-bit integer case of a sign kernel - confirm against the dispatch code.
34907  .LBB4_950:
	# rdx = r10d rounded down to a multiple of 8; r9 = rdx / 8 = number of 8-element groups
34908  	mov	edx, r10d
34909  	and	edx, -8
34910  	lea	rsi, [rdx - 8]
34911  	mov	r9, rsi
34912  	shr	r9, 3
34913  	add	r9, 1
	# rsi = rdx - 8 is zero when there is exactly one group: bypass the 2x-unrolled loop
34914  	test	rsi, rsi
34915  	je	.LBB4_1629
34916  # %bb.951:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34917  	mov	rdi, r9
34918  	and	rdi, -2
34919  	neg	rdi
	# rsi = element index; xmm2 = zero, xmm3 = all-ones, xmm4 = dword constant 1 in every lane
34920  	xor	esi, esi
34921  	pxor	xmm2, xmm2
34922  	pcmpeqd	xmm3, xmm3
34923  	movaps	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
34924  .LBB4_952:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rsi .. rsi+7, four bytes per vector
34925  	movd	xmm5, dword ptr [rcx + rsi]     # xmm5 = mem[0],zero,zero,zero
34926  	movd	xmm6, dword ptr [rcx + rsi + 4] # xmm6 = mem[0],zero,zero,zero
	# xmm0 / xmm1 = "x > 0" masks sign-extended to dwords (blend selectors)
34927  	movdqa	xmm0, xmm5
34928  	pcmpgtb	xmm0, xmm2
34929  	pmovsxbd	xmm0, xmm0
34930  	movdqa	xmm1, xmm6
34931  	pcmpgtb	xmm1, xmm2
34932  	pmovsxbd	xmm1, xmm1
	# xmm5 / xmm6 = "x != 0" masks sign-extended to dwords (-1 or 0)
34933  	pcmpeqb	xmm5, xmm2
34934  	pxor	xmm5, xmm3
34935  	pmovsxbd	xmm5, xmm5
34936  	pcmpeqb	xmm6, xmm2
34937  	pxor	xmm6, xmm3
34938  	pmovsxbd	xmm6, xmm6
	# where x > 0 (implicit selector xmm0) replace the mask dword with 1
34939  	blendvps	xmm5, xmm4, xmm0
34940  	movdqa	xmm0, xmm1
34941  	blendvps	xmm6, xmm4, xmm0
34942  	movups	xmmword ptr [r8 + 4*rsi], xmm5
34943  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm6
	# second half: elements rsi+8 .. rsi+15, same computation
34944  	movd	xmm5, dword ptr [rcx + rsi + 8] # xmm5 = mem[0],zero,zero,zero
34945  	movd	xmm6, dword ptr [rcx + rsi + 12] # xmm6 = mem[0],zero,zero,zero
34946  	movdqa	xmm0, xmm5
34947  	pcmpgtb	xmm0, xmm2
34948  	pmovsxbd	xmm0, xmm0
34949  	movdqa	xmm1, xmm6
34950  	pcmpgtb	xmm1, xmm2
34951  	pmovsxbd	xmm1, xmm1
34952  	pcmpeqb	xmm5, xmm2
34953  	pxor	xmm5, xmm3
34954  	pmovsxbd	xmm5, xmm5
34955  	pcmpeqb	xmm6, xmm2
34956  	pxor	xmm6, xmm3
34957  	pmovsxbd	xmm6, xmm6
34958  	blendvps	xmm5, xmm4, xmm0
34959  	movdqa	xmm0, xmm1
34960  	blendvps	xmm6, xmm4, xmm0
34961  	movups	xmmword ptr [r8 + 4*rsi + 32], xmm5
34962  	movups	xmmword ptr [r8 + 4*rsi + 48], xmm6
	# 16 elements consumed per iteration
34963  	add	rsi, 16
34964  	add	rdi, 2
34965  	jne	.LBB4_952
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
34966  	jmp	.LBB4_1630
# Vector path: 8-bit source elements at [rcx] -> 32-bit integer results at [r8].
# Each result is 1 when the source byte is non-zero, otherwise 0.
# NOTE(review): presumably the unsigned-input case of a sign-style kernel - confirm against the dispatch code.
34967  .LBB4_974:
	# rdx = r10d rounded down to a multiple of 8; r9 = rdx / 8 = number of 8-element groups
34968  	mov	edx, r10d
34969  	and	edx, -8
34970  	lea	rsi, [rdx - 8]
34971  	mov	r9, rsi
34972  	shr	r9, 3
34973  	add	r9, 1
	# rsi = rdx - 8 is zero when there is exactly one group: bypass the 2x-unrolled loop
34974  	test	rsi, rsi
34975  	je	.LBB4_1638
34976  # %bb.975:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
34977  	mov	rdi, r9
34978  	and	rdi, -2
34979  	neg	rdi
	# rsi = element index; xmm0 = zero, xmm1 = all-ones, xmm2 = dword constant 1 in every lane
34980  	xor	esi, esi
34981  	pxor	xmm0, xmm0
34982  	pcmpeqd	xmm1, xmm1
34983  	movdqa	xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1]
34984  .LBB4_976:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rsi .. rsi+7, four bytes per vector
34985  	movd	xmm3, dword ptr [rcx + rsi]     # xmm3 = mem[0],zero,zero,zero
34986  	movd	xmm4, dword ptr [rcx + rsi + 4] # xmm4 = mem[0],zero,zero,zero
	# "byte != 0" mask -> zero-extend bytes to dwords -> keep bit 0
34987  	pcmpeqb	xmm3, xmm0
34988  	pxor	xmm3, xmm1
34989  	pmovzxbd	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
34990  	pand	xmm3, xmm2
34991  	pcmpeqb	xmm4, xmm0
34992  	pxor	xmm4, xmm1
34993  	pmovzxbd	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
34994  	pand	xmm4, xmm2
34995  	movdqu	xmmword ptr [r8 + 4*rsi], xmm3
34996  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm4
	# second half: elements rsi+8 .. rsi+15, same computation
34997  	movd	xmm3, dword ptr [rcx + rsi + 8] # xmm3 = mem[0],zero,zero,zero
34998  	movd	xmm4, dword ptr [rcx + rsi + 12] # xmm4 = mem[0],zero,zero,zero
34999  	pcmpeqb	xmm3, xmm0
35000  	pxor	xmm3, xmm1
35001  	pmovzxbd	xmm3, xmm3                      # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
35002  	pand	xmm3, xmm2
35003  	pcmpeqb	xmm4, xmm0
35004  	pxor	xmm4, xmm1
35005  	pmovzxbd	xmm4, xmm4                      # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
35006  	pand	xmm4, xmm2
35007  	movdqu	xmmword ptr [r8 + 4*rsi + 32], xmm3
35008  	movdqu	xmmword ptr [r8 + 4*rsi + 48], xmm4
	# 16 elements consumed per iteration
35009  	add	rsi, 16
35010  	add	rdi, 2
35011  	jne	.LBB4_976
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
35012  	jmp	.LBB4_1639
# Vector path: signed 32-bit source elements at [rcx] -> 32-bit results at [r8].
# Each result is 1 when the dword is > 0, 0 when it is 0, and 0xFFFFFFFF (-1) when it is < 0
# (signed compare via pcmpgtd).
# NOTE(review): looks like the int32 case of a sign kernel - confirm against the dispatch code.
35013  .LBB4_979:
	# rdx = r11d rounded down to a multiple of 8; r9 = rdx / 8 = number of 8-element groups
35014  	mov	edx, r11d
35015  	and	edx, -8
35016  	lea	rsi, [rdx - 8]
35017  	mov	r9, rsi
35018  	shr	r9, 3
35019  	add	r9, 1
	# rsi = rdx - 8 is zero when there is exactly one group: bypass the 2x-unrolled loop
35020  	test	rsi, rsi
35021  	je	.LBB4_1646
35022  # %bb.980:
	# rdi = -(r9 & -2): stepped by +2 per unrolled iteration until it reaches zero
35023  	mov	rdi, r9
35024  	and	rdi, -2
35025  	neg	rdi
	# rsi = element index; xmm2 = zero, xmm3 = all-ones, xmm4 = dword constant 1 in every lane
35026  	xor	esi, esi
35027  	pxor	xmm2, xmm2
35028  	pcmpeqd	xmm3, xmm3
35029  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
35030  .LBB4_981:                              # =>This Inner Loop Header: Depth=1
	# first half: elements rsi .. rsi+7, four dwords per vector
35031  	movdqu	xmm5, xmmword ptr [rcx + 4*rsi]
35032  	movdqu	xmm6, xmmword ptr [rcx + 4*rsi + 16]
	# xmm0 / xmm1 = "1 > x" (x <= 0) selectors; xmm5 / xmm6 become "x != 0" masks (-1 or 0)
35033  	movdqa	xmm0, xmm4
35034  	pcmpgtd	xmm0, xmm5
35035  	pcmpeqd	xmm5, xmm2
35036  	pxor	xmm5, xmm3
35037  	movdqa	xmm1, xmm4
35038  	pcmpgtd	xmm1, xmm6
35039  	pcmpeqd	xmm6, xmm2
35040  	pxor	xmm6, xmm3
	# start from 1 everywhere; where x <= 0 (implicit selector xmm0) take the "x != 0" mask instead
35041  	movdqa	xmm7, xmm4
35042  	blendvps	xmm7, xmm5, xmm0
35043  	movdqa	xmm5, xmm4
35044  	movdqa	xmm0, xmm1
35045  	blendvps	xmm5, xmm6, xmm0
35046  	movups	xmmword ptr [r8 + 4*rsi], xmm7
35047  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm5
	# second half: elements rsi+8 .. rsi+15, same computation
35048  	movdqu	xmm5, xmmword ptr [rcx + 4*rsi + 32]
35049  	movdqu	xmm6, xmmword ptr [rcx + 4*rsi + 48]
35050  	movdqa	xmm0, xmm4
35051  	pcmpgtd	xmm0, xmm5
35052  	pcmpeqd	xmm5, xmm2
35053  	pxor	xmm5, xmm3
35054  	movdqa	xmm1, xmm4
35055  	pcmpgtd	xmm1, xmm6
35056  	pcmpeqd	xmm6, xmm2
35057  	pxor	xmm6, xmm3
35058  	movdqa	xmm7, xmm4
35059  	blendvps	xmm7, xmm5, xmm0
35060  	movdqa	xmm5, xmm4
35061  	movdqa	xmm0, xmm1
35062  	blendvps	xmm5, xmm6, xmm0
35063  	movups	xmmword ptr [r8 + 4*rsi + 32], xmm7
35064  	movups	xmmword ptr [r8 + 4*rsi + 48], xmm5
	# 16 elements consumed per iteration
35065  	add	rsi, 16
35066  	add	rdi, 2
35067  	jne	.LBB4_981
	# continuation is outside this excerpt; presumably handles an odd leftover group and the scalar tail
35068  	jmp	.LBB4_1647
# Scalar path: signed 8-bit source elements at [rcx] -> 32-bit results at [r8].
# For each byte the stored dword is the value loaded from .LCPI4_5 when the byte is > 0,
# zero when the byte is 0, and the value loaded from .LCPI4_14 when the byte is < 0.
# NOTE(review): both constants are defined outside this excerpt; presumably +1.0f / -1.0f
# (int8 -> float32 sign) - confirm.
# .LBB4_1475 / .LBB4_1476 / .LBB4_1478 finish one element whose comparison was started by a
# predecessor outside this excerpt (the jle below consumes flags set there).
35069  .LBB4_1475:
35070  	movd	xmm0, dword ptr [rip + .LCPI4_14] # xmm0 = mem[0],zero,zero,zero
35071  .LBB4_1476:
35072  	jle	.LBB4_1478
35073  # %bb.1477:
35074  	movd	xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero
35075  .LBB4_1478:
35076  	movd	dword ptr [r8 + 4*rdx], xmm0
	# set bit 0 of the index; presumably rdx was even here, making this "advance by one" - confirm
35077  	or	rdx, 1
35078  .LBB4_1479:
	# rsi is set outside this excerpt; a zero sum means no elements remain
35079  	add	rsi, rax
35080  	je	.LBB4_1655
35081  # %bb.1480:
	# loop constants: xmm0 = value for negative bytes, xmm1 = value for positive bytes
35082  	movd	xmm0, dword ptr [rip + .LCPI4_14] # xmm0 = mem[0],zero,zero,zero
35083  	movd	xmm1, dword ptr [rip + .LCPI4_5] # xmm1 = mem[0],zero,zero,zero
35084  	jmp	.LBB4_1482
35085  .LBB4_1481:                             #   in Loop: Header=BB4_1482 Depth=1
	# store the result for element rdx+1, then advance by two; stop when rdx reaches rax
35086  	movd	dword ptr [r8 + 4*rdx + 4], xmm3
35087  	add	rdx, 2
35088  	cmp	rax, rdx
35089  	je	.LBB4_1655
35090  .LBB4_1482:                             # =>This Inner Loop Header: Depth=1
	# element rdx: the movdqa / pxor instructions below leave the cmp flags intact for the later jle / jg
35091  	cmp	byte ptr [rcx + rdx], 0
35092  	movdqa	xmm2, xmm0
35093  	jne	.LBB4_1483
35094  # %bb.1486:                             #   in Loop: Header=BB4_1482 Depth=1
	# byte == 0: xmm2 = 0 becomes the stored value via .LBB4_1487 (jle is taken on equality)
35095  	pxor	xmm2, xmm2
35096  	movdqa	xmm3, xmm1
35097  	jle	.LBB4_1487
35098  .LBB4_1484:                             #   in Loop: Header=BB4_1482 Depth=1
	# byte > 0: xmm3 holds the positive constant; store it and start on element rdx+1
35099  	movd	dword ptr [r8 + 4*rdx], xmm3
35100  	cmp	byte ptr [rcx + rdx + 1], 0
35101  	movdqa	xmm2, xmm0
35102  	jne	.LBB4_1485
35103  .LBB4_1488:                             #   in Loop: Header=BB4_1482 Depth=1
	# element rdx+1 is zero: falls through to .LBB4_1489, which selects xmm2 = 0
35104  	pxor	xmm2, xmm2
35105  	movdqa	xmm3, xmm1
35106  	jg	.LBB4_1481
35107  	jmp	.LBB4_1489
35108  .LBB4_1483:                             #   in Loop: Header=BB4_1482 Depth=1
	# element rdx is non-zero: positive -> .LBB4_1484 with xmm3 = xmm1, otherwise fall into .LBB4_1487
35109  	movdqa	xmm3, xmm1
35110  	jg	.LBB4_1484
35111  .LBB4_1487:                             #   in Loop: Header=BB4_1482 Depth=1
	# element rdx is <= 0: store xmm2 (negative constant, or 0 for a zero byte), then test element rdx+1
35112  	movdqa	xmm3, xmm2
35113  	movd	dword ptr [r8 + 4*rdx], xmm3
35114  	cmp	byte ptr [rcx + rdx + 1], 0
35115  	movdqa	xmm2, xmm0
35116  	je	.LBB4_1488
35117  .LBB4_1485:                             #   in Loop: Header=BB4_1482 Depth=1
	# element rdx+1 is non-zero: positive keeps xmm3 = xmm1, otherwise .LBB4_1489 selects xmm2
35118  	movdqa	xmm3, xmm1
35119  	jg	.LBB4_1481
35120  .LBB4_1489:                             #   in Loop: Header=BB4_1482 Depth=1
35121  	movdqa	xmm3, xmm2
35122  	jmp	.LBB4_1481
# Remainder handling for a vector loop located outside this excerpt:
# 64-bit integer source elements at [rcx] -> 32-bit results at [r8],
# each result being 1 when the source qword is non-zero, otherwise 0.
# .LBB4_994 enters with element index rsi = 0; .LBB4_995 is entered with rsi already set.
35123  .LBB4_994:
35124  	xor	esi, esi
35125  .LBB4_995:
	# bit 0 of r9 set -> one 4-element vector group is still pending
35126  	test	r9b, 1
35127  	je	.LBB4_997
35128  # %bb.996:
35129  	movdqu	xmm0, xmmword ptr [rcx + 8*rsi]
35130  	movdqu	xmm1, xmmword ptr [rcx + 8*rsi + 16]
35131  	pxor	xmm2, xmm2
	# pcmpeqq gives the "== 0" mask per qword; pshufd moves the low dword of each qword into lanes 0..1
35132  	pcmpeqq	xmm0, xmm2
35133  	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
35134  	movdqa	xmm3, xmmword ptr [rip + .LCPI4_16] # xmm3 = <1,1,u,u>
	# pandn computes (~mask) & 1, i.e. 1 for non-zero inputs
35135  	pandn	xmm0, xmm3
35136  	pcmpeqq	xmm1, xmm2
35137  	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
35138  	pandn	xmm1, xmm3
	# combine the two pairs of results into four dwords and store them
35139  	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
35140  	movdqu	xmmword ptr [r8 + 4*rsi], xmm0
35141  .LBB4_997:
	# rdx == rax: nothing left for the scalar loop
35142  	cmp	rdx, rax
35143  	je	.LBB4_1655
35144  .LBB4_998:                              # =>This Inner Loop Header: Depth=1
	# scalar loop, one element per iteration for indices rdx .. rax-1
35145  	xor	esi, esi
35146  	cmp	qword ptr [rcx + 8*rdx], 0
35147  	setne	sil
35148  	mov	dword ptr [r8 + 4*rdx], esi
35149  	add	rdx, 1
35150  	cmp	rax, rdx
35151  	jne	.LBB4_998
35152  	jmp	.LBB4_1655
# Remainder handling for a vector loop located outside this excerpt:
# 32-bit integer source elements at [rcx] -> 64-bit results at [r8],
# each result being 1 when the source dword is non-zero, otherwise 0.
# .LBB4_999 enters with element index rsi = 0; .LBB4_1000 is entered with rsi already set.
35153  .LBB4_999:
35154  	xor	esi, esi
35155  .LBB4_1000:
	# bit 0 of r9 set -> one 4-element vector group is still pending
35156  	test	r9b, 1
35157  	je	.LBB4_1002
35158  # %bb.1001:
	# two dwords are loaded into each register
35159  	movq	xmm0, qword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero
35160  	movq	xmm1, qword ptr [rcx + 4*rsi + 8] # xmm1 = mem[0],zero
35161  	pxor	xmm2, xmm2
	# pcmpeqd + pxor with all-ones builds the "dword != 0" mask; pmovzxdq widens; pand keeps bit 0
35162  	pcmpeqd	xmm0, xmm2
35163  	pcmpeqd	xmm3, xmm3
35164  	pxor	xmm0, xmm3
35165  	pmovzxdq	xmm0, xmm0                      # xmm0 = xmm0[0],zero,xmm0[1],zero
35166  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
35167  	pand	xmm0, xmm4
35168  	pcmpeqd	xmm1, xmm2
35169  	pxor	xmm1, xmm3
35170  	pmovzxdq	xmm1, xmm1                      # xmm1 = xmm1[0],zero,xmm1[1],zero
35171  	pand	xmm1, xmm4
35172  	movdqu	xmmword ptr [r8 + 8*rsi], xmm0
35173  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm1
35174  .LBB4_1002:
	# rdx == rax: nothing left for the scalar loop
35175  	cmp	rdx, rax
35176  	je	.LBB4_1655
35177  .LBB4_1003:                             # =>This Inner Loop Header: Depth=1
	# scalar loop, one element per iteration for indices rdx .. rax-1
35178  	xor	esi, esi
35179  	cmp	dword ptr [rcx + 4*rdx], 0
35180  	setne	sil
35181  	mov	qword ptr [r8 + 8*rdx], rsi
35182  	add	rdx, 1
35183  	cmp	rax, rdx
35184  	jne	.LBB4_1003
35185  	jmp	.LBB4_1655
 35186  .LBB4_1004:                             # tail: double source -> 64-bit dest; stores copysign(1.0, x) converted to an integer, or 0 when x compares equal to 0
 35187  	xor	edi, edi                        # rdi = element index used by the vector step below
 35188  .LBB4_1005:
 35189  	test	r9b, 1                          # bit 0 of r9 gates one 2-element vector step (r9 is set outside this excerpt)
 35190  	je	.LBB4_1007
 35191  # %bb.1006:
 35192  	movupd	xmm0, xmmword ptr [rcx + 8*rdi]
 35193  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 35194  	andpd	xmm1, xmm0                      # keep only the sign bit of each input
 35195  	orpd	xmm1, xmmword ptr [rip + .LCPI4_1] # merge the sign into the constant (annotated as [1.0,1.0] where it is loaded elsewhere in this file)
 35196  	movsd	xmm2, qword ptr [rip + .LCPI4_6] # xmm2 = mem[0],zero
 35197  	movapd	xmm3, xmm1
 35198  	subsd	xmm3, xmm2                      # NOTE(review): .LCPI4_6 looks like the 2^63 threshold of an unsigned conversion - confirm in the constant pool
 35199  	cvttsd2si	rax, xmm3
 35200  	xor	rax, r11                        # r11 is set outside this excerpt; presumably 0x8000000000000000 - confirm
 35201  	cvttsd2si	rdx, xmm1
 35202  	ucomisd	xmm1, xmm2
 35203  	cmovae	rdx, rax                        # use the biased conversion when value >= threshold
 35204  	movq	xmm3, rdx                       # lane 0 result
 35205  	pshufd	xmm1, xmm1, 238                 # xmm1 = xmm1[2,3,2,3]
 35206  	movdqa	xmm4, xmm1
 35207  	subsd	xmm4, xmm2
 35208  	cvttsd2si	rax, xmm4
 35209  	xor	rax, r11
 35210  	cvttsd2si	rdx, xmm1
 35211  	ucomisd	xmm1, xmm2
 35212  	xorpd	xmm1, xmm1                      # SSE xor leaves EFLAGS intact for the cmovae below
 35213  	cmovae	rdx, rax
 35214  	movq	xmm2, rdx                       # lane 1 result
 35215  	punpcklqdq	xmm3, xmm2              # xmm3 = xmm3[0],xmm2[0]
 35216  	cmpneqpd	xmm1, xmm0              # all-ones in lanes where the input != 0.0
 35217  	andpd	xmm1, xmm3                      # force the result to 0 for zero inputs
 35218  	movupd	xmmword ptr [r8 + 8*rdi], xmm1
 35219  .LBB4_1007:
 35220  	cmp	rsi, r10                        # rsi = next index for the scalar loop, r10 = end index
 35221  	je	.LBB4_1655
 35222  .LBB4_1008:
 35223  	movapd	xmm0, xmmword ptr [rip + .LCPI4_0] # xmm0 = [-0.0E+0,-0.0E+0]
 35224  	movsd	xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero
 35225  	movsd	xmm2, qword ptr [rip + .LCPI4_6] # xmm2 = mem[0],zero
 35226  	xor	eax, eax                        # rax = 0, the value stored for zero inputs
 35227  	xorpd	xmm3, xmm3                      # xmm3 = 0.0 for the zero comparison
 35228  .LBB4_1009:                             # =>This Inner Loop Header: Depth=1
 35229  	movsd	xmm4, qword ptr [rcx + 8*rsi]   # xmm4 = mem[0],zero
 35230  	movapd	xmm5, xmm4
 35231  	andpd	xmm5, xmm0                      # sign bit of the input
 35232  	orpd	xmm5, xmm1                      # presumably .LCPI4_2 is scalar 1.0, giving +/-1.0 - confirm
 35233  	movapd	xmm6, xmm5
 35234  	subsd	xmm6, xmm2
 35235  	cvttsd2si	rdx, xmm6
 35236  	xor	rdx, r11
 35237  	cvttsd2si	rdi, xmm5
 35238  	ucomisd	xmm5, xmm2
 35239  	cmovae	rdi, rdx                        # same threshold-based selection as in the vector step
 35240  	ucomisd	xmm3, xmm4
 35241  	cmove	rdi, rax                        # store 0 when the compare sets ZF (equal; ucomisd also sets ZF for unordered operands)
 35242  	mov	qword ptr [r8 + 8*rsi], rdi
 35243  	add	rsi, 1
 35244  	cmp	r10, rsi
 35245  	jne	.LBB4_1009
 35246  	jmp	.LBB4_1655
 35247  .LBB4_1010:                             # tail: 16-bit source -> 64-bit dest, dest[i] = (src[i] != 0)
 35248  	xor	esi, esi                        # rsi = element index used by the vector step below
 35249  .LBB4_1011:
 35250  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 35251  	je	.LBB4_1013
 35252  # %bb.1012:
 35253  	movd	xmm0, dword ptr [rcx + 2*rsi]   # xmm0 = mem[0],zero,zero,zero
 35254  	movd	xmm1, dword ptr [rcx + 2*rsi + 4] # xmm1 = mem[0],zero,zero,zero
 35255  	pxor	xmm2, xmm2
 35256  	pcmpeqw	xmm0, xmm2                      # all-ones where the word equals zero
 35257  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones, used to invert the masks
 35258  	pxor	xmm0, xmm3                      # now all-ones where the word is nonzero
 35259  	pmovzxwq	xmm0, xmm0                      # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 35260  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 35261  	pand	xmm0, xmm4                      # reduce each mask to 0 or 1
 35262  	pcmpeqw	xmm1, xmm2
 35263  	pxor	xmm1, xmm3
 35264  	pmovzxwq	xmm1, xmm1                      # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 35265  	pand	xmm1, xmm4
 35266  	movdqu	xmmword ptr [r8 + 8*rsi], xmm0
 35267  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm1
 35268  .LBB4_1013:
 35269  	cmp	rdx, rax                        # rdx = next index for the scalar loop, rax = end index
 35270  	je	.LBB4_1655
 35271  .LBB4_1014:                             # =>This Inner Loop Header: Depth=1
 35272  	xor	esi, esi
 35273  	cmp	word ptr [rcx + 2*rdx], 0
 35274  	setne	sil                             # rsi = (src[rdx] != 0)
 35275  	mov	qword ptr [r8 + 8*rdx], rsi
 35276  	add	rdx, 1
 35277  	cmp	rax, rdx
 35278  	jne	.LBB4_1014
 35279  	jmp	.LBB4_1655
 35280  .LBB4_1015:                             # tail: signed 16-bit source -> 64-bit dest, dest[i] = 1 / 0 / -1 for positive / zero / negative
 35281  	xor	esi, esi                        # rsi = element index used by the vector step below
 35282  .LBB4_1016:
 35283  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 35284  	je	.LBB4_1018
 35285  # %bb.1017:
 35286  	movd	xmm2, dword ptr [rcx + 2*rsi]   # xmm2 = mem[0],zero,zero,zero
 35287  	movd	xmm3, dword ptr [rcx + 2*rsi + 4] # xmm3 = mem[0],zero,zero,zero
 35288  	xorpd	xmm4, xmm4
 35289  	movdqa	xmm0, xmm2
 35290  	pcmpgtw	xmm0, xmm4                      # signed compare: all-ones where the word is > 0
 35291  	pmovsxwq	xmm0, xmm0              # widen the "> 0" mask to 64-bit lanes
 35292  	movdqa	xmm1, xmm3
 35293  	pcmpgtw	xmm1, xmm4
 35294  	pmovsxwq	xmm1, xmm1
 35295  	pcmpeqw	xmm2, xmm4
 35296  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones, used to invert the masks
 35297  	pxor	xmm2, xmm5                      # all-ones (-1) where nonzero, 0 where zero
 35298  	pmovsxwq	xmm2, xmm2              # sign-extend so nonzero lanes hold -1 as a qword
 35299  	pcmpeqw	xmm3, xmm4
 35300  	pxor	xmm3, xmm5
 35301  	pmovsxwq	xmm3, xmm3
 35302  	movapd	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 35303  	blendvpd	xmm2, xmm4, xmm0        # xmm0 is the implicit mask: take 1 where the input was > 0
 35304  	movdqa	xmm0, xmm1                      # load the second mask into the implicit blend register
 35305  	blendvpd	xmm3, xmm4, xmm0
 35306  	movupd	xmmword ptr [r8 + 8*rsi], xmm2
 35307  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm3
 35308  .LBB4_1018:
 35309  	cmp	rdx, r10                        # rdx = next index for the scalar loop, r10 = end index
 35310  	je	.LBB4_1655
 35311  .LBB4_1019:
 35312  	mov	esi, 1                          # rsi = 1, the value selected for positive inputs
 35313  .LBB4_1020:                             # =>This Inner Loop Header: Depth=1
 35314  	movzx	edi, word ptr [rcx + 2*rdx]
 35315  	xor	eax, eax
 35316  	test	di, di
 35317  	setne	al
 35318  	neg	rax                             # rax = 0 for zero input, -1 for nonzero input
 35319  	test	di, di                          # recompute flags; neg overwrote them
 35320  	cmovg	rax, rsi                        # signed > 0 -> 1
 35321  	mov	qword ptr [r8 + 8*rdx], rax
 35322  	add	rdx, 1
 35323  	cmp	r10, rdx
 35324  	jne	.LBB4_1020
 35325  	jmp	.LBB4_1655
 35326  .LBB4_993:                              # single element: turn the sign of the float in xmm0 (loaded outside this excerpt) into +/-1 and store it as a 64-bit value
 35327  	movmskps	ecx, xmm0               # collect the lane sign bits
 35328  	and	ecx, 1                          # keep the sign bit of lane 0
 35329  	neg	ecx                             # 0 -> 0, 1 -> -1
 35330  	or	ecx, 1                          # ecx = +1 (sign clear) or -1 (sign set)
 35331  	xorps	xmm0, xmm0                      # clear xmm0 before the scalar convert writes its low lane
 35332  	cvtsi2ss	xmm0, ecx               # xmm0 = +1.0f or -1.0f
 35333  	movss	xmm1, dword ptr [rip + .LCPI4_9] # xmm1 = mem[0],zero,zero,zero
 35334  	movaps	xmm2, xmm0
 35335  	subss	xmm2, xmm1                      # NOTE(review): .LCPI4_9 looks like the 2^63 threshold of an unsigned conversion - confirm in the constant pool
 35336  	cvttss2si	rcx, xmm2
 35337  	movabs	rdx, -9223372036854775808
 35338  	xor	rdx, rcx                        # flip bit 63 of the biased conversion
 35339  	cvttss2si	rcx, xmm0
 35340  	ucomiss	xmm0, xmm1
 35341  	cmovae	rcx, rdx                        # use the biased conversion when value >= threshold
 35342  	mov	qword ptr [r8 + 8*rax], rcx     # rax is the destination element index here
 35343  	jmp	.LBB4_1655
 35344  .LBB4_1021:                             # tail: signed 32-bit source -> 64-bit dest, dest[i] = 1 / 0 / -1 for positive / zero / negative
 35345  	xor	esi, esi                        # rsi = element index used by the vector step below
 35346  .LBB4_1022:
 35347  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 35348  	je	.LBB4_1024
 35349  # %bb.1023:
 35350  	movq	xmm2, qword ptr [rcx + 4*rsi]   # xmm2 = mem[0],zero
 35351  	movq	xmm3, qword ptr [rcx + 4*rsi + 8] # xmm3 = mem[0],zero
 35352  	xorpd	xmm4, xmm4
 35353  	movdqa	xmm0, xmm2
 35354  	pcmpgtd	xmm0, xmm4                      # signed compare: all-ones where the dword is > 0
 35355  	pmovsxdq	xmm0, xmm0              # widen the "> 0" mask to 64-bit lanes
 35356  	movdqa	xmm1, xmm3
 35357  	pcmpgtd	xmm1, xmm4
 35358  	pmovsxdq	xmm1, xmm1
 35359  	pcmpeqd	xmm2, xmm4
 35360  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones, used to invert the masks
 35361  	pxor	xmm2, xmm5                      # all-ones (-1) where nonzero, 0 where zero
 35362  	pmovsxdq	xmm2, xmm2
 35363  	pcmpeqd	xmm3, xmm4
 35364  	pxor	xmm3, xmm5
 35365  	pmovsxdq	xmm3, xmm3
 35366  	movapd	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 35367  	blendvpd	xmm2, xmm4, xmm0        # xmm0 is the implicit mask: take 1 where the input was > 0
 35368  	movdqa	xmm0, xmm1                      # load the second mask into the implicit blend register
 35369  	blendvpd	xmm3, xmm4, xmm0
 35370  	movupd	xmmword ptr [r8 + 8*rsi], xmm2
 35371  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm3
 35372  .LBB4_1024:
 35373  	cmp	rdx, r10                        # rdx = next index for the scalar loop, r10 = end index
 35374  	je	.LBB4_1655
 35375  .LBB4_1025:
 35376  	mov	esi, 1                          # rsi = 1, the value selected for positive inputs
 35377  .LBB4_1026:                             # =>This Inner Loop Header: Depth=1
 35378  	mov	edi, dword ptr [rcx + 4*rdx]
 35379  	xor	eax, eax
 35380  	test	edi, edi
 35381  	setne	al
 35382  	neg	rax                             # rax = 0 for zero input, -1 for nonzero input
 35383  	test	edi, edi                        # recompute flags; neg overwrote them
 35384  	cmovg	rax, rsi                        # signed > 0 -> 1
 35385  	mov	qword ptr [r8 + 8*rdx], rax
 35386  	add	rdx, 1
 35387  	cmp	r10, rdx
 35388  	jne	.LBB4_1026
 35389  	jmp	.LBB4_1655
 35390  .LBB4_1027:                             # tail: 64-bit source -> 16-bit dest, dest[i] = (src[i] != 0)
 35391  	xor	esi, esi                        # rsi = element index used by the vector step below
 35392  .LBB4_1028:
 35393  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 35394  	je	.LBB4_1030
 35395  # %bb.1029:
 35396  	movdqu	xmm0, xmmword ptr [rcx + 8*rsi]
 35397  	movdqu	xmm1, xmmword ptr [rcx + 8*rsi + 16]
 35398  	pxor	xmm2, xmm2
 35399  	pcmpeqq	xmm0, xmm2                      # all-ones where the qword equals zero
 35400  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones, used to invert the masks
 35401  	pxor	xmm0, xmm3                      # now all-ones where the qword is nonzero
 35402  	packssdw	xmm0, xmm0              # two signed-saturating packs narrow each 64-bit mask
 35403  	packssdw	xmm0, xmm0              # down to a 16-bit 0 / -1 mask
 35404  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_17] # xmm4 = <1,1,u,u,u,u,u,u>
 35405  	pand	xmm0, xmm4                      # reduce each mask to 0 or 1
 35406  	pcmpeqq	xmm1, xmm2
 35407  	pxor	xmm1, xmm3
 35408  	packssdw	xmm1, xmm1
 35409  	packssdw	xmm1, xmm1
 35410  	pand	xmm1, xmm4
 35411  	movd	dword ptr [r8 + 2*rsi], xmm0    # two 16-bit results
 35412  	movd	dword ptr [r8 + 2*rsi + 4], xmm1 # next two 16-bit results
 35413  .LBB4_1030:
 35414  	cmp	rdx, rax                        # rdx = next index for the scalar loop, rax = end index
 35415  	je	.LBB4_1655
 35416  .LBB4_1031:                             # =>This Inner Loop Header: Depth=1
 35417  	xor	esi, esi
 35418  	cmp	qword ptr [rcx + 8*rdx], 0
 35419  	setne	sil                             # si = (src[rdx] != 0)
 35420  	mov	word ptr [r8 + 2*rdx], si
 35421  	add	rdx, 1
 35422  	cmp	rax, rdx
 35423  	jne	.LBB4_1031
 35424  	jmp	.LBB4_1655
 35425  .LBB4_1032:                             # tail: 64-bit source -> 16-bit dest, dest[i] = (src[i] != 0)
 35426  	xor	esi, esi                        # rsi = element index used by the vector step below
 35427  .LBB4_1033:
 35428  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 35429  	je	.LBB4_1035
 35430  # %bb.1034:
 35431  	movdqu	xmm0, xmmword ptr [rcx + 8*rsi]
 35432  	movdqu	xmm1, xmmword ptr [rcx + 8*rsi + 16]
 35433  	pxor	xmm2, xmm2
 35434  	pcmpeqq	xmm0, xmm2                      # all-ones where the qword equals zero
 35435  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones, used to invert the masks
 35436  	pxor	xmm0, xmm3                      # now all-ones where the qword is nonzero
 35437  	packssdw	xmm0, xmm0              # two signed-saturating packs narrow each 64-bit mask
 35438  	packssdw	xmm0, xmm0              # down to a 16-bit 0 / -1 mask
 35439  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_17] # xmm4 = <1,1,u,u,u,u,u,u>
 35440  	pand	xmm0, xmm4                      # reduce each mask to 0 or 1
 35441  	pcmpeqq	xmm1, xmm2
 35442  	pxor	xmm1, xmm3
 35443  	packssdw	xmm1, xmm1
 35444  	packssdw	xmm1, xmm1
 35445  	pand	xmm1, xmm4
 35446  	movd	dword ptr [r8 + 2*rsi], xmm0    # two 16-bit results
 35447  	movd	dword ptr [r8 + 2*rsi + 4], xmm1 # next two 16-bit results
 35448  .LBB4_1035:
 35449  	cmp	rdx, rax                        # rdx = next index for the scalar loop, rax = end index
 35450  	je	.LBB4_1655
 35451  .LBB4_1036:                             # =>This Inner Loop Header: Depth=1
 35452  	xor	esi, esi
 35453  	cmp	qword ptr [rcx + 8*rdx], 0
 35454  	setne	sil                             # si = (src[rdx] != 0)
 35455  	mov	word ptr [r8 + 2*rdx], si
 35456  	add	rdx, 1
 35457  	cmp	rax, rdx
 35458  	jne	.LBB4_1036
 35459  	jmp	.LBB4_1655
 35460  .LBB4_1037:                             # tail: signed 64-bit source -> 16-bit dest, dest[i] = 1 / 0 / -1 for positive / zero / negative
 35461  	xor	esi, esi                        # rsi = element index used by the vector step below
 35462  .LBB4_1038:
 35463  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 35464  	je	.LBB4_1040
 35465  # %bb.1039:
 35466  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi]
 35467  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 16]
 35468  	pxor	xmm4, xmm4
 35469  	movdqa	xmm0, xmm2
 35470  	pcmpgtq	xmm0, xmm4                      # signed compare: all-ones where the qword is > 0
 35471  	packssdw	xmm0, xmm0              # narrow the "> 0" mask from 64-bit to 16-bit lanes
 35472  	packssdw	xmm0, xmm0
 35473  	movdqa	xmm1, xmm3
 35474  	pcmpgtq	xmm1, xmm4
 35475  	packssdw	xmm1, xmm1
 35476  	packssdw	xmm1, xmm1
 35477  	pcmpeqq	xmm2, xmm4
 35478  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones, used to invert the masks
 35479  	pxor	xmm2, xmm5                      # all-ones (-1) where nonzero, 0 where zero
 35480  	packssdw	xmm2, xmm2
 35481  	packssdw	xmm2, xmm2
 35482  	pcmpeqq	xmm3, xmm4
 35483  	pxor	xmm3, xmm5
 35484  	packssdw	xmm3, xmm3
 35485  	packssdw	xmm3, xmm3
 35486  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_17] # xmm4 = <1,1,u,u,u,u,u,u>
 35487  	pblendvb	xmm2, xmm4, xmm0        # xmm0 is the implicit mask: take 1 where the input was > 0
 35488  	movdqa	xmm0, xmm1                      # load the second mask into the implicit blend register
 35489  	pblendvb	xmm3, xmm4, xmm0
 35490  	movd	dword ptr [r8 + 2*rsi], xmm2    # two 16-bit results
 35491  	movd	dword ptr [r8 + 2*rsi + 4], xmm3 # next two 16-bit results
 35492  .LBB4_1040:
 35493  	cmp	rdx, r10                        # rdx = next index for the scalar loop, r10 = end index
 35494  	je	.LBB4_1655
 35495  .LBB4_1041:
 35496  	mov	esi, 1                          # esi = 1, the value selected for positive inputs
 35497  .LBB4_1042:                             # =>This Inner Loop Header: Depth=1
 35498  	mov	rdi, qword ptr [rcx + 8*rdx]
 35499  	xor	eax, eax
 35500  	test	rdi, rdi
 35501  	setne	al
 35502  	neg	eax                             # eax = 0 for zero input, -1 for nonzero input
 35503  	test	rdi, rdi                        # recompute flags; neg overwrote them
 35504  	cmovg	eax, esi                        # signed > 0 -> 1
 35505  	mov	word ptr [r8 + 2*rdx], ax
 35506  	add	rdx, 1
 35507  	cmp	r10, rdx
 35508  	jne	.LBB4_1042
 35509  	jmp	.LBB4_1655
 35510  .LBB4_1043:                             # tail: signed 32-bit source -> 16-bit dest, dest[i] = 1 / 0 / -1 for positive / zero / negative
 35511  	xor	esi, esi                        # rsi = element index used by the vector step below
 35512  .LBB4_1044:
 35513  	test	r9b, 1                          # bit 0 of r9 gates one 8-element vector step (r9 is set outside this excerpt)
 35514  	je	.LBB4_1046
 35515  # %bb.1045:
 35516  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi]
 35517  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 16]
 35518  	pxor	xmm4, xmm4
 35519  	movdqa	xmm0, xmm2
 35520  	pcmpgtd	xmm0, xmm4                      # signed compare: all-ones where the dword is > 0
 35521  	packssdw	xmm0, xmm0              # narrow the "> 0" mask to 16-bit lanes
 35522  	movdqa	xmm1, xmm3
 35523  	pcmpgtd	xmm1, xmm4
 35524  	packssdw	xmm1, xmm1
 35525  	pcmpeqd	xmm2, xmm4
 35526  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones, used to invert the masks
 35527  	pxor	xmm2, xmm5                      # all-ones (-1) where nonzero, 0 where zero
 35528  	packssdw	xmm2, xmm2
 35529  	pcmpeqd	xmm3, xmm4
 35530  	pxor	xmm3, xmm5
 35531  	packssdw	xmm3, xmm3
 35532  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_11] # xmm4 = <1,1,1,1,u,u,u,u>
 35533  	pblendvb	xmm2, xmm4, xmm0        # xmm0 is the implicit mask: take 1 where the input was > 0
 35534  	movdqa	xmm0, xmm1                      # load the second mask into the implicit blend register
 35535  	pblendvb	xmm3, xmm4, xmm0
 35536  	punpcklqdq	xmm2, xmm3              # xmm2 = xmm2[0],xmm3[0]
 35537  	movdqu	xmmword ptr [r8 + 2*rsi], xmm2  # eight 16-bit results
 35538  .LBB4_1046:
 35539  	cmp	rdx, r10                        # rdx = next index for the scalar loop, r10 = end index
 35540  	je	.LBB4_1655
 35541  .LBB4_1047:
 35542  	mov	esi, 1                          # esi = 1, the value selected for positive inputs
 35543  .LBB4_1048:                             # =>This Inner Loop Header: Depth=1
 35544  	mov	edi, dword ptr [rcx + 4*rdx]
 35545  	xor	eax, eax
 35546  	test	edi, edi
 35547  	setne	al
 35548  	neg	eax                             # eax = 0 for zero input, -1 for nonzero input
 35549  	test	edi, edi                        # recompute flags; neg overwrote them
 35550  	cmovg	eax, esi                        # signed > 0 -> 1
 35551  	mov	word ptr [r8 + 2*rdx], ax
 35552  	add	rdx, 1
 35553  	cmp	r10, rdx
 35554  	jne	.LBB4_1048
 35555  	jmp	.LBB4_1655
 35556  .LBB4_1049:                             # tail: signed 32-bit source -> 16-bit dest, dest[i] = 1 / 0 / -1 for positive / zero / negative
 35557  	xor	esi, esi                        # rsi = element index used by the vector step below
 35558  .LBB4_1050:
 35559  	test	r9b, 1                          # bit 0 of r9 gates one 8-element vector step (r9 is set outside this excerpt)
 35560  	je	.LBB4_1052
 35561  # %bb.1051:
 35562  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi]
 35563  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 16]
 35564  	pxor	xmm4, xmm4
 35565  	movdqa	xmm0, xmm2
 35566  	pcmpgtd	xmm0, xmm4                      # signed compare: all-ones where the dword is > 0
 35567  	packssdw	xmm0, xmm0              # narrow the "> 0" mask to 16-bit lanes
 35568  	movdqa	xmm1, xmm3
 35569  	pcmpgtd	xmm1, xmm4
 35570  	packssdw	xmm1, xmm1
 35571  	pcmpeqd	xmm2, xmm4
 35572  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones, used to invert the masks
 35573  	pxor	xmm2, xmm5                      # all-ones (-1) where nonzero, 0 where zero
 35574  	packssdw	xmm2, xmm2
 35575  	pcmpeqd	xmm3, xmm4
 35576  	pxor	xmm3, xmm5
 35577  	packssdw	xmm3, xmm3
 35578  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_11] # xmm4 = <1,1,1,1,u,u,u,u>
 35579  	pblendvb	xmm2, xmm4, xmm0        # xmm0 is the implicit mask: take 1 where the input was > 0
 35580  	movdqa	xmm0, xmm1                      # load the second mask into the implicit blend register
 35581  	pblendvb	xmm3, xmm4, xmm0
 35582  	punpcklqdq	xmm2, xmm3              # xmm2 = xmm2[0],xmm3[0]
 35583  	movdqu	xmmword ptr [r8 + 2*rsi], xmm2  # eight 16-bit results
 35584  .LBB4_1052:
 35585  	cmp	rdx, r10                        # rdx = next index for the scalar loop, r10 = end index
 35586  	je	.LBB4_1655
 35587  .LBB4_1053:
 35588  	mov	esi, 1                          # esi = 1, the value selected for positive inputs
 35589  .LBB4_1054:                             # =>This Inner Loop Header: Depth=1
 35590  	mov	edi, dword ptr [rcx + 4*rdx]
 35591  	xor	eax, eax
 35592  	test	edi, edi
 35593  	setne	al
 35594  	neg	eax                             # eax = 0 for zero input, -1 for nonzero input
 35595  	test	edi, edi                        # recompute flags; neg overwrote them
 35596  	cmovg	eax, esi                        # signed > 0 -> 1
 35597  	mov	word ptr [r8 + 2*rdx], ax
 35598  	add	rdx, 1
 35599  	cmp	r10, rdx
 35600  	jne	.LBB4_1054
 35601  	jmp	.LBB4_1655
 35602  .LBB4_1055:                             # tail: 16-bit source -> 64-bit dest, dest[i] = (src[i] != 0)
 35603  	xor	esi, esi                        # rsi = element index used by the vector step below
 35604  .LBB4_1056:
 35605  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 35606  	je	.LBB4_1058
 35607  # %bb.1057:
 35608  	movd	xmm0, dword ptr [rcx + 2*rsi]   # xmm0 = mem[0],zero,zero,zero
 35609  	movd	xmm1, dword ptr [rcx + 2*rsi + 4] # xmm1 = mem[0],zero,zero,zero
 35610  	pxor	xmm2, xmm2
 35611  	pcmpeqw	xmm0, xmm2                      # all-ones where the word equals zero
 35612  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones, used to invert the masks
 35613  	pxor	xmm0, xmm3                      # now all-ones where the word is nonzero
 35614  	pmovzxwq	xmm0, xmm0                      # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 35615  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 35616  	pand	xmm0, xmm4                      # reduce each mask to 0 or 1
 35617  	pcmpeqw	xmm1, xmm2
 35618  	pxor	xmm1, xmm3
 35619  	pmovzxwq	xmm1, xmm1                      # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 35620  	pand	xmm1, xmm4
 35621  	movdqu	xmmword ptr [r8 + 8*rsi], xmm0
 35622  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm1
 35623  .LBB4_1058:
 35624  	cmp	rdx, rax                        # rdx = next index for the scalar loop, rax = end index
 35625  	je	.LBB4_1655
 35626  .LBB4_1059:                             # =>This Inner Loop Header: Depth=1
 35627  	xor	esi, esi
 35628  	cmp	word ptr [rcx + 2*rdx], 0
 35629  	setne	sil                             # rsi = (src[rdx] != 0)
 35630  	mov	qword ptr [r8 + 8*rdx], rsi
 35631  	add	rdx, 1
 35632  	cmp	rax, rdx
 35633  	jne	.LBB4_1059
 35634  	jmp	.LBB4_1655
 35635  .LBB4_1060:                             # tail: signed 32-bit source -> 64-bit dest, dest[i] = 1 / 0 / -1 for positive / zero / negative
 35636  	xor	esi, esi                        # rsi = element index used by the vector step below
 35637  .LBB4_1061:
 35638  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 35639  	je	.LBB4_1063
 35640  # %bb.1062:
 35641  	movq	xmm2, qword ptr [rcx + 4*rsi]   # xmm2 = mem[0],zero
 35642  	movq	xmm3, qword ptr [rcx + 4*rsi + 8] # xmm3 = mem[0],zero
 35643  	xorpd	xmm4, xmm4
 35644  	movdqa	xmm0, xmm2
 35645  	pcmpgtd	xmm0, xmm4                      # signed compare: all-ones where the dword is > 0
 35646  	pmovsxdq	xmm0, xmm0              # widen the "> 0" mask to 64-bit lanes
 35647  	movdqa	xmm1, xmm3
 35648  	pcmpgtd	xmm1, xmm4
 35649  	pmovsxdq	xmm1, xmm1
 35650  	pcmpeqd	xmm2, xmm4
 35651  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones, used to invert the masks
 35652  	pxor	xmm2, xmm5                      # all-ones (-1) where nonzero, 0 where zero
 35653  	pmovsxdq	xmm2, xmm2
 35654  	pcmpeqd	xmm3, xmm4
 35655  	pxor	xmm3, xmm5
 35656  	pmovsxdq	xmm3, xmm3
 35657  	movapd	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 35658  	blendvpd	xmm2, xmm4, xmm0        # xmm0 is the implicit mask: take 1 where the input was > 0
 35659  	movdqa	xmm0, xmm1                      # load the second mask into the implicit blend register
 35660  	blendvpd	xmm3, xmm4, xmm0
 35661  	movupd	xmmword ptr [r8 + 8*rsi], xmm2
 35662  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm3
 35663  .LBB4_1063:
 35664  	cmp	rdx, r10                        # rdx = next index for the scalar loop, r10 = end index
 35665  	je	.LBB4_1655
 35666  .LBB4_1064:
 35667  	mov	esi, 1                          # rsi = 1, the value selected for positive inputs
 35668  .LBB4_1065:                             # =>This Inner Loop Header: Depth=1
 35669  	mov	edi, dword ptr [rcx + 4*rdx]
 35670  	xor	eax, eax
 35671  	test	edi, edi
 35672  	setne	al
 35673  	neg	rax                             # rax = 0 for zero input, -1 for nonzero input
 35674  	test	edi, edi                        # recompute flags; neg overwrote them
 35675  	cmovg	rax, rsi                        # signed > 0 -> 1
 35676  	mov	qword ptr [r8 + 8*rdx], rax
 35677  	add	rdx, 1
 35678  	cmp	r10, rdx
 35679  	jne	.LBB4_1065
 35680  	jmp	.LBB4_1655
 35681  .LBB4_1066:                             # tail: signed 32-bit source -> 32-bit float dest holding the sign of each input
 35682  	xor	esi, esi                        # rsi = element index used by the vector step below
 35683  .LBB4_1067:
 35684  	test	r9b, 1                          # bit 0 of r9 gates one 8-element vector step (r9 is set outside this excerpt)
 35685  	je	.LBB4_1069
 35686  # %bb.1068:
 35687  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi]
 35688  	movdqu	xmm3, xmmword ptr [rcx + 4*rsi + 16]
 35689  	xorps	xmm4, xmm4
 35690  	movdqa	xmm0, xmm2
 35691  	pcmpgtd	xmm0, xmm4                      # signed compare: all-ones where the dword is > 0
 35692  	movdqa	xmm1, xmm3
 35693  	pcmpgtd	xmm1, xmm4
 35694  	pcmpeqd	xmm2, xmm4
 35695  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones, used to invert the masks
 35696  	pxor	xmm2, xmm5                      # integer -1 where nonzero, 0 where zero
 35697  	cvtdq2ps	xmm2, xmm2              # convert those integers to float: -1.0 / 0.0
 35698  	pcmpeqd	xmm3, xmm4
 35699  	pxor	xmm3, xmm5
 35700  	cvtdq2ps	xmm3, xmm3
 35701  	movaps	xmm4, xmmword ptr [rip + .LCPI4_19] # xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 35702  	blendvps	xmm2, xmm4, xmm0        # xmm0 is the implicit mask: take 1.0 where the input was > 0
 35703  	movdqa	xmm0, xmm1                      # load the second mask into the implicit blend register
 35704  	blendvps	xmm3, xmm4, xmm0
 35705  	movups	xmmword ptr [r8 + 4*rsi], xmm2
 35706  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm3
 35707  .LBB4_1069:
 35708  	cmp	rdx, rax                        # rdx = next index for the scalar loop, rax = end index
 35709  	je	.LBB4_1655
 35710  .LBB4_1070:
 35711  	movd	xmm0, dword ptr [rip + .LCPI4_14] # xmm0 = mem[0],zero,zero,zero
 35712  	movd	xmm1, dword ptr [rip + .LCPI4_5] # xmm1 = mem[0],zero,zero,zero
 35713  	jmp	.LBB4_1072                      # NOTE(review): presumably .LCPI4_14 = -1.0f and .LCPI4_5 = 1.0f - confirm in the constant pool
 35714  .LBB4_1071:                             #   in Loop: Header=BB4_1072 Depth=1
 35715  	movd	dword ptr [r8 + 4*rdx], xmm3    # store the selected value for element rdx
 35716  	add	rdx, 1
 35717  	cmp	rax, rdx
 35718  	je	.LBB4_1655
 35719  .LBB4_1072:                             # =>This Inner Loop Header: Depth=1
 35720  	cmp	dword ptr [rcx + 4*rdx], 0      # flags from this compare feed both jne and jg below
 35721  	movdqa	xmm2, xmm0                      # candidate for a nonzero input (SSE moves leave EFLAGS intact)
 35722  	jne	.LBB4_1074
 35723  # %bb.1073:                             #   in Loop: Header=BB4_1072 Depth=1
 35724  	pxor	xmm2, xmm2                      # zero input -> all-zero bits
 35725  .LBB4_1074:                             #   in Loop: Header=BB4_1072 Depth=1
 35726  	movdqa	xmm3, xmm1                      # candidate for a positive input
 35727  	jg	.LBB4_1071
 35728  # %bb.1075:                             #   in Loop: Header=BB4_1072 Depth=1
 35729  	movdqa	xmm3, xmm2                      # not positive: use the nonzero/zero candidate instead
 35730  	jmp	.LBB4_1071
 35731  .LBB4_1076:                             # tail: double source -> 32-bit dest; stores copysign(1.0, x) truncated to int32, or a zero-case value when x compares equal to 0
 35732  	xor	edi, edi                        # rdi = element index used by the vector step below
 35733  .LBB4_1077:
 35734  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 35735  	je	.LBB4_1079
 35736  # %bb.1078:
 35737  	movupd	xmm0, xmmword ptr [rcx + 8*rdi]
 35738  	movupd	xmm1, xmmword ptr [rcx + 8*rdi + 16]
 35739  	xorpd	xmm2, xmm2
 35740  	movapd	xmm3, xmm0
 35741  	cmpeqpd	xmm3, xmm2                      # all-ones in lanes where the input == 0.0
 35742  	shufps	xmm3, xmm3, 232                 # xmm3 = xmm3[0,2,2,3]
 35743  	cmpeqpd	xmm2, xmm1
 35744  	shufps	xmm2, xmm2, 232                 # xmm2 = xmm2[0,2,2,3]
 35745  	movapd	xmm4, xmmword ptr [rip + .LCPI4_0] # xmm4 = [-0.0E+0,-0.0E+0]
 35746  	andpd	xmm0, xmm4                      # keep only the sign bit of each input
 35747  	movapd	xmm5, xmmword ptr [rip + .LCPI4_1] # xmm5 = [1.0E+0,1.0E+0]
 35748  	orpd	xmm0, xmm5                      # +/-1.0 carrying the input's sign
 35749  	andpd	xmm1, xmm4
 35750  	orpd	xmm1, xmm5
 35751  	cvttpd2dq	xmm0, xmm0              # truncate to two int32 values in the low lanes
 35752  	cvttpd2dq	xmm1, xmm1
 35753  	andnps	xmm3, xmm0                      # clear the result where the input was zero
 35754  	andnps	xmm2, xmm1
 35755  	movlhps	xmm3, xmm2                      # xmm3 = xmm3[0],xmm2[0]
 35756  	movups	xmmword ptr [r8 + 4*rdi], xmm3  # four 32-bit results
 35757  .LBB4_1079:
 35758  	cmp	rsi, rax                        # rsi = next index for the scalar loop, rax = end index
 35759  	je	.LBB4_1655
 35760  .LBB4_1080:
 35761  	xorpd	xmm0, xmm0                      # xmm0 = 0.0 for the zero comparison
 35762  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 35763  	movsd	xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero
 35764  .LBB4_1081:                             # =>This Inner Loop Header: Depth=1
 35765  	movsd	xmm3, qword ptr [rcx + 8*rsi]   # xmm3 = mem[0],zero
 35766  	ucomisd	xmm0, xmm3                      # flags consumed by the cmove below; the SSE ops in between do not touch EFLAGS
 35767  	andpd	xmm3, xmm1                      # sign bit of the input
 35768  	orpd	xmm3, xmm2                      # presumably .LCPI4_2 is scalar 1.0, giving +/-1.0 - confirm
 35769  	cvttsd2si	edx, xmm3
 35770  	cmove	edx, r10d                       # r10d is set outside this excerpt; presumably 0 - confirm
 35771  	mov	dword ptr [r8 + 4*rsi], edx
 35772  	add	rsi, 1
 35773  	cmp	rax, rsi
 35774  	jne	.LBB4_1081
 35775  	jmp	.LBB4_1655
 35776  .LBB4_1082:                             # tail: 64-bit source -> 32-bit dest, dest[i] = (src[i] != 0)
 35777  	xor	esi, esi                        # rsi = element index used by the vector step below
 35778  .LBB4_1083:
 35779  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 35780  	je	.LBB4_1085
 35781  # %bb.1084:
 35782  	movdqu	xmm0, xmmword ptr [rcx + 8*rsi]
 35783  	movdqu	xmm1, xmmword ptr [rcx + 8*rsi + 16]
 35784  	pxor	xmm2, xmm2
 35785  	pcmpeqq	xmm0, xmm2                      # all-ones in every qword lane that equals zero
 35786  	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
 35787  	movdqa	xmm3, xmmword ptr [rip + .LCPI4_16] # xmm3 = <1,1,u,u>
 35788  	pandn	xmm0, xmm3                      # ~mask & 1: 1 for nonzero inputs, 0 for zero inputs
 35789  	pcmpeqq	xmm1, xmm2
 35790  	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
 35791  	pandn	xmm1, xmm3
 35792  	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
 35793  	movdqu	xmmword ptr [r8 + 4*rsi], xmm0  # four 32-bit results
 35794  .LBB4_1085:
 35795  	cmp	rdx, rax                        # rdx = next index for the scalar loop, rax = end index
 35796  	je	.LBB4_1655
 35797  .LBB4_1086:                             # =>This Inner Loop Header: Depth=1
 35798  	xor	esi, esi
 35799  	cmp	qword ptr [rcx + 8*rdx], 0
 35800  	setne	sil                             # esi = (src[rdx] != 0)
 35801  	mov	dword ptr [r8 + 4*rdx], esi
 35802  	add	rdx, 1
 35803  	cmp	rax, rdx
 35804  	jne	.LBB4_1086
 35805  	jmp	.LBB4_1655
 35806  .LBB4_1087:                             # tail: 16-bit source -> 32-bit dest, dest[i] = (src[i] != 0)
 35807  	xor	esi, esi                        # rsi = element index used by the vector step below
 35808  .LBB4_1088:
 35809  	test	r9b, 1                          # bit 0 of r9 gates one 8-element vector step (r9 is set outside this excerpt)
 35810  	je	.LBB4_1090
 35811  # %bb.1089:
 35812  	movq	xmm0, qword ptr [rcx + 2*rsi]   # xmm0 = mem[0],zero
 35813  	movq	xmm1, qword ptr [rcx + 2*rsi + 8] # xmm1 = mem[0],zero
 35814  	pxor	xmm2, xmm2
 35815  	pcmpeqw	xmm0, xmm2                      # all-ones where the word equals zero
 35816  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones, used to invert the masks
 35817  	pxor	xmm0, xmm3                      # now all-ones where the word is nonzero
 35818  	pmovzxwd	xmm0, xmm0                      # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 35819  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 35820  	pand	xmm0, xmm4                      # reduce each mask to 0 or 1
 35821  	pcmpeqw	xmm1, xmm2
 35822  	pxor	xmm1, xmm3
 35823  	pmovzxwd	xmm1, xmm1                      # xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 35824  	pand	xmm1, xmm4
 35825  	movdqu	xmmword ptr [r8 + 4*rsi], xmm0
 35826  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm1
 35827  .LBB4_1090:
 35828  	cmp	rdx, rax                        # rdx = next index for the scalar loop, rax = end index
 35829  	je	.LBB4_1655
 35830  .LBB4_1091:                             # =>This Inner Loop Header: Depth=1
 35831  	xor	esi, esi
 35832  	cmp	word ptr [rcx + 2*rdx], 0
 35833  	setne	sil                             # esi = (src[rdx] != 0)
 35834  	mov	dword ptr [r8 + 4*rdx], esi
 35835  	add	rdx, 1
 35836  	cmp	rax, rdx
 35837  	jne	.LBB4_1091
 35838  	jmp	.LBB4_1655
 35839  .LBB4_1092:                             # tail: signed 16-bit source -> 32-bit dest, dest[i] = 1 / 0 / -1 for positive / zero / negative
 35840  	xor	esi, esi                        # rsi = element index used by the vector step below
 35841  .LBB4_1093:
 35842  	test	r9b, 1                          # bit 0 of r9 gates one 8-element vector step (r9 is set outside this excerpt)
 35843  	je	.LBB4_1095
 35844  # %bb.1094:
 35845  	movq	xmm2, qword ptr [rcx + 2*rsi]   # xmm2 = mem[0],zero
 35846  	movq	xmm3, qword ptr [rcx + 2*rsi + 8] # xmm3 = mem[0],zero
 35847  	xorps	xmm4, xmm4
 35848  	movdqa	xmm0, xmm2
 35849  	pcmpgtw	xmm0, xmm4                      # signed compare: all-ones where the word is > 0
 35850  	pmovsxwd	xmm0, xmm0              # widen the "> 0" mask to 32-bit lanes
 35851  	movdqa	xmm1, xmm3
 35852  	pcmpgtw	xmm1, xmm4
 35853  	pmovsxwd	xmm1, xmm1
 35854  	pcmpeqw	xmm2, xmm4
 35855  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones, used to invert the masks
 35856  	pxor	xmm2, xmm5                      # all-ones (-1) where nonzero, 0 where zero
 35857  	pmovsxwd	xmm2, xmm2
 35858  	pcmpeqw	xmm3, xmm4
 35859  	pxor	xmm3, xmm5
 35860  	pmovsxwd	xmm3, xmm3
 35861  	movaps	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 35862  	blendvps	xmm2, xmm4, xmm0        # xmm0 is the implicit mask: take 1 where the input was > 0
 35863  	movdqa	xmm0, xmm1                      # load the second mask into the implicit blend register
 35864  	blendvps	xmm3, xmm4, xmm0
 35865  	movups	xmmword ptr [r8 + 4*rsi], xmm2
 35866  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm3
 35867  .LBB4_1095:
 35868  	cmp	rdx, r10                        # rdx = next index for the scalar loop, r10 = end index
 35869  	je	.LBB4_1655
 35870  .LBB4_1096:
 35871  	mov	esi, 1                          # esi = 1, the value selected for positive inputs
 35872  .LBB4_1097:                             # =>This Inner Loop Header: Depth=1
 35873  	movzx	edi, word ptr [rcx + 2*rdx]
 35874  	xor	eax, eax
 35875  	test	di, di
 35876  	setne	al
 35877  	neg	eax                             # eax = 0 for zero input, -1 for nonzero input
 35878  	test	di, di                          # recompute flags; neg overwrote them
 35879  	cmovg	eax, esi                        # signed > 0 -> 1
 35880  	mov	dword ptr [r8 + 4*rdx], eax
 35881  	add	rdx, 1
 35882  	cmp	r10, rdx
 35883  	jne	.LBB4_1097
 35884  	jmp	.LBB4_1655
 35885  .LBB4_1098:                             # tail: signed 64-bit source -> 32-bit dest, dest[i] = 1 / 0 / -1 for positive / zero / negative
 35886  	xor	esi, esi                        # rsi = element index used by the vector step below
 35887  .LBB4_1099:
 35888  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 35889  	je	.LBB4_1101
 35890  # %bb.1100:
 35891  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi]
 35892  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 16]
 35893  	xorps	xmm4, xmm4
 35894  	movdqa	xmm0, xmm2
 35895  	pcmpgtq	xmm0, xmm4                      # signed compare: all-ones where the qword is > 0
 35896  	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
 35897  	movdqa	xmm1, xmm3
 35898  	pcmpgtq	xmm1, xmm4
 35899  	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
 35900  	pcmpeqq	xmm2, xmm4
 35901  	pshufd	xmm2, xmm2, 232                 # xmm2 = xmm2[0,2,2,3]
 35902  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones, used to invert the masks
 35903  	pxor	xmm2, xmm5                      # all-ones (-1) where nonzero, 0 where zero
 35904  	pcmpeqq	xmm3, xmm4
 35905  	pshufd	xmm3, xmm3, 232                 # xmm3 = xmm3[0,2,2,3]
 35906  	pxor	xmm3, xmm5
 35907  	movaps	xmm4, xmmword ptr [rip + .LCPI4_16] # xmm4 = <1,1,u,u>
 35908  	blendvps	xmm2, xmm4, xmm0        # xmm0 is the implicit mask: take 1 where the input was > 0
 35909  	movdqa	xmm0, xmm1                      # load the second mask into the implicit blend register
 35910  	blendvps	xmm3, xmm4, xmm0
 35911  	movlhps	xmm2, xmm3                      # xmm2 = xmm2[0],xmm3[0]
 35912  	movups	xmmword ptr [r8 + 4*rsi], xmm2  # four 32-bit results
 35913  .LBB4_1101:
 35914  	cmp	rdx, r10                        # rdx = next index for the scalar loop, r10 = end index
 35915  	je	.LBB4_1655
 35916  .LBB4_1102:
 35917  	mov	esi, 1                          # esi = 1, the value selected for positive inputs
 35918  .LBB4_1103:                             # =>This Inner Loop Header: Depth=1
 35919  	mov	rdi, qword ptr [rcx + 8*rdx]
 35920  	xor	eax, eax
 35921  	test	rdi, rdi
 35922  	setne	al
 35923  	neg	eax                             # eax = 0 for zero input, -1 for nonzero input
 35924  	test	rdi, rdi                        # recompute flags; neg overwrote them
 35925  	cmovg	eax, esi                        # signed > 0 -> 1
 35926  	mov	dword ptr [r8 + 4*rdx], eax
 35927  	add	rdx, 1
 35928  	cmp	r10, rdx
 35929  	jne	.LBB4_1103
 35930  	jmp	.LBB4_1655
 35931  .LBB4_1106:                             # tail: double source -> 32-bit dest; converts copysign(1.0, x) through a 64-bit truncation and keeps the low 32 bits, with a zero-case value when x compares equal to 0
 35932  	xor	edi, edi                        # rdi = element index used by the vector step below
 35933  .LBB4_1107:
 35934  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 35935  	je	.LBB4_1109
 35936  # %bb.1108:
 35937  	movupd	xmm3, xmmword ptr [rcx + 8*rdi]
 35938  	movupd	xmm2, xmmword ptr [rcx + 8*rdi + 16]
 35939  	xorpd	xmm1, xmm1
 35940  	movapd	xmm0, xmm3
 35941  	cmpeqpd	xmm0, xmm1                      # all-ones in lanes where the input == 0.0
 35942  	shufps	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
 35943  	cmpeqpd	xmm1, xmm2
 35944  	movapd	xmm4, xmmword ptr [rip + .LCPI4_0] # xmm4 = [-0.0E+0,-0.0E+0]
 35945  	andpd	xmm3, xmm4                      # keep only the sign bit of each input
 35946  	movapd	xmm5, xmmword ptr [rip + .LCPI4_1] # xmm5 = [1.0E+0,1.0E+0]
 35947  	orpd	xmm3, xmm5                      # +/-1.0 carrying the input's sign
 35948  	andpd	xmm2, xmm4
 35949  	orpd	xmm2, xmm5
 35950  	pshufd	xmm4, xmm3, 238                 # xmm4 = xmm3[2,3,2,3]
 35951  	cvttsd2si	rax, xmm4               # element 1, converted as a 64-bit integer
 35952  	cvttsd2si	rdx, xmm3               # element 0
 35953  	movd	xmm3, edx                       # only the low 32 bits of each conversion are kept
 35954  	pinsrd	xmm3, eax, 1
 35955  	pshufd	xmm4, xmm2, 238                 # xmm4 = xmm2[2,3,2,3]
 35956  	cvttsd2si	rax, xmm4               # element 3
 35957  	cvttsd2si	rdx, xmm2               # element 2
 35958  	shufps	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
 35959  	movd	xmm2, edx
 35960  	pinsrd	xmm2, eax, 1
 35961  	andnps	xmm0, xmm3                      # clear the result where the input was zero
 35962  	andnps	xmm1, xmm2
 35963  	movlhps	xmm0, xmm1                      # xmm0 = xmm0[0],xmm1[0]
 35964  	movups	xmmword ptr [r8 + 4*rdi], xmm0  # four 32-bit results
 35965  .LBB4_1109:
 35966  	cmp	rsi, r11                        # rsi = next index for the scalar loop, r11 = end index
 35967  	je	.LBB4_1655
 35968  .LBB4_1110:
 35969  	xorpd	xmm0, xmm0                      # xmm0 = 0.0 for the zero comparison
 35970  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 35971  	movsd	xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero
 35972  .LBB4_1111:                             # =>This Inner Loop Header: Depth=1
 35973  	movsd	xmm3, qword ptr [rcx + 8*rsi]   # xmm3 = mem[0],zero
 35974  	ucomisd	xmm0, xmm3                      # flags consumed by the cmove below; the SSE ops in between do not touch EFLAGS
 35975  	andpd	xmm3, xmm1                      # sign bit of the input
 35976  	orpd	xmm3, xmm2                      # presumably .LCPI4_2 is scalar 1.0, giving +/-1.0 - confirm
 35977  	cvttsd2si	rax, xmm3
 35978  	cmove	eax, r10d                       # r10d is set outside this excerpt; presumably 0 - confirm
 35979  	mov	dword ptr [r8 + 4*rsi], eax
 35980  	add	rsi, 1
 35981  	cmp	r11, rsi
 35982  	jne	.LBB4_1111
 35983  	jmp	.LBB4_1655
 35984  .LBB4_1112:                             # tail: 16-bit source -> 32-bit dest, dest[i] = (src[i] != 0)
 35985  	xor	esi, esi                        # rsi = element index used by the vector step below
 35986  .LBB4_1113:
 35987  	test	r9b, 1                          # bit 0 of r9 gates one 8-element vector step (r9 is set outside this excerpt)
 35988  	je	.LBB4_1115
 35989  # %bb.1114:
 35990  	movq	xmm0, qword ptr [rcx + 2*rsi]   # xmm0 = mem[0],zero
 35991  	movq	xmm1, qword ptr [rcx + 2*rsi + 8] # xmm1 = mem[0],zero
 35992  	pxor	xmm2, xmm2
 35993  	pcmpeqw	xmm0, xmm2                      # all-ones where the word equals zero
 35994  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones, used to invert the masks
 35995  	pxor	xmm0, xmm3                      # now all-ones where the word is nonzero
 35996  	pmovzxwd	xmm0, xmm0                      # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 35997  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 35998  	pand	xmm0, xmm4                      # reduce each mask to 0 or 1
 35999  	pcmpeqw	xmm1, xmm2
 36000  	pxor	xmm1, xmm3
 36001  	pmovzxwd	xmm1, xmm1                      # xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 36002  	pand	xmm1, xmm4
 36003  	movdqu	xmmword ptr [r8 + 4*rsi], xmm0
 36004  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm1
 36005  .LBB4_1115:
 36006  	cmp	rdx, rax                        # rdx = next index for the scalar loop, rax = end index
 36007  	je	.LBB4_1655
 36008  .LBB4_1116:                             # =>This Inner Loop Header: Depth=1
 36009  	xor	esi, esi
 36010  	cmp	word ptr [rcx + 2*rdx], 0
 36011  	setne	sil                             # esi = (src[rdx] != 0)
 36012  	mov	dword ptr [r8 + 4*rdx], esi
 36013  	add	rdx, 1
 36014  	cmp	rax, rdx
 36015  	jne	.LBB4_1116
 36016  	jmp	.LBB4_1655
 36017  .LBB4_1117:                             # tail: signed 16-bit source -> 32-bit dest, dest[i] = 1 / 0 / -1 for positive / zero / negative
 36018  	xor	esi, esi                        # rsi = element index used by the vector step below
 36019  .LBB4_1118:
 36020  	test	r9b, 1                          # bit 0 of r9 gates one 8-element vector step (r9 is set outside this excerpt)
 36021  	je	.LBB4_1120
 36022  # %bb.1119:
 36023  	movq	xmm2, qword ptr [rcx + 2*rsi]   # xmm2 = mem[0],zero
 36024  	movq	xmm3, qword ptr [rcx + 2*rsi + 8] # xmm3 = mem[0],zero
 36025  	xorps	xmm4, xmm4
 36026  	movdqa	xmm0, xmm2
 36027  	pcmpgtw	xmm0, xmm4                      # signed compare: all-ones where the word is > 0
 36028  	pmovsxwd	xmm0, xmm0              # widen the "> 0" mask to 32-bit lanes
 36029  	movdqa	xmm1, xmm3
 36030  	pcmpgtw	xmm1, xmm4
 36031  	pmovsxwd	xmm1, xmm1
 36032  	pcmpeqw	xmm2, xmm4
 36033  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones, used to invert the masks
 36034  	pxor	xmm2, xmm5                      # all-ones (-1) where nonzero, 0 where zero
 36035  	pmovsxwd	xmm2, xmm2
 36036  	pcmpeqw	xmm3, xmm4
 36037  	pxor	xmm3, xmm5
 36038  	pmovsxwd	xmm3, xmm3
 36039  	movaps	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 36040  	blendvps	xmm2, xmm4, xmm0        # xmm0 is the implicit mask: take 1 where the input was > 0
 36041  	movdqa	xmm0, xmm1                      # load the second mask into the implicit blend register
 36042  	blendvps	xmm3, xmm4, xmm0
 36043  	movups	xmmword ptr [r8 + 4*rsi], xmm2
 36044  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm3
 36045  .LBB4_1120:
 36046  	cmp	rdx, r10                        # rdx = next index for the scalar loop, r10 = end index
 36047  	je	.LBB4_1655
 36048  .LBB4_1121:
 36049  	mov	esi, 1                          # esi = 1, the value selected for positive inputs
 36050  .LBB4_1122:                             # =>This Inner Loop Header: Depth=1
 36051  	movzx	edi, word ptr [rcx + 2*rdx]
 36052  	xor	eax, eax
 36053  	test	di, di
 36054  	setne	al
 36055  	neg	eax                             # eax = 0 for zero input, -1 for nonzero input
 36056  	test	di, di                          # recompute flags; neg overwrote them
 36057  	cmovg	eax, esi                        # signed > 0 -> 1
 36058  	mov	dword ptr [r8 + 4*rdx], eax
 36059  	add	rdx, 1
 36060  	cmp	r10, rdx
 36061  	jne	.LBB4_1122
 36062  	jmp	.LBB4_1655
 36063  .LBB4_1123:                             # tail: signed 64-bit source -> 32-bit dest, dest[i] = 1 / 0 / -1 for positive / zero / negative
 36064  	xor	esi, esi                        # rsi = element index used by the vector step below
 36065  .LBB4_1124:
 36066  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 36067  	je	.LBB4_1126
 36068  # %bb.1125:
 36069  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi]
 36070  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 16]
 36071  	xorps	xmm4, xmm4
 36072  	movdqa	xmm0, xmm2
 36073  	pcmpgtq	xmm0, xmm4                      # signed compare: all-ones where the qword is > 0
 36074  	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
 36075  	movdqa	xmm1, xmm3
 36076  	pcmpgtq	xmm1, xmm4
 36077  	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
 36078  	pcmpeqq	xmm2, xmm4
 36079  	pshufd	xmm2, xmm2, 232                 # xmm2 = xmm2[0,2,2,3]
 36080  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones, used to invert the masks
 36081  	pxor	xmm2, xmm5                      # all-ones (-1) where nonzero, 0 where zero
 36082  	pcmpeqq	xmm3, xmm4
 36083  	pshufd	xmm3, xmm3, 232                 # xmm3 = xmm3[0,2,2,3]
 36084  	pxor	xmm3, xmm5
 36085  	movaps	xmm4, xmmword ptr [rip + .LCPI4_16] # xmm4 = <1,1,u,u>
 36086  	blendvps	xmm2, xmm4, xmm0        # xmm0 is the implicit mask: take 1 where the input was > 0
 36087  	movdqa	xmm0, xmm1                      # load the second mask into the implicit blend register
 36088  	blendvps	xmm3, xmm4, xmm0
 36089  	movlhps	xmm2, xmm3                      # xmm2 = xmm2[0],xmm3[0]
 36090  	movups	xmmword ptr [r8 + 4*rsi], xmm2  # four 32-bit results
 36091  .LBB4_1126:
 36092  	cmp	rdx, r10                        # rdx = next index for the scalar loop, r10 = end index
 36093  	je	.LBB4_1655
 36094  .LBB4_1127:
 36095  	mov	esi, 1                          # esi = 1, the value selected for positive inputs
 36096  .LBB4_1128:                             # =>This Inner Loop Header: Depth=1
 36097  	mov	rdi, qword ptr [rcx + 8*rdx]
 36098  	xor	eax, eax
 36099  	test	rdi, rdi
 36100  	setne	al
 36101  	neg	eax                             # eax = 0 for zero input, -1 for nonzero input
 36102  	test	rdi, rdi                        # recompute flags; neg overwrote them
 36103  	cmovg	eax, esi                        # signed > 0 -> 1
 36104  	mov	dword ptr [r8 + 4*rdx], eax
 36105  	add	rdx, 1
 36106  	cmp	r10, rdx
 36107  	jne	.LBB4_1128
 36108  	jmp	.LBB4_1655
 36109  .LBB4_1129:                             # tail: float source -> 32-bit dest; stores +/-1 (by sign bit) converted to an integer, or 0 when the input compares equal to 0
 36110  	xor	esi, esi                        # rsi = element index used by the vector step below
 36111  .LBB4_1130:
 36112  	test	r9b, 1                          # bit 0 of r9 gates one 4-element vector step (r9 is set outside this excerpt)
 36113  	je	.LBB4_1132
 36114  # %bb.1131:
 36115  	movups	xmm0, xmmword ptr [rcx + 4*rsi]
 36116  	xorps	xmm1, xmm1
 36117  	cmpneqps	xmm1, xmm0              # all-ones in lanes where the input != 0.0
 36118  	psrad	xmm0, 31                        # arithmetic shift: all-ones where the sign bit is set, else 0
 36119  	por	xmm0, xmmword ptr [rip + .LCPI4_8] # OR with the constant annotated as [1,1,1,1] elsewhere in this file -> integer -1 or +1
 36120  	cvtdq2ps	xmm2, xmm0              # xmm2 = -1.0f or +1.0f per lane
 36121  	movaps	xmm3, xmmword ptr [rip + .LCPI4_10] # xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
 36122  	movaps	xmm0, xmm2
 36123  	cmpltps	xmm0, xmm3                      # mask: value below the 2^31 threshold
 36124  	cvttps2dq	xmm4, xmm2              # direct truncation, used below the threshold
 36125  	subps	xmm2, xmm3
 36126  	cvttps2dq	xmm2, xmm2              # biased truncation, used at or above the threshold
 36127  	xorps	xmm2, xmmword ptr [rip + .LCPI4_4] # NOTE(review): .LCPI4_4 presumably holds 0x80000000 per lane to undo the bias - confirm
 36128  	blendvps	xmm2, xmm4, xmm0        # xmm0 is the implicit mask: pick the direct truncation where value < threshold
 36129  	andps	xmm1, xmm2                      # force the result to 0 for zero inputs
 36130  	movups	xmmword ptr [r8 + 4*rsi], xmm1
 36131  .LBB4_1132:
 36132  	cmp	rdx, rax                        # rdx = next index for the scalar loop, rax = end index
 36133  	je	.LBB4_1655
 36134  .LBB4_1133:
 36135  	xorps	xmm0, xmm0                      # xmm0 = 0.0 for the zero comparison
 36136  	jmp	.LBB4_1135
 36137  .LBB4_1134:                             #   in Loop: Header=BB4_1135 Depth=1
 36138  	mov	dword ptr [r8 + 4*rdx], esi     # store the low 32 bits of the result for element rdx
 36139  	add	rdx, 1
 36140  	cmp	rax, rdx
 36141  	je	.LBB4_1655
 36142  .LBB4_1135:                             # =>This Inner Loop Header: Depth=1
 36143  	movss	xmm1, dword ptr [rcx + 4*rdx]   # xmm1 = mem[0],zero,zero,zero
 36144  	xor	esi, esi                        # default result 0; executed before ucomiss, so the branch sees the compare's flags
 36145  	ucomiss	xmm0, xmm1
 36146  	je	.LBB4_1134                      # taken when ZF is set (equal; ucomiss also sets ZF for unordered operands)
 36147  # %bb.1136:                             #   in Loop: Header=BB4_1135 Depth=1
 36148  	movmskps	esi, xmm1               # collect the lane sign bits
 36149  	and	esi, 1                          # keep the sign bit of lane 0
 36150  	neg	esi
 36151  	or	esi, 1                          # esi = +1 (sign clear) or -1 (sign set)
 36152  	xorps	xmm1, xmm1                      # clear xmm1 before the scalar convert writes its low lane
 36153  	cvtsi2ss	xmm1, esi
 36154  	cvttss2si	rsi, xmm1               # 64-bit truncation; only esi is stored
 36155  	jmp	.LBB4_1134
 36156  .LBB4_1137:                             # 32-bit input -> 16-bit (input != 0) flag
 36157  	xor	esi, esi                        # vector element index = 0
 36158  .LBB4_1138:
 36159  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 8-element vector step
 36160  	je	.LBB4_1140
 36161  # %bb.1139:
 36162  	movdqu	xmm0, xmmword ptr [rcx + 4*rsi]
 36163  	movdqu	xmm1, xmmword ptr [rcx + 4*rsi + 16]
 36164  	pxor	xmm2, xmm2
 36165  	pcmpeqd	xmm0, xmm2                      # lanes equal to zero
 36166  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones
 36167  	pxor	xmm0, xmm3                      # invert: lanes != 0
 36168  	packssdw	xmm0, xmm0                      # narrow the 32-bit masks to 16-bit
 36169  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_11] # xmm4 = <1,1,1,1,u,u,u,u>
 36170  	pand	xmm0, xmm4                      # mask -> 0 or 1
 36171  	pcmpeqd	xmm1, xmm2
 36172  	pxor	xmm1, xmm3
 36173  	packssdw	xmm1, xmm1
 36174  	pand	xmm1, xmm4
 36175  	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
 36176  	movdqu	xmmword ptr [r8 + 2*rsi], xmm0  # store 8 x 16-bit results
 36177  .LBB4_1140:
 36178  	cmp	rdx, rax                        # scalar start index == loop bound -> exit
 36179  	je	.LBB4_1655
 36180  .LBB4_1141:                             # =>This Inner Loop Header: Depth=1
 36181  	xor	esi, esi
 36182  	cmp	dword ptr [rcx + 4*rdx], 0
 36183  	setne	sil                             # esi = (input != 0)
 36184  	mov	word ptr [r8 + 2*rdx], si       # store as a 16-bit element
 36185  	add	rdx, 1
 36186  	cmp	rax, rdx
 36187  	jne	.LBB4_1141
 36188  	jmp	.LBB4_1655
 36189  .LBB4_1142:                             # 32-bit input -> 16-bit (input != 0) flag
 36190  	xor	esi, esi                        # vector element index = 0
 36191  .LBB4_1143:
 36192  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 8-element vector step
 36193  	je	.LBB4_1145
 36194  # %bb.1144:
 36195  	movdqu	xmm0, xmmword ptr [rcx + 4*rsi]
 36196  	movdqu	xmm1, xmmword ptr [rcx + 4*rsi + 16]
 36197  	pxor	xmm2, xmm2
 36198  	pcmpeqd	xmm0, xmm2                      # lanes equal to zero
 36199  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones
 36200  	pxor	xmm0, xmm3                      # invert: lanes != 0
 36201  	packssdw	xmm0, xmm0                      # narrow the 32-bit masks to 16-bit
 36202  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_11] # xmm4 = <1,1,1,1,u,u,u,u>
 36203  	pand	xmm0, xmm4                      # mask -> 0 or 1
 36204  	pcmpeqd	xmm1, xmm2
 36205  	pxor	xmm1, xmm3
 36206  	packssdw	xmm1, xmm1
 36207  	pand	xmm1, xmm4
 36208  	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
 36209  	movdqu	xmmword ptr [r8 + 2*rsi], xmm0  # store 8 x 16-bit results
 36210  .LBB4_1145:
 36211  	cmp	rdx, rax                        # scalar start index == loop bound -> exit
 36212  	je	.LBB4_1655
 36213  .LBB4_1146:                             # =>This Inner Loop Header: Depth=1
 36214  	xor	esi, esi
 36215  	cmp	dword ptr [rcx + 4*rdx], 0
 36216  	setne	sil                             # esi = (input != 0)
 36217  	mov	word ptr [r8 + 2*rdx], si       # store as a 16-bit element
 36218  	add	rdx, 1
 36219  	cmp	rax, rdx
 36220  	jne	.LBB4_1146
 36221  	jmp	.LBB4_1655
 36222  .LBB4_1147:                             # 64-bit float input -> 16-bit integer sign value
 36223  	xor	edi, edi                        # vector element index = 0
 36224  .LBB4_1148:
 36225  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 4-element vector step
 36226  	je	.LBB4_1150
 36227  # %bb.1149:
 36228  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 36229  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 36230  	xorpd	xmm4, xmm4
 36231  	movapd	xmm0, xmm2
 36232  	cmpeqpd	xmm0, xmm4                      # mask: input == 0.0 (first pair)
 36233  	packssdw	xmm0, xmm0
 36234  	packssdw	xmm0, xmm0                      # mask narrowed to 16-bit lanes
 36235  	movapd	xmm1, xmm3
 36236  	cmpeqpd	xmm1, xmm4                      # mask: input == 0.0 (second pair)
 36237  	packssdw	xmm1, xmm1
 36238  	packssdw	xmm1, xmm1
 36239  	movapd	xmm5, xmmword ptr [rip + .LCPI4_0] # xmm5 = [-0.0E+0,-0.0E+0]
 36240  	andpd	xmm2, xmm5                      # keep only the sign bit
 36241  	movapd	xmm6, xmmword ptr [rip + .LCPI4_1] # xmm6 = [1.0E+0,1.0E+0]
 36242  	orpd	xmm2, xmm6                      # sign bit | 1.0 -> +1.0 or -1.0
 36243  	andpd	xmm3, xmm5
 36244  	orpd	xmm3, xmm6
 36245  	cvttpd2dq	xmm2, xmm2                      # to 32-bit integers
 36246  	cvttpd2dq	xmm3, xmm3
 36247  	pshuflw	xmm2, xmm2, 232                 # xmm2 = xmm2[0,2,2,3,4,5,6,7]
 36248  	pshuflw	xmm3, xmm3, 232                 # xmm3 = xmm3[0,2,2,3,4,5,6,7]
 36249  	pblendvb	xmm2, xmm4, xmm0                # zero where the input was 0.0 (mask is implicit xmm0)
 36250  	movdqa	xmm0, xmm1
 36251  	pblendvb	xmm3, xmm4, xmm0
 36252  	movd	dword ptr [r8 + 2*rdi], xmm2    # store 2 x 16-bit results
 36253  	movd	dword ptr [r8 + 2*rdi + 4], xmm3 # store the next 2 x 16-bit results
 36254  .LBB4_1150:
 36255  	cmp	rsi, rax                        # scalar start index == loop bound -> exit
 36256  	je	.LBB4_1655
 36257  .LBB4_1151:
 36258  	pxor	xmm0, xmm0                      # xmm0 = 0.0 for the comparisons below
 36259  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 36260  	movsd	xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero
 36261  .LBB4_1152:                             # =>This Inner Loop Header: Depth=1
 36262  	movsd	xmm3, qword ptr [rcx + 8*rsi]   # xmm3 = mem[0],zero
 36263  	ucomisd	xmm0, xmm3                      # flags stay live until the cmove (SSE ops below leave EFLAGS alone)
 36264  	andpd	xmm3, xmm1                      # keep only the sign bit
 36265  	orpd	xmm3, xmm2                      # combine with .LCPI4_2 (presumably 1.0 -- value not visible here)
 36266  	cvttsd2si	edx, xmm3
 36267  	cmove	edx, r10d                       # input compared equal to 0.0 -> use r10d (set outside this excerpt; presumably 0)
 36268  	mov	word ptr [r8 + 2*rsi], dx       # store as a 16-bit element
 36269  	add	rsi, 1
 36270  	cmp	rax, rsi
 36271  	jne	.LBB4_1152
 36272  	jmp	.LBB4_1655
 36273  .LBB4_1153:                             # 64-bit float input -> 16-bit integer sign value
 36274  	xor	edi, edi                        # vector element index = 0
 36275  .LBB4_1154:
 36276  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 4-element vector step
 36277  	je	.LBB4_1156
 36278  # %bb.1155:
 36279  	movupd	xmm2, xmmword ptr [rcx + 8*rdi]
 36280  	movupd	xmm3, xmmword ptr [rcx + 8*rdi + 16]
 36281  	xorpd	xmm4, xmm4
 36282  	movapd	xmm0, xmm2
 36283  	cmpeqpd	xmm0, xmm4                      # mask: input == 0.0 (first pair)
 36284  	packssdw	xmm0, xmm0
 36285  	packssdw	xmm0, xmm0                      # mask narrowed to 16-bit lanes
 36286  	movapd	xmm1, xmm3
 36287  	cmpeqpd	xmm1, xmm4                      # mask: input == 0.0 (second pair)
 36288  	packssdw	xmm1, xmm1
 36289  	packssdw	xmm1, xmm1
 36290  	movapd	xmm5, xmmword ptr [rip + .LCPI4_0] # xmm5 = [-0.0E+0,-0.0E+0]
 36291  	andpd	xmm2, xmm5                      # keep only the sign bit
 36292  	movapd	xmm6, xmmword ptr [rip + .LCPI4_1] # xmm6 = [1.0E+0,1.0E+0]
 36293  	orpd	xmm2, xmm6                      # sign bit | 1.0 -> +1.0 or -1.0
 36294  	andpd	xmm3, xmm5
 36295  	orpd	xmm3, xmm6
 36296  	cvttpd2dq	xmm2, xmm2                      # to 32-bit integers
 36297  	cvttpd2dq	xmm3, xmm3
 36298  	pshuflw	xmm2, xmm2, 232                 # xmm2 = xmm2[0,2,2,3,4,5,6,7]
 36299  	pshuflw	xmm3, xmm3, 232                 # xmm3 = xmm3[0,2,2,3,4,5,6,7]
 36300  	pblendvb	xmm2, xmm4, xmm0                # zero where the input was 0.0 (mask is implicit xmm0)
 36301  	movdqa	xmm0, xmm1
 36302  	pblendvb	xmm3, xmm4, xmm0
 36303  	movd	dword ptr [r8 + 2*rdi], xmm2    # store 2 x 16-bit results
 36304  	movd	dword ptr [r8 + 2*rdi + 4], xmm3 # store the next 2 x 16-bit results
 36305  .LBB4_1156:
 36306  	cmp	rsi, rax                        # scalar start index == loop bound -> exit
 36307  	je	.LBB4_1655
 36308  .LBB4_1157:
 36309  	pxor	xmm0, xmm0                      # xmm0 = 0.0 for the comparisons below
 36310  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 36311  	movsd	xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero
 36312  .LBB4_1158:                             # =>This Inner Loop Header: Depth=1
 36313  	movsd	xmm3, qword ptr [rcx + 8*rsi]   # xmm3 = mem[0],zero
 36314  	ucomisd	xmm0, xmm3                      # flags stay live until the cmove (SSE ops below leave EFLAGS alone)
 36315  	andpd	xmm3, xmm1                      # keep only the sign bit
 36316  	orpd	xmm3, xmm2                      # combine with .LCPI4_2 (presumably 1.0 -- value not visible here)
 36317  	cvttsd2si	edx, xmm3
 36318  	cmove	edx, r10d                       # input compared equal to 0.0 -> use r10d (set outside this excerpt; presumably 0)
 36319  	mov	word ptr [r8 + 2*rsi], dx       # store as a 16-bit element
 36320  	add	rsi, 1
 36321  	cmp	rax, rsi
 36322  	jne	.LBB4_1158
 36323  	jmp	.LBB4_1655
 36324  .LBB4_1159:                             # 64-bit signed input -> 16-bit sign value
 36325  	xor	esi, esi                        # vector element index = 0
 36326  .LBB4_1160:
 36327  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 4-element vector step
 36328  	je	.LBB4_1162
 36329  # %bb.1161:
 36330  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi]
 36331  	movdqu	xmm3, xmmword ptr [rcx + 8*rsi + 16]
 36332  	pxor	xmm4, xmm4
 36333  	movdqa	xmm0, xmm2
 36334  	pcmpgtq	xmm0, xmm4                      # mask: input > 0 (signed)
 36335  	packssdw	xmm0, xmm0
 36336  	packssdw	xmm0, xmm0                      # mask narrowed to 16-bit lanes
 36337  	movdqa	xmm1, xmm3
 36338  	pcmpgtq	xmm1, xmm4
 36339  	packssdw	xmm1, xmm1
 36340  	packssdw	xmm1, xmm1
 36341  	pcmpeqq	xmm2, xmm4                      # lanes equal to zero
 36342  	pcmpeqd	xmm5, xmm5                      # xmm5 = all ones
 36343  	pxor	xmm2, xmm5                      # invert: -1 where input != 0, else 0
 36344  	packssdw	xmm2, xmm2
 36345  	packssdw	xmm2, xmm2
 36346  	pcmpeqq	xmm3, xmm4
 36347  	pxor	xmm3, xmm5
 36348  	packssdw	xmm3, xmm3
 36349  	packssdw	xmm3, xmm3
 36350  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_17] # xmm4 = <1,1,u,u,u,u,u,u>
 36351  	pblendvb	xmm2, xmm4, xmm0                # +1 where input > 0, otherwise keep -1 / 0
 36352  	movdqa	xmm0, xmm1
 36353  	pblendvb	xmm3, xmm4, xmm0
 36354  	movd	dword ptr [r8 + 2*rsi], xmm2    # store 2 x 16-bit results
 36355  	movd	dword ptr [r8 + 2*rsi + 4], xmm3 # store the next 2 x 16-bit results
 36356  .LBB4_1162:
 36357  	cmp	rdx, r10                        # scalar start index == loop bound -> exit
 36358  	je	.LBB4_1655
 36359  .LBB4_1163:
 36360  	mov	esi, 1                          # esi = +1, selected by cmovg below
 36361  .LBB4_1164:                             # =>This Inner Loop Header: Depth=1
 36362  	mov	rdi, qword ptr [rcx + 8*rdx]
 36363  	xor	eax, eax
 36364  	test	rdi, rdi
 36365  	setne	al                              # eax = (input != 0)
 36366  	neg	eax                             # eax = 0 or -1
 36367  	test	rdi, rdi
 36368  	cmovg	eax, esi                        # input > 0 (signed) -> +1
 36369  	mov	word ptr [r8 + 2*rdx], ax       # store as a 16-bit element
 36370  	add	rdx, 1
 36371  	cmp	r10, rdx
 36372  	jne	.LBB4_1164
 36373  	jmp	.LBB4_1655
 36374  .LBB4_1165:                             # 32-bit float input -> 16-bit integer sign value
 36375  	xor	edi, edi                        # vector element index = 0
 36376  .LBB4_1166:
 36377  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 8-element vector step
 36378  	je	.LBB4_1168
 36379  # %bb.1167:
 36380  	movups	xmm0, xmmword ptr [rcx + 4*rdi]
 36381  	movups	xmm1, xmmword ptr [rcx + 4*rdi + 16]
 36382  	xorps	xmm4, xmm4
 36383  	movaps	xmm2, xmm0
 36384  	cmpeqps	xmm2, xmm4                      # mask: input == 0.0 (first four)
 36385  	packssdw	xmm2, xmm2                      # mask narrowed to 16-bit lanes
 36386  	movaps	xmm3, xmm1
 36387  	cmpeqps	xmm3, xmm4                      # mask: input == 0.0 (second four)
 36388  	packssdw	xmm3, xmm3
 36389  	pcmpeqd	xmm5, xmm5                      # xmm5 = all ones (-1 in every lane)
 36390  	pcmpgtd	xmm0, xmm5                      # raw bits > -1 as signed ints, i.e. sign bit clear
 36391  	packssdw	xmm0, xmm0
 36392  	pcmpgtd	xmm1, xmm5
 36393  	packssdw	xmm1, xmm1
 36394  	movdqa	xmm6, xmmword ptr [rip + .LCPI4_11] # xmm6 = <1,1,1,1,u,u,u,u>
 36395  	pcmpeqd	xmm7, xmm7                      # xmm7 = -1 in every lane
 36396  	pblendvb	xmm7, xmm6, xmm0                # +1 where the sign bit is clear, else -1
 36397  	movdqa	xmm0, xmm1
 36398  	pblendvb	xmm5, xmm6, xmm0                # same for the second four (xmm5 starts as -1)
 36399  	movdqa	xmm0, xmm2
 36400  	pblendvb	xmm7, xmm4, xmm0                # zero where the input was 0.0
 36401  	movdqa	xmm0, xmm3
 36402  	pblendvb	xmm5, xmm4, xmm0
 36403  	punpcklqdq	xmm7, xmm5              # xmm7 = xmm7[0],xmm5[0]
 36404  	movdqu	xmmword ptr [r8 + 2*rdi], xmm7  # store 8 x 16-bit results
 36405  .LBB4_1168:
 36406  	cmp	rsi, rax                        # scalar start index == loop bound -> exit
 36407  	je	.LBB4_1655
 36408  .LBB4_1169:
 36409  	pxor	xmm0, xmm0                      # xmm0 = 0.0 for the comparisons below
 36410  .LBB4_1170:                             # =>This Inner Loop Header: Depth=1
 36411  	movd	xmm1, dword ptr [rcx + 4*rsi]   # xmm1 = mem[0],zero,zero,zero
 36412  	movd	edx, xmm1                       # raw float bits
 36413  	xor	edi, edi
 36414  	test	edx, edx
 36415  	setns	dil                             # edi = 1 if the sign bit is clear
 36416  	ucomiss	xmm0, xmm1
 36417  	lea	edx, [rdi + rdi - 1]            # +1 or -1; lea leaves the ucomiss flags intact
 36418  	cmove	edx, r10d                       # input compared equal to 0.0 -> use r10d (set outside this excerpt; presumably 0)
 36419  	mov	word ptr [r8 + 2*rsi], dx       # store as a 16-bit element
 36420  	add	rsi, 1
 36421  	cmp	rax, rsi
 36422  	jne	.LBB4_1170
 36423  	jmp	.LBB4_1655
 36424  .LBB4_1171:                             # 32-bit float input -> 16-bit integer sign value
 36425  	xor	edi, edi                        # vector element index = 0
 36426  .LBB4_1172:
 36427  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 8-element vector step
 36428  	je	.LBB4_1174
 36429  # %bb.1173:
 36430  	movups	xmm0, xmmword ptr [rcx + 4*rdi]
 36431  	movups	xmm1, xmmword ptr [rcx + 4*rdi + 16]
 36432  	xorps	xmm4, xmm4
 36433  	movaps	xmm2, xmm0
 36434  	cmpeqps	xmm2, xmm4                      # mask: input == 0.0 (first four)
 36435  	packssdw	xmm2, xmm2                      # mask narrowed to 16-bit lanes
 36436  	movaps	xmm3, xmm1
 36437  	cmpeqps	xmm3, xmm4                      # mask: input == 0.0 (second four)
 36438  	packssdw	xmm3, xmm3
 36439  	pcmpeqd	xmm5, xmm5                      # xmm5 = all ones (-1 in every lane)
 36440  	pcmpgtd	xmm0, xmm5                      # raw bits > -1 as signed ints, i.e. sign bit clear
 36441  	packssdw	xmm0, xmm0
 36442  	pcmpgtd	xmm1, xmm5
 36443  	packssdw	xmm1, xmm1
 36444  	movdqa	xmm6, xmmword ptr [rip + .LCPI4_11] # xmm6 = <1,1,1,1,u,u,u,u>
 36445  	pcmpeqd	xmm7, xmm7                      # xmm7 = -1 in every lane
 36446  	pblendvb	xmm7, xmm6, xmm0                # +1 where the sign bit is clear, else -1
 36447  	movdqa	xmm0, xmm1
 36448  	pblendvb	xmm5, xmm6, xmm0                # same for the second four (xmm5 starts as -1)
 36449  	movdqa	xmm0, xmm2
 36450  	pblendvb	xmm7, xmm4, xmm0                # zero where the input was 0.0
 36451  	movdqa	xmm0, xmm3
 36452  	pblendvb	xmm5, xmm4, xmm0
 36453  	punpcklqdq	xmm7, xmm5              # xmm7 = xmm7[0],xmm5[0]
 36454  	movdqu	xmmword ptr [r8 + 2*rdi], xmm7  # store 8 x 16-bit results
 36455  .LBB4_1174:
 36456  	cmp	rsi, rax                        # scalar start index == loop bound -> exit
 36457  	je	.LBB4_1655
 36458  .LBB4_1175:
 36459  	pxor	xmm0, xmm0                      # xmm0 = 0.0 for the comparisons below
 36460  .LBB4_1176:                             # =>This Inner Loop Header: Depth=1
 36461  	movd	xmm1, dword ptr [rcx + 4*rsi]   # xmm1 = mem[0],zero,zero,zero
 36462  	movd	edx, xmm1                       # raw float bits
 36463  	xor	edi, edi
 36464  	test	edx, edx
 36465  	setns	dil                             # edi = 1 if the sign bit is clear
 36466  	ucomiss	xmm0, xmm1
 36467  	lea	edx, [rdi + rdi - 1]            # +1 or -1; lea leaves the ucomiss flags intact
 36468  	cmove	edx, r10d                       # input compared equal to 0.0 -> use r10d (set outside this excerpt; presumably 0)
 36469  	mov	word ptr [r8 + 2*rsi], dx       # store as a 16-bit element
 36470  	add	rsi, 1
 36471  	cmp	rax, rsi
 36472  	jne	.LBB4_1176
 36473  	jmp	.LBB4_1655
 36474  .LBB4_1177:                             # 32-bit input -> 64-bit (input != 0) flag
 36475  	xor	esi, esi                        # vector element index = 0
 36476  .LBB4_1178:
 36477  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 4-element vector step
 36478  	je	.LBB4_1180
 36479  # %bb.1179:
 36480  	movq	xmm0, qword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero
 36481  	movq	xmm1, qword ptr [rcx + 4*rsi + 8] # xmm1 = mem[0],zero
 36482  	pxor	xmm2, xmm2
 36483  	pcmpeqd	xmm0, xmm2                      # lanes equal to zero
 36484  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones
 36485  	pxor	xmm0, xmm3                      # invert: lanes != 0
 36486  	pmovzxdq	xmm0, xmm0                      # xmm0 = xmm0[0],zero,xmm0[1],zero
 36487  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 36488  	pand	xmm0, xmm4                      # mask -> 0 or 1 per 64-bit lane
 36489  	pcmpeqd	xmm1, xmm2
 36490  	pxor	xmm1, xmm3
 36491  	pmovzxdq	xmm1, xmm1                      # xmm1 = xmm1[0],zero,xmm1[1],zero
 36492  	pand	xmm1, xmm4
 36493  	movdqu	xmmword ptr [r8 + 8*rsi], xmm0  # store 2 x 64-bit results
 36494  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm1 # store the next 2 x 64-bit results
 36495  .LBB4_1180:
 36496  	cmp	rdx, rax                        # scalar start index == loop bound -> exit
 36497  	je	.LBB4_1655
 36498  .LBB4_1181:                             # =>This Inner Loop Header: Depth=1
 36499  	xor	esi, esi
 36500  	cmp	dword ptr [rcx + 4*rdx], 0
 36501  	setne	sil                             # rsi = (input != 0)
 36502  	mov	qword ptr [r8 + 8*rdx], rsi     # store as a 64-bit element
 36503  	add	rdx, 1
 36504  	cmp	rax, rdx
 36505  	jne	.LBB4_1181
 36506  	jmp	.LBB4_1655
 36507  .LBB4_1182:                             # 32-bit integer input -> 32-bit float (input != 0) flag
 36508  	xor	esi, esi                        # vector element index = 0
 36509  .LBB4_1183:
 36510  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 8-element vector step
 36511  	je	.LBB4_1185
 36512  # %bb.1184:
 36513  	movdqu	xmm0, xmmword ptr [rcx + 4*rsi]
 36514  	movdqu	xmm1, xmmword ptr [rcx + 4*rsi + 16]
 36515  	pxor	xmm2, xmm2
 36516  	pcmpeqd	xmm0, xmm2                      # lanes equal to zero
 36517  	movdqa	xmm3, xmmword ptr [rip + .LCPI4_19] # xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 36518  	pandn	xmm0, xmm3                      # (~mask) & 1.0 -> 1.0 where input != 0, else 0.0
 36519  	pcmpeqd	xmm1, xmm2
 36520  	pandn	xmm1, xmm3
 36521  	movdqu	xmmword ptr [r8 + 4*rsi], xmm0  # store 4 x 32-bit results
 36522  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm1 # store the next 4 x 32-bit results
 36523  .LBB4_1185:
 36524  	cmp	rdx, rax                        # scalar start index == loop bound -> exit
 36525  	je	.LBB4_1655
 36526  .LBB4_1186:
 36527  	movd	xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero
 36528  	jmp	.LBB4_1188
 36529  .LBB4_1187:                             #   in Loop: Header=BB4_1188 Depth=1
 36530  	movd	dword ptr [r8 + 4*rdx], xmm1    # store the selected 32-bit value
 36531  	add	rdx, 1
 36532  	cmp	rax, rdx
 36533  	je	.LBB4_1655
 36534  .LBB4_1188:                             # =>This Inner Loop Header: Depth=1
 36535  	cmp	dword ptr [rcx + 4*rdx], 0
 36536  	movdqa	xmm1, xmm0                      # default: the .LCPI4_5 constant (presumably 1.0f -- value not visible here)
 36537  	jne	.LBB4_1187
 36538  # %bb.1189:                             #   in Loop: Header=BB4_1188 Depth=1
 36539  	pxor	xmm1, xmm1                      # input == 0 -> result 0
 36540  	jmp	.LBB4_1187
 36541  .LBB4_1190:                             # 64-bit float input -> 64-bit integer sign value
 36542  	xor	esi, esi                        # vector element index = 0
 36543  .LBB4_1191:
 36544  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 4-element vector step
 36545  	je	.LBB4_1193
 36546  # %bb.1192:
 36547  	movupd	xmm0, xmmword ptr [rcx + 8*rsi]
 36548  	movupd	xmm1, xmmword ptr [rcx + 8*rsi + 16]
 36549  	xorpd	xmm2, xmm2
 36550  	movapd	xmm3, xmmword ptr [rip + .LCPI4_0] # xmm3 = [-0.0E+0,-0.0E+0]
 36551  	movapd	xmm4, xmm0
 36552  	andpd	xmm4, xmm3                      # keep only the sign bit
 36553  	movapd	xmm5, xmmword ptr [rip + .LCPI4_1] # xmm5 = [1.0E+0,1.0E+0]
 36554  	orpd	xmm4, xmm5                      # sign bit | 1.0 -> +1.0 or -1.0
 36555  	andpd	xmm3, xmm1
 36556  	orpd	xmm3, xmm5
 36557  	cvttsd2si	rdi, xmm4                       # convert lane 0 to a 64-bit integer
 36558  	movq	xmm5, rdi
 36559  	pshufd	xmm4, xmm4, 238                 # xmm4 = xmm4[2,3,2,3]
 36560  	cvttsd2si	rdi, xmm4                       # convert lane 1
 36561  	movq	xmm4, rdi
 36562  	punpcklqdq	xmm5, xmm4              # xmm5 = xmm5[0],xmm4[0]
 36563  	cvttsd2si	rdi, xmm3
 36564  	movq	xmm4, rdi
 36565  	pshufd	xmm3, xmm3, 238                 # xmm3 = xmm3[2,3,2,3]
 36566  	cvttsd2si	rdi, xmm3
 36567  	movq	xmm3, rdi
 36568  	punpcklqdq	xmm4, xmm3              # xmm4 = xmm4[0],xmm3[0]
 36569  	cmpneqpd	xmm0, xmm2                      # mask: input != 0.0
 36570  	andpd	xmm0, xmm5                      # zero the lanes whose input was 0.0
 36571  	cmpneqpd	xmm1, xmm2
 36572  	andpd	xmm1, xmm4
 36573  	movupd	xmmword ptr [r8 + 8*rsi], xmm0  # store 2 x 64-bit results
 36574  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm1 # store the next 2 x 64-bit results
 36575  .LBB4_1193:
 36576  	cmp	rdx, rax                        # scalar start index == loop bound -> exit
 36577  	je	.LBB4_1655
 36578  .LBB4_1194:
 36579  	xor	esi, esi                        # rsi = 0, the result selected by cmove below
 36580  	xorpd	xmm0, xmm0                      # xmm0 = 0.0 for the comparisons below
 36581  	movapd	xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0]
 36582  	movsd	xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero
 36583  .LBB4_1195:                             # =>This Inner Loop Header: Depth=1
 36584  	movsd	xmm3, qword ptr [rcx + 8*rdx]   # xmm3 = mem[0],zero
 36585  	ucomisd	xmm0, xmm3                      # flags stay live until the cmove (SSE ops below leave EFLAGS alone)
 36586  	andpd	xmm3, xmm1                      # keep only the sign bit
 36587  	orpd	xmm3, xmm2                      # combine with .LCPI4_2 (presumably 1.0 -- value not visible here)
 36588  	cvttsd2si	rdi, xmm3
 36589  	cmove	rdi, rsi                        # input compared equal to 0.0 -> 0
 36590  	mov	qword ptr [r8 + 8*rdx], rdi     # store as a 64-bit element
 36591  	add	rdx, 1
 36592  	cmp	rax, rdx
 36593  	jne	.LBB4_1195
 36594  	jmp	.LBB4_1655
 36595  .LBB4_1196:                             # 64-bit float input -> 32-bit float sign value
 36596  	xor	esi, esi                        # vector element index = 0
 36597  .LBB4_1197:
 36598  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 4-element vector step
 36599  	je	.LBB4_1199
 36600  # %bb.1198:
 36601  	movupd	xmm2, xmmword ptr [rcx + 8*rsi]
 36602  	movupd	xmm8, xmmword ptr [rcx + 8*rsi + 16]
 36603  	xorps	xmm0, xmm0
 36604  	cvtsd2ss	xmm3, xmm2                      # element 0 narrowed to float
 36605  	cmpeqpd	xmm2, xmm0                      # mask: elements 0,1 == 0.0
 36606  	shufps	xmm2, xmm2, 232                 # xmm2 = xmm2[0,2,2,3]
 36607  	cvtpd2ps	xmm4, xmmword ptr [rip + .LCPI4_1] # the [1.0,1.0] constant narrowed to float
 36608  	cmpeqpd	xmm0, xmm8                      # mask: elements 2,3 == 0.0
 36609  	movsd	xmm5, qword ptr [rcx + 8*rsi + 8] # xmm5 = mem[0],zero
 36610  	cvtsd2ss	xmm5, xmm5                      # element 1 narrowed to float
 36611  	shufps	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
 36612  	movaps	xmm6, xmmword ptr [rip + .LCPI4_3] # xmm6 = [NaN,NaN,NaN,NaN]
 36613  	movaps	xmm7, xmm6
 36614  	andnps	xmm7, xmm5                      # bits of element 1 outside the .LCPI4_3 mask (presumably just the sign bit -- TODO confirm constant)
 36615  	movshdup	xmm5, xmm4                      # xmm5 = xmm4[1,1,3,3]
 36616  	andps	xmm5, xmm6                      # bits of 1.0f inside the mask
 36617  	orps	xmm7, xmm5                      # merge the two: 1.0f carrying element 1's sign
 36618  	movaps	xmm1, xmm6
 36619  	andnps	xmm1, xmm3                      # same merge for element 0
 36620  	andps	xmm4, xmm6
 36621  	orps	xmm1, xmm4
 36622  	unpcklps	xmm1, xmm7                      # xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
 36623  	andnps	xmm2, xmm1                      # zero the lanes whose input was 0.0
 36624  	movsd	xmm1, qword ptr [rcx + 8*rsi + 24] # xmm1 = mem[0],zero
 36625  	cvtsd2ss	xmm1, xmm1                      # element 3 narrowed to float
 36626  	movaps	xmm3, xmm6
 36627  	andnps	xmm3, xmm1
 36628  	orps	xmm3, xmm5
 36629  	xorps	xmm1, xmm1
 36630  	cvtsd2ss	xmm1, xmm8                      # element 2 narrowed to float
 36631  	andnps	xmm6, xmm1
 36632  	orps	xmm6, xmm4
 36633  	unpcklps	xmm6, xmm3                      # xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
 36634  	andnps	xmm0, xmm6                      # zero the lanes whose input was 0.0
 36635  	movlhps	xmm2, xmm0                      # xmm2 = xmm2[0],xmm0[0]
 36636  	movups	xmmword ptr [r8 + 4*rsi], xmm2  # store 4 x 32-bit results
 36637  .LBB4_1199:
 36638  	cmp	rdx, rax                        # scalar start index == loop bound -> exit
 36639  	je	.LBB4_1655
 36640  .LBB4_1200:
 36641  	xorps	xmm0, xmm0                      # xmm0 = 0.0 for the comparisons below
 36642  	movaps	xmm1, xmmword ptr [rip + .LCPI4_4] # xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 36643  	movss	xmm2, dword ptr [rip + .LCPI4_5] # xmm2 = mem[0],zero,zero,zero
 36644  	jmp	.LBB4_1202
 36645  .LBB4_1201:                             #   in Loop: Header=BB4_1202 Depth=1
 36646  	movss	dword ptr [r8 + 4*rdx], xmm3    # store the 32-bit float result
 36647  	add	rdx, 1
 36648  	cmp	rax, rdx
 36649  	je	.LBB4_1655
 36650  .LBB4_1202:                             # =>This Inner Loop Header: Depth=1
 36651  	movsd	xmm4, qword ptr [rcx + 8*rdx]   # xmm4 = mem[0],zero
 36652  	ucomisd	xmm0, xmm4
 36653  	xorps	xmm3, xmm3                      # result 0.0 when the branch below is taken (xorps leaves EFLAGS alone)
 36654  	je	.LBB4_1201
 36655  # %bb.1203:                             #   in Loop: Header=BB4_1202 Depth=1
 36656  	xorps	xmm3, xmm3
 36657  	cvtsd2ss	xmm3, xmm4                      # input narrowed to float
 36658  	andps	xmm3, xmm1                      # keep only the sign bit
 36659  	orps	xmm3, xmm2                      # combine with .LCPI4_5 (presumably 1.0f -- value not visible here)
 36660  	jmp	.LBB4_1201
 36661  .LBB4_1204:                             # 16-bit integer input -> 32-bit float (input != 0) flag
 36662  	xor	esi, esi                        # vector element index = 0
 36663  .LBB4_1205:
 36664  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 8-element vector step
 36665  	je	.LBB4_1207
 36666  # %bb.1206:
 36667  	movq	xmm0, qword ptr [rcx + 2*rsi]   # xmm0 = mem[0],zero
 36668  	movq	xmm1, qword ptr [rcx + 2*rsi + 8] # xmm1 = mem[0],zero
 36669  	pxor	xmm2, xmm2
 36670  	pcmpeqw	xmm0, xmm2                      # lanes equal to zero
 36671  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones
 36672  	pxor	xmm0, xmm3                      # invert: lanes != 0
 36673  	pmovzxwd	xmm0, xmm0                      # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 36674  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 36675  	pand	xmm0, xmm4                      # mask -> integer 0 or 1
 36676  	cvtdq2ps	xmm0, xmm0                      # -> 0.0 or 1.0
 36677  	pcmpeqw	xmm1, xmm2
 36678  	pxor	xmm1, xmm3
 36679  	pmovzxwd	xmm1, xmm1                      # xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 36680  	pand	xmm1, xmm4
 36681  	cvtdq2ps	xmm1, xmm1
 36682  	movups	xmmword ptr [r8 + 4*rsi], xmm0  # store 4 x 32-bit results
 36683  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm1 # store the next 4 x 32-bit results
 36684  .LBB4_1207:
 36685  	cmp	rdx, rax                        # scalar start index == loop bound -> exit
 36686  	je	.LBB4_1655
 36687  .LBB4_1208:
 36688  	movd	xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero
 36689  	jmp	.LBB4_1210
 36690  .LBB4_1209:                             #   in Loop: Header=BB4_1210 Depth=1
 36691  	movd	dword ptr [r8 + 4*rdx], xmm1    # store the selected 32-bit value
 36692  	add	rdx, 1
 36693  	cmp	rax, rdx
 36694  	je	.LBB4_1655
 36695  .LBB4_1210:                             # =>This Inner Loop Header: Depth=1
 36696  	cmp	word ptr [rcx + 2*rdx], 0
 36697  	movdqa	xmm1, xmm0                      # default: the .LCPI4_5 constant (presumably 1.0f -- value not visible here)
 36698  	jne	.LBB4_1209
 36699  # %bb.1211:                             #   in Loop: Header=BB4_1210 Depth=1
 36700  	pxor	xmm1, xmm1                      # input == 0 -> result 0
 36701  	jmp	.LBB4_1209
 36702  .LBB4_1212:                             # 16-bit signed input -> 64-bit sign value
 36703  	xor	esi, esi                        # vector element index = 0
 36704  .LBB4_1213:
 36705  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 4-element vector step
 36706  	je	.LBB4_1215
 36707  # %bb.1214:
 36708  	movd	xmm2, dword ptr [rcx + 2*rsi]   # xmm2 = mem[0],zero,zero,zero
 36709  	movd	xmm3, dword ptr [rcx + 2*rsi + 4] # xmm3 = mem[0],zero,zero,zero
 36710  	xorpd	xmm4, xmm4
 36711  	movdqa	xmm0, xmm2
 36712  	pcmpgtw	xmm0, xmm4                      # mask: input > 0 (signed)
 36713  	pmovsxwq	xmm0, xmm0                      # widen the mask to 64-bit lanes
 36714  	movdqa	xmm1, xmm3
 36715  	pcmpgtw	xmm1, xmm4
 36716  	pmovsxwq	xmm1, xmm1
 36717  	pcmpeqw	xmm2, xmm4                      # lanes equal to zero
 36718  	pcmpeqd	xmm5, xmm5                      # xmm5 = all ones
 36719  	pxor	xmm2, xmm5                      # invert: -1 where input != 0, else 0
 36720  	pmovsxwq	xmm2, xmm2                      # sign-extend to 64-bit: -1 or 0
 36721  	pcmpeqw	xmm3, xmm4
 36722  	pxor	xmm3, xmm5
 36723  	pmovsxwq	xmm3, xmm3
 36724  	movapd	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 36725  	blendvpd	xmm2, xmm4, xmm0                # +1 where input > 0, otherwise keep -1 / 0
 36726  	movdqa	xmm0, xmm1
 36727  	blendvpd	xmm3, xmm4, xmm0
 36728  	movupd	xmmword ptr [r8 + 8*rsi], xmm2  # store 2 x 64-bit results
 36729  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm3 # store the next 2 x 64-bit results
 36730  .LBB4_1215:
 36731  	cmp	rdx, r10                        # scalar start index == loop bound -> exit
 36732  	je	.LBB4_1655
 36733  .LBB4_1216:
 36734  	mov	esi, 1                          # rsi = +1, selected by cmovg below
 36735  .LBB4_1217:                             # =>This Inner Loop Header: Depth=1
 36736  	movzx	edi, word ptr [rcx + 2*rdx]
 36737  	xor	eax, eax
 36738  	test	di, di
 36739  	setne	al                              # rax = (input != 0)
 36740  	neg	rax                             # rax = 0 or -1
 36741  	test	di, di
 36742  	cmovg	rax, rsi                        # input > 0 (signed 16-bit) -> +1
 36743  	mov	qword ptr [r8 + 8*rdx], rax     # store as a 64-bit element
 36744  	add	rdx, 1
 36745  	cmp	r10, rdx
 36746  	jne	.LBB4_1217
 36747  	jmp	.LBB4_1655
 36748  .LBB4_1218:                             # 16-bit signed input -> 32-bit float sign value
 36749  	xor	esi, esi                        # vector element index = 0
 36750  .LBB4_1219:
 36751  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 8-element vector step
 36752  	je	.LBB4_1221
 36753  # %bb.1220:
 36754  	movq	xmm2, qword ptr [rcx + 2*rsi]   # xmm2 = mem[0],zero
 36755  	movq	xmm3, qword ptr [rcx + 2*rsi + 8] # xmm3 = mem[0],zero
 36756  	xorps	xmm4, xmm4
 36757  	movdqa	xmm0, xmm2
 36758  	pcmpgtw	xmm0, xmm4                      # mask: input > 0 (signed)
 36759  	pmovsxwd	xmm0, xmm0                      # widen the mask to 32-bit lanes
 36760  	movdqa	xmm1, xmm3
 36761  	pcmpgtw	xmm1, xmm4
 36762  	pmovsxwd	xmm1, xmm1
 36763  	pcmpeqw	xmm2, xmm4                      # lanes equal to zero
 36764  	pcmpeqd	xmm5, xmm5                      # xmm5 = all ones
 36765  	pxor	xmm2, xmm5                      # invert: -1 where input != 0, else 0
 36766  	pmovsxwd	xmm2, xmm2
 36767  	cvtdq2ps	xmm2, xmm2                      # -> -1.0 or 0.0
 36768  	pcmpeqw	xmm3, xmm4
 36769  	pxor	xmm3, xmm5
 36770  	pmovsxwd	xmm3, xmm3
 36771  	cvtdq2ps	xmm3, xmm3
 36772  	movaps	xmm4, xmmword ptr [rip + .LCPI4_19] # xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 36773  	blendvps	xmm2, xmm4, xmm0                # 1.0 where input > 0, otherwise keep -1.0 / 0.0
 36774  	movdqa	xmm0, xmm1
 36775  	blendvps	xmm3, xmm4, xmm0
 36776  	movups	xmmword ptr [r8 + 4*rsi], xmm2  # store 4 x 32-bit results
 36777  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm3 # store the next 4 x 32-bit results
 36778  .LBB4_1221:
 36779  	cmp	rdx, rax                        # scalar start index == loop bound -> exit
 36780  	je	.LBB4_1655
 36781  .LBB4_1222:
 36782  	movd	xmm0, dword ptr [rip + .LCPI4_14] # xmm0 = mem[0],zero,zero,zero
 36783  	movd	xmm1, dword ptr [rip + .LCPI4_5] # xmm1 = mem[0],zero,zero,zero
 36784  	jmp	.LBB4_1224
 36785  .LBB4_1223:                             #   in Loop: Header=BB4_1224 Depth=1
 36786  	movd	dword ptr [r8 + 4*rdx], xmm3    # store the selected 32-bit value
 36787  	add	rdx, 1
 36788  	cmp	rax, rdx
 36789  	je	.LBB4_1655
 36790  .LBB4_1224:                             # =>This Inner Loop Header: Depth=1
 36791  	cmp	word ptr [rcx + 2*rdx], 0       # flags from this compare feed both jne and jg below
 36792  	movdqa	xmm2, xmm0                      # nonzero candidate: .LCPI4_14 (presumably -1.0f -- value not visible here)
 36793  	jne	.LBB4_1226
 36794  # %bb.1225:                             #   in Loop: Header=BB4_1224 Depth=1
 36795  	pxor	xmm2, xmm2                      # input == 0 -> candidate 0 (pxor leaves EFLAGS alone)
 36796  .LBB4_1226:                             #   in Loop: Header=BB4_1224 Depth=1
 36797  	movdqa	xmm3, xmm1                      # positive candidate: .LCPI4_5 (presumably 1.0f -- value not visible here)
 36798  	jg	.LBB4_1223
 36799  # %bb.1227:                             #   in Loop: Header=BB4_1224 Depth=1
 36800  	movdqa	xmm3, xmm2                      # input <= 0 -> use the zero / nonzero candidate
 36801  	jmp	.LBB4_1223
 36802  .LBB4_1104:                             # compute one sign value from the float in xmm0, then store and return
 36803  	movmskps	ecx, xmm0                       # bit 0 = sign bit of lane 0
 36804  	and	ecx, 1
 36805  	neg	ecx                             # 0 or -1
 36806  	or	ecx, 1                          # +1 or -1
 36807  	xorps	xmm0, xmm0
 36808  	cvtsi2ss	xmm0, ecx
 36809  	cvttss2si	rcx, xmm0                       # float -> 64-bit integer
 36810  .LBB4_1105:
 36811  	mov	qword ptr [r8 + 8*rax], rcx     # store one 64-bit result at index rax
 36812  .LBB4_1655:                             # common exit reached from the blocks above and below
 36813  	lea	rsp, [rbp - 16]                 # point rsp at the two registers saved below rbp
 36814  	pop	rbx
 36815  	pop	r14
 36816  	pop	rbp
 36817  	ret
 36818  .LBB4_1228:                             # 32-bit input -> 32-bit (input != 0) flag, vector step
 36819  	xor	esi, esi                        # vector element index = 0
 36820  .LBB4_1229:
 36821  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 8-element vector step
 36822  	je	.LBB4_1231
 36823  # %bb.1230:
 36824  	movdqu	xmm0, xmmword ptr [rcx + 4*rsi]
 36825  	movdqu	xmm1, xmmword ptr [rcx + 4*rsi + 16]
 36826  	pxor	xmm2, xmm2
 36827  	pcmpeqd	xmm0, xmm2                      # lanes equal to zero
 36828  	movdqa	xmm3, xmmword ptr [rip + .LCPI4_8] # xmm3 = [1,1,1,1]
 36829  	pandn	xmm0, xmm3                      # (~mask) & 1 -> 1 where input != 0, else 0
 36830  	pcmpeqd	xmm1, xmm2
 36831  	pandn	xmm1, xmm3
 36832  	movdqu	xmmword ptr [r8 + 4*rsi], xmm0  # store 4 x 32-bit results
 36833  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm1 # store the next 4 x 32-bit results
 36834  .LBB4_1231:
 36835  	cmp	rdx, r10                        # rdx == r10 -> exit
 36836  	je	.LBB4_1655
 36837  	jmp	.LBB4_1232                      # target is outside this excerpt (presumably the scalar tail loop)
 36838  .LBB4_1236:                             # 8-bit signed input -> 32-bit sign value, vector step
 36839  	xor	esi, esi                        # vector element index = 0
 36840  .LBB4_1237:
 36841  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 8-element vector step
 36842  	je	.LBB4_1239
 36843  # %bb.1238:
 36844  	movd	xmm2, dword ptr [rcx + rsi]     # xmm2 = mem[0],zero,zero,zero
 36845  	movd	xmm3, dword ptr [rcx + rsi + 4] # xmm3 = mem[0],zero,zero,zero
 36846  	xorps	xmm4, xmm4
 36847  	movdqa	xmm0, xmm2
 36848  	pcmpgtb	xmm0, xmm4                      # mask: input > 0 (signed)
 36849  	pmovsxbd	xmm0, xmm0                      # widen the mask to 32-bit lanes
 36850  	movdqa	xmm1, xmm3
 36851  	pcmpgtb	xmm1, xmm4
 36852  	pmovsxbd	xmm1, xmm1
 36853  	pcmpeqb	xmm2, xmm4                      # lanes equal to zero
 36854  	pcmpeqd	xmm5, xmm5                      # xmm5 = all ones
 36855  	pxor	xmm2, xmm5                      # invert: -1 where input != 0, else 0
 36856  	pmovsxbd	xmm2, xmm2                      # sign-extend to 32-bit: -1 or 0
 36857  	pcmpeqb	xmm3, xmm4
 36858  	pxor	xmm3, xmm5
 36859  	pmovsxbd	xmm3, xmm3
 36860  	movaps	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 36861  	blendvps	xmm2, xmm4, xmm0                # +1 where input > 0, otherwise keep -1 / 0
 36862  	movdqa	xmm0, xmm1
 36863  	blendvps	xmm3, xmm4, xmm0
 36864  	movups	xmmword ptr [r8 + 4*rsi], xmm2  # store 4 x 32-bit results
 36865  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm3 # store the next 4 x 32-bit results
 36866  .LBB4_1239:
 36867  	cmp	rdx, r10                        # rdx == r10 -> exit
 36868  	je	.LBB4_1655
 36869  	jmp	.LBB4_1240                      # target is outside this excerpt (presumably the scalar tail loop)
 36870  .LBB4_1245:                             # 8-bit input -> 32-bit (input != 0) flag, vector step
 36871  	xor	esi, esi                        # vector element index = 0
 36872  .LBB4_1246:
 36873  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 8-element vector step
 36874  	je	.LBB4_1248
 36875  # %bb.1247:
 36876  	movd	xmm0, dword ptr [rcx + rsi]     # xmm0 = mem[0],zero,zero,zero
 36877  	movd	xmm1, dword ptr [rcx + rsi + 4] # xmm1 = mem[0],zero,zero,zero
 36878  	pxor	xmm2, xmm2
 36879  	pcmpeqb	xmm0, xmm2                      # lanes equal to zero
 36880  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones
 36881  	pxor	xmm0, xmm3                      # invert: lanes != 0
 36882  	pmovzxbd	xmm0, xmm0                      # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 36883  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 36884  	pand	xmm0, xmm4                      # mask -> 0 or 1
 36885  	pcmpeqb	xmm1, xmm2
 36886  	pxor	xmm1, xmm3
 36887  	pmovzxbd	xmm1, xmm1                      # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 36888  	pand	xmm1, xmm4
 36889  	movdqu	xmmword ptr [r8 + 4*rsi], xmm0  # store 4 x 32-bit results
 36890  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm1 # store the next 4 x 32-bit results
 36891  .LBB4_1248:
 36892  	cmp	rdx, r10                        # rdx == r10 -> exit
 36893  	je	.LBB4_1655
 36894  	jmp	.LBB4_1249                      # target is outside this excerpt (presumably the scalar tail loop)
 36895  .LBB4_1253:                             # 32-bit signed input -> 32-bit sign value, vector step
 36896  	xor	esi, esi                        # vector element index = 0
 36897  .LBB4_1254:
 36898  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 8-element vector step
 36899  	je	.LBB4_1256
 36900  # %bb.1255:
 36901  	movdqu	xmm1, xmmword ptr [rcx + 4*rsi]
 36902  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi + 16]
 36903  	pxor	xmm3, xmm3
 36904  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 36905  	movdqa	xmm0, xmm4
 36906  	pcmpgtd	xmm0, xmm1                      # mask: 1 > input, i.e. input <= 0 (signed)
 36907  	movdqa	xmm5, xmm1
 36908  	pcmpeqd	xmm5, xmm3                      # lanes equal to zero
 36909  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones
 36910  	pxor	xmm5, xmm1                      # invert: -1 where input != 0, else 0
 36911  	pcmpeqd	xmm3, xmm2
 36912  	pxor	xmm3, xmm1
 36913  	movdqa	xmm1, xmm4
 36914  	pcmpgtd	xmm1, xmm2
 36915  	movdqa	xmm2, xmm4                      # start from +1 in every lane
 36916  	blendvps	xmm2, xmm5, xmm0                # where input <= 0 take -1 / 0 instead
 36917  	movdqa	xmm0, xmm1
 36918  	blendvps	xmm4, xmm3, xmm0
 36919  	movups	xmmword ptr [r8 + 4*rsi], xmm2  # store 4 x 32-bit results
 36920  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm4 # store the next 4 x 32-bit results
 36921  .LBB4_1256:
 36922  	cmp	rdx, r11                        # rdx == r11 -> exit
 36923  	je	.LBB4_1655
 36924  	jmp	.LBB4_1257                      # target is outside this excerpt (presumably the scalar tail loop)
 36925  .LBB4_1262:                             # 64-bit float input -> 64-bit float sign value, vector step
 36926  	xor	esi, esi                        # vector element index = 0
 36927  .LBB4_1263:
 36928  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 4-element vector step
 36929  	je	.LBB4_1265
 36930  # %bb.1264:
 36931  	movupd	xmm0, xmmword ptr [rcx + 8*rsi]
 36932  	movupd	xmm1, xmmword ptr [rcx + 8*rsi + 16]
 36933  	xorpd	xmm2, xmm2
 36934  	movapd	xmm3, xmmword ptr [rip + .LCPI4_0] # xmm3 = [-0.0E+0,-0.0E+0]
 36935  	movapd	xmm4, xmm0
 36936  	andpd	xmm4, xmm3                      # keep only the sign bit
 36937  	movapd	xmm5, xmmword ptr [rip + .LCPI4_1] # xmm5 = [1.0E+0,1.0E+0]
 36938  	orpd	xmm4, xmm5                      # sign bit | 1.0 -> +1.0 or -1.0
 36939  	andpd	xmm3, xmm1
 36940  	orpd	xmm3, xmm5
 36941  	cmpneqpd	xmm0, xmm2                      # mask: input != 0.0
 36942  	andpd	xmm0, xmm4                      # zero the lanes whose input was 0.0
 36943  	cmpneqpd	xmm1, xmm2
 36944  	andpd	xmm1, xmm3
 36945  	movupd	xmmword ptr [r8 + 8*rsi], xmm0  # store 2 x 64-bit results
 36946  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm1 # store the next 2 x 64-bit results
 36947  .LBB4_1265:
 36948  	cmp	rdx, rax                        # rdx == rax -> exit
 36949  	je	.LBB4_1655
 36950  	jmp	.LBB4_1266                      # target is outside this excerpt (presumably the scalar tail loop)
 36951  .LBB4_1271:                             # 32-bit input -> 8-bit (input != 0) flag, vector step
 36952  	xor	esi, esi                        # vector element index = 0
 36953  .LBB4_1272:
 36954  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 8-element vector step
 36955  	je	.LBB4_1274
 36956  # %bb.1273:
 36957  	movdqu	xmm0, xmmword ptr [rcx + 4*rsi]
 36958  	movdqu	xmm1, xmmword ptr [rcx + 4*rsi + 16]
 36959  	pxor	xmm2, xmm2
 36960  	pcmpeqd	xmm0, xmm2                      # lanes equal to zero
 36961  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones
 36962  	pxor	xmm0, xmm3                      # invert: lanes != 0
 36963  	packssdw	xmm0, xmm0                      # narrow masks 32 -> 16 bit
 36964  	packsswb	xmm0, xmm0                      # narrow masks 16 -> 8 bit
 36965  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_12] # xmm4 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u>
 36966  	pand	xmm0, xmm4                      # mask -> 0 or 1
 36967  	pcmpeqd	xmm1, xmm2
 36968  	pxor	xmm1, xmm3
 36969  	packssdw	xmm1, xmm1
 36970  	packsswb	xmm1, xmm1
 36971  	pand	xmm1, xmm4
 36972  	movd	dword ptr [r8 + rsi], xmm0      # store 4 x 8-bit results
 36973  	movd	dword ptr [r8 + rsi + 4], xmm1  # store the next 4 x 8-bit results
 36974  .LBB4_1274:
 36975  	cmp	rdx, rax                        # rdx == rax -> exit
 36976  	je	.LBB4_1655
 36977  	jmp	.LBB4_1275                      # target is outside this excerpt (presumably the scalar tail loop)
 36978  .LBB4_1279:                             # 64-bit float input -> 8-bit integer sign value, vector step
 36979  	xor	esi, esi                        # vector element index = 0
 36980  .LBB4_1280:
 36981  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 4-element vector step
 36982  	je	.LBB4_1282
 36983  # %bb.1281:
 36984  	movupd	xmm3, xmmword ptr [rcx + 8*rsi]
 36985  	movupd	xmm4, xmmword ptr [rcx + 8*rsi + 16]
 36986  	xorpd	xmm2, xmm2
 36987  	movapd	xmm0, xmm3
 36988  	cmpeqpd	xmm0, xmm2                      # mask: input == 0.0 (first pair)
 36989  	packssdw	xmm0, xmm0
 36990  	packssdw	xmm0, xmm0
 36991  	packsswb	xmm0, xmm0                      # mask narrowed to 8-bit lanes
 36992  	movapd	xmm1, xmm4
 36993  	cmpeqpd	xmm1, xmm2                      # mask: input == 0.0 (second pair)
 36994  	packssdw	xmm1, xmm1
 36995  	packssdw	xmm1, xmm1
 36996  	packsswb	xmm1, xmm1
 36997  	movapd	xmm5, xmmword ptr [rip + .LCPI4_0] # xmm5 = [-0.0E+0,-0.0E+0]
 36998  	andpd	xmm3, xmm5                      # keep only the sign bit
 36999  	movapd	xmm6, xmmword ptr [rip + .LCPI4_1] # xmm6 = [1.0E+0,1.0E+0]
 37000  	orpd	xmm3, xmm6                      # sign bit | 1.0 -> +1.0 or -1.0
 37001  	andpd	xmm4, xmm5
 37002  	orpd	xmm4, xmm6
 37003  	cvttpd2dq	xmm3, xmm3                      # to 32-bit integers
 37004  	movdqa	xmm5, xmmword ptr [rip + .LCPI4_7] # xmm5 = <0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 37005  	pshufb	xmm3, xmm5                      # gather bytes 0 and 4 (low byte of each 32-bit lane)
 37006  	cvttpd2dq	xmm4, xmm4
 37007  	pshufb	xmm4, xmm5
 37008  	pblendvb	xmm3, xmm2, xmm0                # zero where the input was 0.0 (mask is implicit xmm0)
 37009  	movdqa	xmm0, xmm1
 37010  	pblendvb	xmm4, xmm2, xmm0
 37011  	pextrw	word ptr [r8 + rsi], xmm3, 0    # store 2 x 8-bit results
 37012  	pextrw	word ptr [r8 + rsi + 2], xmm4, 0 # store the next 2 x 8-bit results
 37013  .LBB4_1282:
 37014  	cmp	rdx, rax                        # rdx == rax -> exit
 37015  	je	.LBB4_1655
 37016  	jmp	.LBB4_1283                      # target is outside this excerpt (presumably the scalar tail loop)
 37017  .LBB4_1288:                             # 8-bit signed input -> 8-bit sign value, vector step
 37018  	xor	eax, eax                        # vector element index = 0
 37019  .LBB4_1289:
 37020  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 32-element vector step
 37021  	je	.LBB4_1291
 37022  # %bb.1290:
 37023  	movdqu	xmm1, xmmword ptr [rcx + rax]
 37024  	movdqu	xmm2, xmmword ptr [rcx + rax + 16]
 37025  	pxor	xmm3, xmm3
 37026  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_22] # xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 37027  	movdqa	xmm0, xmm4
 37028  	pcmpgtb	xmm0, xmm1                      # mask: 1 > input, i.e. input <= 0 (signed)
 37029  	movdqa	xmm5, xmm1
 37030  	pcmpeqb	xmm5, xmm3                      # lanes equal to zero
 37031  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones
 37032  	pxor	xmm5, xmm1                      # invert: -1 where input != 0, else 0
 37033  	pcmpeqb	xmm3, xmm2
 37034  	pxor	xmm3, xmm1
 37035  	movdqa	xmm1, xmm4
 37036  	pcmpgtb	xmm1, xmm2
 37037  	movdqa	xmm2, xmm4                      # start from +1 in every lane
 37038  	pblendvb	xmm2, xmm5, xmm0                # where input <= 0 take -1 / 0 instead
 37039  	movdqa	xmm0, xmm1
 37040  	pblendvb	xmm4, xmm3, xmm0
 37041  	movdqu	xmmword ptr [r8 + rax], xmm2    # store 16 x 8-bit results
 37042  	movdqu	xmmword ptr [r8 + rax + 16], xmm4 # store the next 16 x 8-bit results
 37043  .LBB4_1291:
 37044  	cmp	rsi, r10                        # rsi == r10 -> exit
 37045  	je	.LBB4_1655
 37046  	jmp	.LBB4_1292                      # target is outside this excerpt (presumably the scalar tail loop)
 37047  .LBB4_1297:                             # 64-bit input -> 8-bit (input != 0) flag, vector step
 37048  	xor	esi, esi                        # vector element index = 0
 37049  .LBB4_1298:
 37050  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 4-element vector step
 37051  	je	.LBB4_1300
 37052  # %bb.1299:
 37053  	movdqu	xmm0, xmmword ptr [rcx + 8*rsi]
 37054  	movdqu	xmm1, xmmword ptr [rcx + 8*rsi + 16]
 37055  	pxor	xmm2, xmm2
 37056  	pcmpeqq	xmm0, xmm2                      # lanes equal to zero
 37057  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones
 37058  	pxor	xmm0, xmm3                      # invert: lanes != 0
 37059  	packssdw	xmm0, xmm0
 37060  	packssdw	xmm0, xmm0
 37061  	packsswb	xmm0, xmm0                      # mask narrowed to 8-bit lanes
 37062  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_18] # xmm4 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 37063  	pand	xmm0, xmm4                      # mask -> 0 or 1
 37064  	pcmpeqq	xmm1, xmm2
 37065  	pxor	xmm1, xmm3
 37066  	packssdw	xmm1, xmm1
 37067  	packssdw	xmm1, xmm1
 37068  	packsswb	xmm1, xmm1
 37069  	pextrw	word ptr [r8 + rsi], xmm0, 0    # store 2 x 8-bit results
 37070  	pand	xmm1, xmm4
 37071  	pextrw	word ptr [r8 + rsi + 2], xmm1, 0 # store the next 2 x 8-bit results
 37072  .LBB4_1300:
 37073  	cmp	rdx, rax                        # rdx == rax -> exit
 37074  	je	.LBB4_1655
 37075  	jmp	.LBB4_1301                      # target is outside this excerpt (presumably the scalar tail loop)
 37076  .LBB4_1305:                             # 16-bit input -> 8-bit (input != 0) flag, vector step
 37077  	xor	esi, esi                        # vector element index = 0
 37078  .LBB4_1306:
 37079  	test	r9b, 1                          # bit 0 of r9 clear -> skip the 16-element vector step
 37080  	je	.LBB4_1308
 37081  # %bb.1307:
 37082  	movdqu	xmm0, xmmword ptr [rcx + 2*rsi]
 37083  	movdqu	xmm1, xmmword ptr [rcx + 2*rsi + 16]
 37084  	pxor	xmm2, xmm2
 37085  	pcmpeqw	xmm0, xmm2                      # lanes equal to zero
 37086  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones
 37087  	pxor	xmm0, xmm3                      # invert: lanes != 0
 37088  	packsswb	xmm0, xmm0                      # narrow masks 16 -> 8 bit
 37089  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_21] # xmm4 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u>
 37090  	pand	xmm0, xmm4                      # mask -> 0 or 1
 37091  	pcmpeqw	xmm1, xmm2
 37092  	pxor	xmm1, xmm3
 37093  	packsswb	xmm1, xmm1
 37094  	pand	xmm1, xmm4
 37095  	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
 37096  	movdqu	xmmword ptr [r8 + rsi], xmm0    # store 16 x 8-bit results
 37097  .LBB4_1308:
 37098  	cmp	rdx, rax                        # rdx == rax -> exit
 37099  	je	.LBB4_1655
 37100  	jmp	.LBB4_1309                      # target is outside this excerpt (presumably the scalar tail loop)
 37101  .LBB4_1313:                             # entry that starts the element index (rax) at 0
 37102  	xor	eax, eax
 37103  .LBB4_1314:                             # optional vector step: 16 x int16 -> 16 bytes: 1 if x > 0, 0 if x == 0, else 0xFF (-1)
 37104  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37105  	je	.LBB4_1316
 37106  # %bb.1315:
 37107  	movdqu	xmm2, xmmword ptr [rcx + 2*rax]
 37108  	movdqu	xmm3, xmmword ptr [rcx + 2*rax + 16]
 37109  	pxor	xmm4, xmm4                      # xmm4 = 0
 37110  	movdqa	xmm0, xmm2
 37111  	pcmpgtw	xmm0, xmm4                      # signed x > 0 mask, first 8 inputs
 37112  	packsswb	xmm0, xmm0                      # narrow mask to bytes
 37113  	movdqa	xmm1, xmm3
 37114  	pcmpgtw	xmm1, xmm4                      # signed x > 0 mask, second 8 inputs
 37115  	packsswb	xmm1, xmm1
 37116  	pcmpeqw	xmm2, xmm4                      # x == 0 mask
 37117  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones
 37118  	pxor	xmm2, xmm5                      # invert: 0xFFFF where x != 0, 0 where x == 0
 37119  	packsswb	xmm2, xmm2                      # bytes: 0xFF (-1) or 0
 37120  	pcmpeqw	xmm3, xmm4
 37121  	pxor	xmm3, xmm5
 37122  	packsswb	xmm3, xmm3
 37123  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_21] # xmm4 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u>
 37124  	pblendvb	xmm2, xmm4, xmm0                # implicit mask xmm0: take 1 where x > 0, else keep -1/0
 37125  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 37126  	pblendvb	xmm3, xmm4, xmm0
 37127  	punpcklqdq	xmm2, xmm3              # xmm2 = xmm2[0],xmm3[0]
 37128  	movdqu	xmmword ptr [r8 + rax], xmm2    # store all 16 result bytes
 37129  .LBB4_1316:                             # rejoin point after the optional step
 37130  	cmp	rsi, r10                        # both registers are set outside this view
 37131  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37132  	jmp	.LBB4_1317                      # otherwise continue at .LBB4_1317 (outside this view)
 37133  .LBB4_1322:                             # entry that starts the element index (rax) at 0
 37134  	xor	eax, eax
 37135  .LBB4_1323:                             # optional vector step: 4 x int64 -> 4 bytes: 1 if x > 0, 0 if x == 0, else 0xFF (-1)
 37136  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37137  	je	.LBB4_1325
 37138  # %bb.1324:
 37139  	movdqu	xmm2, xmmword ptr [rcx + 8*rax]
 37140  	movdqu	xmm3, xmmword ptr [rcx + 8*rax + 16]
 37141  	pxor	xmm4, xmm4                      # xmm4 = 0
 37142  	movdqa	xmm0, xmm2
 37143  	pcmpgtq	xmm0, xmm4                      # signed x > 0 mask, first pair
 37144  	packssdw	xmm0, xmm0                      # three saturating packs narrow each 64-bit lane mask to one byte
 37145  	packssdw	xmm0, xmm0
 37146  	packsswb	xmm0, xmm0
 37147  	movdqa	xmm1, xmm3
 37148  	pcmpgtq	xmm1, xmm4                      # signed x > 0 mask, second pair
 37149  	packssdw	xmm1, xmm1
 37150  	packssdw	xmm1, xmm1
 37151  	packsswb	xmm1, xmm1
 37152  	pcmpeqq	xmm2, xmm4                      # x == 0 mask
 37153  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones
 37154  	pxor	xmm2, xmm5                      # invert: all-ones where x != 0
 37155  	packssdw	xmm2, xmm2                      # narrow to bytes: 0xFF (-1) or 0
 37156  	packssdw	xmm2, xmm2
 37157  	packsswb	xmm2, xmm2
 37158  	pcmpeqq	xmm3, xmm4
 37159  	pxor	xmm3, xmm5
 37160  	packssdw	xmm3, xmm3
 37161  	packssdw	xmm3, xmm3
 37162  	packsswb	xmm3, xmm3
 37163  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_18] # xmm4 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 37164  	pblendvb	xmm2, xmm4, xmm0                # implicit mask xmm0: take 1 where x > 0, else keep -1/0
 37165  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 37166  	pblendvb	xmm3, xmm4, xmm0
 37167  	pextrw	word ptr [r8 + rax], xmm2, 0    # store result bytes 0-1
 37168  	pextrw	word ptr [r8 + rax + 2], xmm3, 0 # store result bytes 2-3
 37169  .LBB4_1325:                             # rejoin point after the optional step
 37170  	cmp	rsi, r10                        # both registers are set outside this view
 37171  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37172  	jmp	.LBB4_1326                      # otherwise continue at .LBB4_1326 (outside this view)
 37173  .LBB4_1331:                             # entry that starts the element index (rsi) at 0
 37174  	xor	esi, esi
 37175  .LBB4_1332:                             # optional vector step: 8 x float32 -> 8 bytes: 0 if x == 0.0, else 1 (sign bit clear) or 0xFF (sign bit set)
 37176  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37177  	je	.LBB4_1334
 37178  # %bb.1333:
 37179  	movups	xmm0, xmmword ptr [rcx + 4*rsi]
 37180  	movups	xmm1, xmmword ptr [rcx + 4*rsi + 16]
 37181  	xorps	xmm4, xmm4                      # xmm4 = 0.0
 37182  	movaps	xmm2, xmm0
 37183  	cmpeqps	xmm2, xmm4                      # x == 0.0 mask (first 4 inputs)
 37184  	packssdw	xmm2, xmm2                      # narrow 32-bit lane masks to bytes
 37185  	packsswb	xmm2, xmm2
 37186  	movaps	xmm3, xmm1
 37187  	cmpeqps	xmm3, xmm4                      # x == 0.0 mask (second 4 inputs)
 37188  	packssdw	xmm3, xmm3
 37189  	packsswb	xmm3, xmm3
 37190  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones (-1 in every lane)
 37191  	pcmpgtd	xmm0, xmm5                      # raw bits > -1 as signed int32, i.e. sign bit clear
 37192  	packssdw	xmm0, xmm0
 37193  	packsswb	xmm0, xmm0
 37194  	pcmpgtd	xmm1, xmm5
 37195  	packssdw	xmm1, xmm1
 37196  	packsswb	xmm1, xmm1
 37197  	movdqa	xmm6, xmmword ptr [rip + .LCPI4_12] # xmm6 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u>
 37198  	pcmpeqd	xmm7, xmm7                      # xmm7 = all-ones: default result byte 0xFF
 37199  	pblendvb	xmm7, xmm6, xmm0                # implicit mask xmm0: 1 where sign bit clear, else keep 0xFF
 37200  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 37201  	pblendvb	xmm5, xmm6, xmm0                # xmm5 (still all-ones) plays the same role for the second half
 37202  	movdqa	xmm0, xmm2
 37203  	pblendvb	xmm7, xmm4, xmm0                # force 0 where x == 0.0
 37204  	movdqa	xmm0, xmm3
 37205  	pblendvb	xmm5, xmm4, xmm0
 37206  	movd	dword ptr [r8 + rsi], xmm7      # store result bytes 0-3
 37207  	movd	dword ptr [r8 + rsi + 4], xmm5  # store result bytes 4-7
 37208  .LBB4_1334:                             # rejoin point after the optional step
 37209  	cmp	rdx, r10                        # both registers are set outside this view
 37210  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37211  	jmp	.LBB4_1335                      # otherwise continue at .LBB4_1335 (outside this view)
 37212  .LBB4_1340:                             # entry that starts the element index (rsi) at 0
 37213  	xor	esi, esi
 37214  .LBB4_1341:                             # optional vector step: 32 input bytes -> 32 bytes, each (x != 0) ? 1 : 0
 37215  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37216  	je	.LBB4_1343
 37217  # %bb.1342:
 37218  	movdqu	xmm0, xmmword ptr [rcx + rsi]
 37219  	movdqu	xmm1, xmmword ptr [rcx + rsi + 16]
 37220  	pxor	xmm2, xmm2                      # xmm2 = 0
 37221  	pcmpeqb	xmm0, xmm2                      # 0xFF where x == 0
 37222  	movdqa	xmm3, xmmword ptr [rip + .LCPI4_22] # xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 37223  	pandn	xmm0, xmm3                      # (~mask) & 1 -> 1 where x != 0, else 0
 37224  	pcmpeqb	xmm1, xmm2
 37225  	pandn	xmm1, xmm3
 37226  	movdqu	xmmword ptr [r8 + rsi], xmm0
 37227  	movdqu	xmmword ptr [r8 + rsi + 16], xmm1
 37228  .LBB4_1343:                             # rejoin point after the optional step
 37229  	cmp	rdx, rax                        # both registers are set outside this view
 37230  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37231  	jmp	.LBB4_1344                      # otherwise continue at .LBB4_1344 (outside this view)
 37232  .LBB4_1348:                             # entry that starts the element index (rax) at 0
 37233  	xor	eax, eax
 37234  .LBB4_1349:                             # optional vector step: 8 x int32 -> 8 bytes: 1 if x > 0, 0 if x == 0, else 0xFF (-1)
 37235  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37236  	je	.LBB4_1351
 37237  # %bb.1350:
 37238  	movdqu	xmm2, xmmword ptr [rcx + 4*rax]
 37239  	movdqu	xmm3, xmmword ptr [rcx + 4*rax + 16]
 37240  	pxor	xmm4, xmm4                      # xmm4 = 0
 37241  	movdqa	xmm0, xmm2
 37242  	pcmpgtd	xmm0, xmm4                      # signed x > 0 mask, first 4 inputs
 37243  	packssdw	xmm0, xmm0                      # narrow 32-bit lane masks to bytes
 37244  	packsswb	xmm0, xmm0
 37245  	movdqa	xmm1, xmm3
 37246  	pcmpgtd	xmm1, xmm4                      # signed x > 0 mask, second 4 inputs
 37247  	packssdw	xmm1, xmm1
 37248  	packsswb	xmm1, xmm1
 37249  	pcmpeqd	xmm2, xmm4                      # x == 0 mask
 37250  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones
 37251  	pxor	xmm2, xmm5                      # invert: all-ones where x != 0
 37252  	packssdw	xmm2, xmm2                      # bytes: 0xFF (-1) or 0
 37253  	packsswb	xmm2, xmm2
 37254  	pcmpeqd	xmm3, xmm4
 37255  	pxor	xmm3, xmm5
 37256  	packssdw	xmm3, xmm3
 37257  	packsswb	xmm3, xmm3
 37258  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_12] # xmm4 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u>
 37259  	pblendvb	xmm2, xmm4, xmm0                # implicit mask xmm0: take 1 where x > 0, else keep -1/0
 37260  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 37261  	pblendvb	xmm3, xmm4, xmm0
 37262  	movd	dword ptr [r8 + rax], xmm2      # store result bytes 0-3
 37263  	movd	dword ptr [r8 + rax + 4], xmm3  # store result bytes 4-7
 37264  .LBB4_1351:                             # rejoin point after the optional step
 37265  	cmp	rsi, r10                        # both registers are set outside this view
 37266  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37267  	jmp	.LBB4_1352                      # otherwise continue at .LBB4_1352 (outside this view)
 37268  .LBB4_1357:                             # entry that starts the element index (rsi) at 0
 37269  	xor	esi, esi
 37270  .LBB4_1358:                             # optional vector step: 4 x int8 -> 4 x 64-bit: 1 if x > 0, 0 if x == 0, else -1
 37271  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37272  	je	.LBB4_1360
 37273  # %bb.1359:
 37274  	movzx	eax, word ptr [rcx + rsi]       # load input bytes 0-1 (eax is scratch here)
 37275  	movd	xmm2, eax
 37276  	movzx	eax, word ptr [rcx + rsi + 2]   # load input bytes 2-3
 37277  	movd	xmm3, eax
 37278  	xorpd	xmm4, xmm4                      # xmm4 = 0
 37279  	movdqa	xmm0, xmm2
 37280  	pcmpgtb	xmm0, xmm4                      # signed x > 0 byte mask
 37281  	pmovsxbq	xmm0, xmm0                      # widen the two byte masks to 64-bit lanes
 37282  	movdqa	xmm1, xmm3
 37283  	pcmpgtb	xmm1, xmm4
 37284  	pmovsxbq	xmm1, xmm1
 37285  	pcmpeqb	xmm2, xmm4                      # x == 0 byte mask
 37286  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones
 37287  	pxor	xmm2, xmm5                      # invert: 0xFF where x != 0
 37288  	pmovsxbq	xmm2, xmm2                      # sign-extend: 64-bit -1 where x != 0, 0 where x == 0
 37289  	pcmpeqb	xmm3, xmm4
 37290  	pxor	xmm3, xmm5
 37291  	pmovsxbq	xmm3, xmm3
 37292  	movapd	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 37293  	blendvpd	xmm2, xmm4, xmm0                # implicit mask xmm0: take 1 where x > 0, else keep -1/0
 37294  	movdqa	xmm0, xmm1                      # blendvpd reads its mask from xmm0
 37295  	blendvpd	xmm3, xmm4, xmm0
 37296  	movupd	xmmword ptr [r8 + 8*rsi], xmm2
 37297  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm3
 37298  .LBB4_1360:                             # rejoin point after the optional step
 37299  	cmp	rdx, r10                        # both registers are set outside this view
 37300  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37301  	jmp	.LBB4_1361                      # otherwise continue at .LBB4_1361 (outside this view)
 37302  .LBB4_1366:                             # entry that starts the element index (rsi) at 0
 37303  	xor	esi, esi
 37304  .LBB4_1367:                             # optional vector step: 4 x 64-bit inputs -> 4 x 64-bit, each (x != 0) ? 1 : 0
 37305  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37306  	je	.LBB4_1369
 37307  # %bb.1368:
 37308  	movdqu	xmm0, xmmword ptr [rcx + 8*rsi]
 37309  	movdqu	xmm1, xmmword ptr [rcx + 8*rsi + 16]
 37310  	pxor	xmm2, xmm2                      # xmm2 = 0
 37311  	pcmpeqq	xmm0, xmm2                      # all-ones in lanes where x == 0
 37312  	movdqa	xmm3, xmmword ptr [rip + .LCPI4_15] # xmm3 = [1,1]
 37313  	pandn	xmm0, xmm3                      # (~mask) & 1 -> 1 where x != 0, else 0
 37314  	pcmpeqq	xmm1, xmm2
 37315  	pandn	xmm1, xmm3
 37316  	movdqu	xmmword ptr [r8 + 8*rsi], xmm0
 37317  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm1
 37318  .LBB4_1369:                             # rejoin point after the optional step
 37319  	cmp	rdx, r10                        # both registers are set outside this view
 37320  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37321  	jmp	.LBB4_1370                      # otherwise continue at .LBB4_1370 (outside this view)
 37322  .LBB4_1374:                             # entry that starts the element index (rsi) at 0
 37323  	xor	esi, esi
 37324  .LBB4_1375:                             # optional vector step: 4 x int64 -> 4 x int64: 1 if x > 0, 0 if x == 0, else -1
 37325  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37326  	je	.LBB4_1377
 37327  # %bb.1376:
 37328  	movdqu	xmm1, xmmword ptr [rcx + 8*rsi]
 37329  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi + 16]
 37330  	pxor	xmm3, xmm3                      # xmm3 = 0
 37331  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 37332  	movdqa	xmm0, xmm4
 37333  	pcmpgtq	xmm0, xmm1                      # mask where 1 > x, i.e. x <= 0 (signed)
 37334  	movdqa	xmm5, xmm1
 37335  	pcmpeqq	xmm5, xmm3                      # x == 0 mask
 37336  	pcmpeqd	xmm1, xmm1                      # xmm1 = all-ones (input copy no longer needed)
 37337  	pxor	xmm5, xmm1                      # invert: -1 where x != 0, 0 where x == 0
 37338  	pcmpeqq	xmm3, xmm2                      # same for the second pair
 37339  	pxor	xmm3, xmm1
 37340  	movdqa	xmm1, xmm4
 37341  	pcmpgtq	xmm1, xmm2                      # x <= 0 mask for the second pair
 37342  	movdqa	xmm2, xmm4                      # start from 1 in every lane
 37343  	blendvpd	xmm2, xmm5, xmm0                # implicit mask xmm0: where x <= 0 take -1/0, else keep 1
 37344  	movdqa	xmm0, xmm1                      # blendvpd reads its mask from xmm0
 37345  	blendvpd	xmm4, xmm3, xmm0
 37346  	movupd	xmmword ptr [r8 + 8*rsi], xmm2
 37347  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm4
 37348  .LBB4_1377:                             # rejoin point after the optional step
 37349  	cmp	rdx, r11                        # both registers are set outside this view
 37350  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37351  	jmp	.LBB4_1378                      # otherwise continue at .LBB4_1378 (outside this view)
 37352  .LBB4_1383:                             # entry that starts the element index (rsi) at 0
 37353  	xor	esi, esi
 37354  .LBB4_1384:                             # optional vector step: 4 input bytes -> 4 x 64-bit, each (x != 0) ? 1 : 0
 37355  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37356  	je	.LBB4_1386
 37357  # %bb.1385:
 37358  	movzx	eax, word ptr [rcx + rsi]       # load input bytes 0-1 (eax is scratch here)
 37359  	movd	xmm0, eax
 37360  	movzx	eax, word ptr [rcx + rsi + 2]   # load input bytes 2-3
 37361  	movd	xmm1, eax
 37362  	pxor	xmm2, xmm2                      # xmm2 = 0
 37363  	pcmpeqb	xmm0, xmm2                      # 0xFF where x == 0
 37364  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones
 37365  	pxor	xmm0, xmm3                      # invert: 0xFF where x != 0
 37366  	pmovzxbq	xmm0, xmm0                      # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 37367  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 37368  	pand	xmm0, xmm4                      # 0xFF/0 -> 1/0 per 64-bit lane
 37369  	pcmpeqb	xmm1, xmm2
 37370  	pxor	xmm1, xmm3
 37371  	pmovzxbq	xmm1, xmm1                      # xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 37372  	pand	xmm1, xmm4
 37373  	movdqu	xmmword ptr [r8 + 8*rsi], xmm0
 37374  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm1
 37375  .LBB4_1386:                             # rejoin point after the optional step
 37376  	cmp	rdx, r10                        # both registers are set outside this view
 37377  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37378  	jmp	.LBB4_1387                      # otherwise continue at .LBB4_1387 (outside this view)
 37379  .LBB4_1391:                             # entry that starts the element index (rsi) at 0
 37380  	xor	esi, esi
 37381  .LBB4_1392:                             # optional vector step: 16 x int8 -> 16 x 16-bit: 1 if x > 0, 0 if x == 0, else -1
 37382  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37383  	je	.LBB4_1394
 37384  # %bb.1393:
 37385  	movq	xmm2, qword ptr [rcx + rsi]     # xmm2 = mem[0],zero
 37386  	movq	xmm3, qword ptr [rcx + rsi + 8] # xmm3 = mem[0],zero
 37387  	pxor	xmm4, xmm4                      # xmm4 = 0
 37388  	movdqa	xmm0, xmm2
 37389  	pcmpgtb	xmm0, xmm4                      # signed x > 0 byte mask
 37390  	pmovsxbw	xmm0, xmm0                      # widen mask to 16-bit lanes
 37391  	movdqa	xmm1, xmm3
 37392  	pcmpgtb	xmm1, xmm4
 37393  	pmovsxbw	xmm1, xmm1
 37394  	pcmpeqb	xmm2, xmm4                      # x == 0 byte mask
 37395  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones
 37396  	pxor	xmm2, xmm5                      # invert: 0xFF where x != 0
 37397  	pmovsxbw	xmm2, xmm2                      # sign-extend: 16-bit -1 where x != 0, 0 where x == 0
 37398  	pcmpeqb	xmm3, xmm4
 37399  	pxor	xmm3, xmm5
 37400  	pmovsxbw	xmm3, xmm3
 37401  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1]
 37402  	pblendvb	xmm2, xmm4, xmm0                # implicit mask xmm0: take 1 where x > 0, else keep -1/0
 37403  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 37404  	pblendvb	xmm3, xmm4, xmm0
 37405  	movdqu	xmmword ptr [r8 + 2*rsi], xmm2
 37406  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm3
 37407  .LBB4_1394:                             # rejoin point after the optional step
 37408  	cmp	rdx, r10                        # both registers are set outside this view
 37409  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37410  	jmp	.LBB4_1395                      # otherwise continue at .LBB4_1395 (outside this view)
 37411  .LBB4_1400:                             # entry that starts the element index (rsi) at 0
 37412  	xor	esi, esi
 37413  .LBB4_1401:                             # optional vector step: 16 x int8 -> 16 x 16-bit: 1 if x > 0, 0 if x == 0, else -1
 37414  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37415  	je	.LBB4_1403
 37416  # %bb.1402:
 37417  	movq	xmm2, qword ptr [rcx + rsi]     # xmm2 = mem[0],zero
 37418  	movq	xmm3, qword ptr [rcx + rsi + 8] # xmm3 = mem[0],zero
 37419  	pxor	xmm4, xmm4                      # xmm4 = 0
 37420  	movdqa	xmm0, xmm2
 37421  	pcmpgtb	xmm0, xmm4                      # signed x > 0 byte mask
 37422  	pmovsxbw	xmm0, xmm0                      # widen mask to 16-bit lanes
 37423  	movdqa	xmm1, xmm3
 37424  	pcmpgtb	xmm1, xmm4
 37425  	pmovsxbw	xmm1, xmm1
 37426  	pcmpeqb	xmm2, xmm4                      # x == 0 byte mask
 37427  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones
 37428  	pxor	xmm2, xmm5                      # invert: 0xFF where x != 0
 37429  	pmovsxbw	xmm2, xmm2                      # sign-extend: 16-bit -1 where x != 0, 0 where x == 0
 37430  	pcmpeqb	xmm3, xmm4
 37431  	pxor	xmm3, xmm5
 37432  	pmovsxbw	xmm3, xmm3
 37433  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1]
 37434  	pblendvb	xmm2, xmm4, xmm0                # implicit mask xmm0: take 1 where x > 0, else keep -1/0
 37435  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 37436  	pblendvb	xmm3, xmm4, xmm0
 37437  	movdqu	xmmword ptr [r8 + 2*rsi], xmm2
 37438  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm3
 37439  .LBB4_1403:                             # rejoin point after the optional step
 37440  	cmp	rdx, r10                        # both registers are set outside this view
 37441  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37442  	jmp	.LBB4_1404                      # otherwise continue at .LBB4_1404 (outside this view)
 37443  .LBB4_1409:                             # entry that starts the element index (rsi) at 0
 37444  	xor	esi, esi
 37445  .LBB4_1410:                             # optional vector step: 16 x 16-bit inputs -> 16 x 16-bit, each (x != 0) ? 1 : 0
 37446  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37447  	je	.LBB4_1412
 37448  # %bb.1411:
 37449  	movdqu	xmm0, xmmword ptr [rcx + 2*rsi]
 37450  	movdqu	xmm1, xmmword ptr [rcx + 2*rsi + 16]
 37451  	pxor	xmm2, xmm2                      # xmm2 = 0
 37452  	pcmpeqw	xmm0, xmm2                      # all-ones in lanes where x == 0
 37453  	movdqa	xmm3, xmmword ptr [rip + .LCPI4_20] # xmm3 = [1,1,1,1,1,1,1,1]
 37454  	pandn	xmm0, xmm3                      # (~mask) & 1 -> 1 where x != 0, else 0
 37455  	pcmpeqw	xmm1, xmm2
 37456  	pandn	xmm1, xmm3
 37457  	movdqu	xmmword ptr [r8 + 2*rsi], xmm0
 37458  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm1
 37459  .LBB4_1412:                             # rejoin point after the optional step
 37460  	cmp	rdx, r10                        # both registers are set outside this view
 37461  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37462  	jmp	.LBB4_1413                      # otherwise continue at .LBB4_1413 (outside this view)
 37463  .LBB4_1417:                             # entry that starts the element index (rsi) at 0
 37464  	xor	esi, esi
 37465  .LBB4_1418:                             # optional vector step: 16 x 16-bit inputs -> 16 x 16-bit, each (x != 0) ? 1 : 0
 37466  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37467  	je	.LBB4_1420
 37468  # %bb.1419:
 37469  	movdqu	xmm0, xmmword ptr [rcx + 2*rsi]
 37470  	movdqu	xmm1, xmmword ptr [rcx + 2*rsi + 16]
 37471  	pxor	xmm2, xmm2                      # xmm2 = 0
 37472  	pcmpeqw	xmm0, xmm2                      # all-ones in lanes where x == 0
 37473  	movdqa	xmm3, xmmword ptr [rip + .LCPI4_20] # xmm3 = [1,1,1,1,1,1,1,1]
 37474  	pandn	xmm0, xmm3                      # (~mask) & 1 -> 1 where x != 0, else 0
 37475  	pcmpeqw	xmm1, xmm2
 37476  	pandn	xmm1, xmm3
 37477  	movdqu	xmmword ptr [r8 + 2*rsi], xmm0
 37478  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm1
 37479  .LBB4_1420:                             # rejoin point after the optional step
 37480  	cmp	rdx, r10                        # both registers are set outside this view
 37481  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37482  	jmp	.LBB4_1421                      # otherwise continue at .LBB4_1421 (outside this view)
 37483  .LBB4_1425:                             # entry that starts the element index (rsi) at 0
 37484  	xor	esi, esi
 37485  .LBB4_1426:                             # optional vector step: 16 x int16 -> 16 x int16: 1 if x > 0, 0 if x == 0, else -1
 37486  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37487  	je	.LBB4_1428
 37488  # %bb.1427:
 37489  	movdqu	xmm1, xmmword ptr [rcx + 2*rsi]
 37490  	movdqu	xmm2, xmmword ptr [rcx + 2*rsi + 16]
 37491  	pxor	xmm3, xmm3                      # xmm3 = 0
 37492  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1]
 37493  	movdqa	xmm0, xmm4
 37494  	pcmpgtw	xmm0, xmm1                      # mask where 1 > x, i.e. x <= 0 (signed)
 37495  	movdqa	xmm5, xmm1
 37496  	pcmpeqw	xmm5, xmm3                      # x == 0 mask
 37497  	pcmpeqd	xmm1, xmm1                      # xmm1 = all-ones (input copy no longer needed)
 37498  	pxor	xmm5, xmm1                      # invert: -1 where x != 0, 0 where x == 0
 37499  	pcmpeqw	xmm3, xmm2                      # same for the second 8 inputs
 37500  	pxor	xmm3, xmm1
 37501  	movdqa	xmm1, xmm4
 37502  	pcmpgtw	xmm1, xmm2                      # x <= 0 mask for the second 8 inputs
 37503  	movdqa	xmm2, xmm4                      # start from 1 in every lane
 37504  	pblendvb	xmm2, xmm5, xmm0                # implicit mask xmm0: where x <= 0 take -1/0, else keep 1
 37505  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 37506  	pblendvb	xmm4, xmm3, xmm0
 37507  	movdqu	xmmword ptr [r8 + 2*rsi], xmm2
 37508  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm4
 37509  .LBB4_1428:                             # rejoin point after the optional step
 37510  	cmp	rdx, r11                        # both registers are set outside this view
 37511  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37512  	jmp	.LBB4_1429                      # otherwise continue at .LBB4_1429 (outside this view)
 37513  .LBB4_1434:                             # entry that starts the element index (rsi) at 0
 37514  	xor	esi, esi
 37515  .LBB4_1435:                             # optional vector step: 16 x int16 -> 16 x int16: 1 if x > 0, 0 if x == 0, else -1
 37516  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37517  	je	.LBB4_1437
 37518  # %bb.1436:
 37519  	movdqu	xmm1, xmmword ptr [rcx + 2*rsi]
 37520  	movdqu	xmm2, xmmword ptr [rcx + 2*rsi + 16]
 37521  	pxor	xmm3, xmm3                      # xmm3 = 0
 37522  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1]
 37523  	movdqa	xmm0, xmm4
 37524  	pcmpgtw	xmm0, xmm1                      # mask where 1 > x, i.e. x <= 0 (signed)
 37525  	movdqa	xmm5, xmm1
 37526  	pcmpeqw	xmm5, xmm3                      # x == 0 mask
 37527  	pcmpeqd	xmm1, xmm1                      # xmm1 = all-ones (input copy no longer needed)
 37528  	pxor	xmm5, xmm1                      # invert: -1 where x != 0, 0 where x == 0
 37529  	pcmpeqw	xmm3, xmm2                      # same for the second 8 inputs
 37530  	pxor	xmm3, xmm1
 37531  	movdqa	xmm1, xmm4
 37532  	pcmpgtw	xmm1, xmm2                      # x <= 0 mask for the second 8 inputs
 37533  	movdqa	xmm2, xmm4                      # start from 1 in every lane
 37534  	pblendvb	xmm2, xmm5, xmm0                # implicit mask xmm0: where x <= 0 take -1/0, else keep 1
 37535  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 37536  	pblendvb	xmm4, xmm3, xmm0
 37537  	movdqu	xmmword ptr [r8 + 2*rsi], xmm2
 37538  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm4
 37539  .LBB4_1437:                             # rejoin point after the optional step
 37540  	cmp	rdx, r11                        # both registers are set outside this view
 37541  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37542  	jmp	.LBB4_1438                      # otherwise continue at .LBB4_1438 (outside this view)
 37543  .LBB4_1443:                             # entry that starts the element index (rsi) at 0
 37544  	xor	esi, esi
 37545  .LBB4_1444:                             # optional vector step: 16 input bytes -> 16 x 16-bit, each (x != 0) ? 1 : 0
 37546  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37547  	je	.LBB4_1446
 37548  # %bb.1445:
 37549  	movq	xmm0, qword ptr [rcx + rsi]     # xmm0 = mem[0],zero
 37550  	movq	xmm1, qword ptr [rcx + rsi + 8] # xmm1 = mem[0],zero
 37551  	pxor	xmm2, xmm2                      # xmm2 = 0
 37552  	pcmpeqb	xmm0, xmm2                      # 0xFF where x == 0
 37553  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones
 37554  	pxor	xmm0, xmm3                      # invert: 0xFF where x != 0
 37555  	pmovzxbw	xmm0, xmm0                      # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 37556  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1]
 37557  	pand	xmm0, xmm4                      # 0xFF/0 -> 1/0 per 16-bit lane
 37558  	pcmpeqb	xmm1, xmm2
 37559  	pxor	xmm1, xmm3
 37560  	pmovzxbw	xmm1, xmm1                      # xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 37561  	pand	xmm1, xmm4
 37562  	movdqu	xmmword ptr [r8 + 2*rsi], xmm0
 37563  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm1
 37564  .LBB4_1446:                             # rejoin point after the optional step
 37565  	cmp	rdx, r10                        # both registers are set outside this view
 37566  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37567  	jmp	.LBB4_1447                      # otherwise continue at .LBB4_1447 (outside this view)
 37568  .LBB4_1451:                             # entry that starts the element index (rsi) at 0
 37569  	xor	esi, esi
 37570  .LBB4_1452:                             # optional vector step: 16 input bytes -> 16 x 16-bit, each (x != 0) ? 1 : 0
 37571  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37572  	je	.LBB4_1454
 37573  # %bb.1453:
 37574  	movq	xmm0, qword ptr [rcx + rsi]     # xmm0 = mem[0],zero
 37575  	movq	xmm1, qword ptr [rcx + rsi + 8] # xmm1 = mem[0],zero
 37576  	pxor	xmm2, xmm2                      # xmm2 = 0
 37577  	pcmpeqb	xmm0, xmm2                      # 0xFF where x == 0
 37578  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones
 37579  	pxor	xmm0, xmm3                      # invert: 0xFF where x != 0
 37580  	pmovzxbw	xmm0, xmm0                      # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 37581  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1]
 37582  	pand	xmm0, xmm4                      # 0xFF/0 -> 1/0 per 16-bit lane
 37583  	pcmpeqb	xmm1, xmm2
 37584  	pxor	xmm1, xmm3
 37585  	pmovzxbw	xmm1, xmm1                      # xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 37586  	pand	xmm1, xmm4
 37587  	movdqu	xmmword ptr [r8 + 2*rsi], xmm0
 37588  	movdqu	xmmword ptr [r8 + 2*rsi + 16], xmm1
 37589  .LBB4_1454:                             # rejoin point after the optional step
 37590  	cmp	rdx, r10                        # both registers are set outside this view
 37591  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37592  	jmp	.LBB4_1455                      # otherwise continue at .LBB4_1455 (outside this view)
 37593  .LBB4_1459:                             # entry that starts the element index (rsi) at 0
 37594  	xor	esi, esi
 37595  .LBB4_1460:                             # optional vector step: 4 x int8 -> 4 x 64-bit: 1 if x > 0, 0 if x == 0, else -1
 37596  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37597  	je	.LBB4_1462
 37598  # %bb.1461:
 37599  	movzx	eax, word ptr [rcx + rsi]       # load input bytes 0-1 (eax is scratch here)
 37600  	movd	xmm2, eax
 37601  	movzx	eax, word ptr [rcx + rsi + 2]   # load input bytes 2-3
 37602  	movd	xmm3, eax
 37603  	xorpd	xmm4, xmm4                      # xmm4 = 0
 37604  	movdqa	xmm0, xmm2
 37605  	pcmpgtb	xmm0, xmm4                      # signed x > 0 byte mask
 37606  	pmovsxbq	xmm0, xmm0                      # widen the two byte masks to 64-bit lanes
 37607  	movdqa	xmm1, xmm3
 37608  	pcmpgtb	xmm1, xmm4
 37609  	pmovsxbq	xmm1, xmm1
 37610  	pcmpeqb	xmm2, xmm4                      # x == 0 byte mask
 37611  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones
 37612  	pxor	xmm2, xmm5                      # invert: 0xFF where x != 0
 37613  	pmovsxbq	xmm2, xmm2                      # sign-extend: 64-bit -1 where x != 0, 0 where x == 0
 37614  	pcmpeqb	xmm3, xmm4
 37615  	pxor	xmm3, xmm5
 37616  	pmovsxbq	xmm3, xmm3
 37617  	movapd	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 37618  	blendvpd	xmm2, xmm4, xmm0                # implicit mask xmm0: take 1 where x > 0, else keep -1/0
 37619  	movdqa	xmm0, xmm1                      # blendvpd reads its mask from xmm0
 37620  	blendvpd	xmm3, xmm4, xmm0
 37621  	movupd	xmmword ptr [r8 + 8*rsi], xmm2
 37622  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm3
 37623  .LBB4_1462:                             # rejoin point after the optional step
 37624  	cmp	rdx, r10                        # both registers are set outside this view
 37625  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37626  	jmp	.LBB4_1463                      # otherwise continue at .LBB4_1463 (outside this view)
 37627  .LBB4_1468:                             # entry that starts the element index (rsi) at 0
 37628  	xor	esi, esi
 37629  .LBB4_1469:                             # optional vector step: 8 x int8 -> 8 x float32: 1.0 if x > 0, 0.0 if x == 0, else -1.0
 37630  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37631  	je	.LBB4_1471
 37632  # %bb.1470:
 37633  	movd	xmm2, dword ptr [rcx + rsi]     # xmm2 = mem[0],zero,zero,zero
 37634  	movd	xmm3, dword ptr [rcx + rsi + 4] # xmm3 = mem[0],zero,zero,zero
 37635  	xorps	xmm4, xmm4                      # xmm4 = 0
 37636  	movdqa	xmm0, xmm2
 37637  	pcmpgtb	xmm0, xmm4                      # signed x > 0 byte mask
 37638  	pmovsxbd	xmm0, xmm0                      # widen mask to 32-bit lanes
 37639  	movdqa	xmm1, xmm3
 37640  	pcmpgtb	xmm1, xmm4
 37641  	pmovsxbd	xmm1, xmm1
 37642  	pcmpeqb	xmm2, xmm4                      # x == 0 byte mask
 37643  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones
 37644  	pxor	xmm2, xmm5                      # invert: 0xFF where x != 0
 37645  	pmovsxbd	xmm2, xmm2                      # sign-extend: int32 -1 where x != 0, 0 where x == 0
 37646  	cvtdq2ps	xmm2, xmm2                      # convert to float: -1.0 or 0.0
 37647  	pcmpeqb	xmm3, xmm4
 37648  	pxor	xmm3, xmm5
 37649  	pmovsxbd	xmm3, xmm3
 37650  	cvtdq2ps	xmm3, xmm3
 37651  	movaps	xmm4, xmmword ptr [rip + .LCPI4_19] # xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 37652  	blendvps	xmm2, xmm4, xmm0                # implicit mask xmm0: take 1.0 where x > 0, else keep -1.0/0.0
 37653  	movdqa	xmm0, xmm1                      # blendvps reads its mask from xmm0
 37654  	blendvps	xmm3, xmm4, xmm0
 37655  	movups	xmmword ptr [r8 + 4*rsi], xmm2
 37656  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm3
 37657  .LBB4_1471:                             # rejoin point after the optional step
 37658  	cmp	rdx, rax                        # both registers are set outside this view
 37659  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37660  	jmp	.LBB4_1472                      # otherwise continue at .LBB4_1472 (outside this view)
 37661  .LBB4_1490:                             # entry that starts the element index (rsi) at 0
 37662  	xor	esi, esi
 37663  .LBB4_1491:                             # optional vector step: 4 x 64-bit inputs -> 4 x 64-bit, each (x != 0) ? 1 : 0
 37664  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37665  	je	.LBB4_1493
 37666  # %bb.1492:
 37667  	movdqu	xmm0, xmmword ptr [rcx + 8*rsi]
 37668  	movdqu	xmm1, xmmword ptr [rcx + 8*rsi + 16]
 37669  	pxor	xmm2, xmm2                      # xmm2 = 0
 37670  	pcmpeqq	xmm0, xmm2                      # all-ones in lanes where x == 0
 37671  	movdqa	xmm3, xmmword ptr [rip + .LCPI4_15] # xmm3 = [1,1]
 37672  	pandn	xmm0, xmm3                      # (~mask) & 1 -> 1 where x != 0, else 0
 37673  	pcmpeqq	xmm1, xmm2
 37674  	pandn	xmm1, xmm3
 37675  	movdqu	xmmword ptr [r8 + 8*rsi], xmm0
 37676  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm1
 37677  .LBB4_1493:                             # rejoin point after the optional step
 37678  	cmp	rdx, r10                        # both registers are set outside this view
 37679  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37680  	jmp	.LBB4_1494                      # otherwise continue at .LBB4_1494 (outside this view)
 37681  .LBB4_1498:                             # entry that starts the element index (rsi) at 0
 37682  	xor	esi, esi
 37683  .LBB4_1499:                             # optional vector step: 4 x int64 -> 4 x int64: 1 if x > 0, 0 if x == 0, else -1
 37684  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37685  	je	.LBB4_1501
 37686  # %bb.1500:
 37687  	movdqu	xmm1, xmmword ptr [rcx + 8*rsi]
 37688  	movdqu	xmm2, xmmword ptr [rcx + 8*rsi + 16]
 37689  	pxor	xmm3, xmm3                      # xmm3 = 0
 37690  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 37691  	movdqa	xmm0, xmm4
 37692  	pcmpgtq	xmm0, xmm1                      # mask where 1 > x, i.e. x <= 0 (signed)
 37693  	movdqa	xmm5, xmm1
 37694  	pcmpeqq	xmm5, xmm3                      # x == 0 mask
 37695  	pcmpeqd	xmm1, xmm1                      # xmm1 = all-ones (input copy no longer needed)
 37696  	pxor	xmm5, xmm1                      # invert: -1 where x != 0, 0 where x == 0
 37697  	pcmpeqq	xmm3, xmm2                      # same for the second pair
 37698  	pxor	xmm3, xmm1
 37699  	movdqa	xmm1, xmm4
 37700  	pcmpgtq	xmm1, xmm2                      # x <= 0 mask for the second pair
 37701  	movdqa	xmm2, xmm4                      # start from 1 in every lane
 37702  	blendvpd	xmm2, xmm5, xmm0                # implicit mask xmm0: where x <= 0 take -1/0, else keep 1
 37703  	movdqa	xmm0, xmm1                      # blendvpd reads its mask from xmm0
 37704  	blendvpd	xmm4, xmm3, xmm0
 37705  	movupd	xmmword ptr [r8 + 8*rsi], xmm2
 37706  	movupd	xmmword ptr [r8 + 8*rsi + 16], xmm4
 37707  .LBB4_1501:                             # rejoin point after the optional step
 37708  	cmp	rdx, r11                        # both registers are set outside this view
 37709  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37710  	jmp	.LBB4_1502                      # otherwise continue at .LBB4_1502 (outside this view)
 37711  .LBB4_1507:                             # entry that starts the element index (rsi) at 0
 37712  	xor	esi, esi
 37713  .LBB4_1508:                             # optional vector step: 4 input bytes -> 4 x 64-bit, each (x != 0) ? 1 : 0
 37714  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37715  	je	.LBB4_1510
 37716  # %bb.1509:
 37717  	movzx	eax, word ptr [rcx + rsi]       # load input bytes 0-1 (eax is scratch here)
 37718  	movd	xmm0, eax
 37719  	movzx	eax, word ptr [rcx + rsi + 2]   # load input bytes 2-3
 37720  	movd	xmm1, eax
 37721  	pxor	xmm2, xmm2                      # xmm2 = 0
 37722  	pcmpeqb	xmm0, xmm2                      # 0xFF where x == 0
 37723  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones
 37724  	pxor	xmm0, xmm3                      # invert: 0xFF where x != 0
 37725  	pmovzxbq	xmm0, xmm0                      # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 37726  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1]
 37727  	pand	xmm0, xmm4                      # 0xFF/0 -> 1/0 per 64-bit lane
 37728  	pcmpeqb	xmm1, xmm2
 37729  	pxor	xmm1, xmm3
 37730  	pmovzxbq	xmm1, xmm1                      # xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 37731  	pand	xmm1, xmm4
 37732  	movdqu	xmmword ptr [r8 + 8*rsi], xmm0
 37733  	movdqu	xmmword ptr [r8 + 8*rsi + 16], xmm1
 37734  .LBB4_1510:                             # rejoin point after the optional step
 37735  	cmp	rdx, r10                        # both registers are set outside this view
 37736  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37737  	jmp	.LBB4_1511                      # otherwise continue at .LBB4_1511 (outside this view)
 37738  .LBB4_1515:                             # entry that starts the element index (rsi) at 0
 37739  	xor	esi, esi
 37740  .LBB4_1516:                             # optional vector step: 8 input bytes -> 8 x float32, each (x != 0) ? 1.0 : 0.0
 37741  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37742  	je	.LBB4_1518
 37743  # %bb.1517:
 37744  	movd	xmm0, dword ptr [rcx + rsi]     # xmm0 = mem[0],zero,zero,zero
 37745  	movd	xmm1, dword ptr [rcx + rsi + 4] # xmm1 = mem[0],zero,zero,zero
 37746  	pxor	xmm2, xmm2                      # xmm2 = 0
 37747  	pcmpeqb	xmm0, xmm2                      # 0xFF where x == 0
 37748  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones
 37749  	pxor	xmm0, xmm3                      # invert: 0xFF where x != 0
 37750  	pmovzxbd	xmm0, xmm0                      # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 37751  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 37752  	pand	xmm0, xmm4                      # 0xFF/0 -> int32 1/0
 37753  	cvtdq2ps	xmm0, xmm0                      # int32 1/0 -> float 1.0/0.0
 37754  	pcmpeqb	xmm1, xmm2
 37755  	pxor	xmm1, xmm3
 37756  	pmovzxbd	xmm1, xmm1                      # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 37757  	pand	xmm1, xmm4
 37758  	cvtdq2ps	xmm1, xmm1
 37759  	movups	xmmword ptr [r8 + 4*rsi], xmm0
 37760  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm1
 37761  .LBB4_1518:                             # rejoin point after the optional step
 37762  	cmp	rdx, rax                        # both registers are set outside this view
 37763  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37764  	jmp	.LBB4_1519                      # otherwise continue at .LBB4_1519 (outside this view)
 37765  .LBB4_1535:                             # entry that starts the element index (rsi) at 0
 37766  	xor	esi, esi
 37767  .LBB4_1536:                             # optional vector step: 8 x 32-bit inputs -> 8 bytes, each (x != 0) ? 1 : 0
 37768  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37769  	je	.LBB4_1538
 37770  # %bb.1537:
 37771  	movdqu	xmm0, xmmword ptr [rcx + 4*rsi]
 37772  	movdqu	xmm1, xmmword ptr [rcx + 4*rsi + 16]
 37773  	pxor	xmm2, xmm2                      # xmm2 = 0
 37774  	pcmpeqd	xmm0, xmm2                      # all-ones in lanes where x == 0
 37775  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones
 37776  	pxor	xmm0, xmm3                      # invert: all-ones where x != 0
 37777  	packssdw	xmm0, xmm0                      # narrow 32-bit lane masks to bytes
 37778  	packsswb	xmm0, xmm0
 37779  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_12] # xmm4 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u>
 37780  	pand	xmm0, xmm4                      # 0xFF/0x00 mask -> 1/0
 37781  	pcmpeqd	xmm1, xmm2                      # same sequence for the second 4 inputs
 37782  	pxor	xmm1, xmm3
 37783  	packssdw	xmm1, xmm1
 37784  	packsswb	xmm1, xmm1
 37785  	pand	xmm1, xmm4
 37786  	movd	dword ptr [r8 + rsi], xmm0      # store result bytes 0-3
 37787  	movd	dword ptr [r8 + rsi + 4], xmm1  # store result bytes 4-7
 37788  .LBB4_1538:                             # rejoin point after the optional step
 37789  	cmp	rdx, rax                        # both registers are set outside this view
 37790  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37791  	jmp	.LBB4_1539                      # otherwise continue at .LBB4_1539 (outside this view)
 37792  .LBB4_1543:                             # entry that starts the element index (rsi) at 0
 37793  	xor	esi, esi
 37794  .LBB4_1544:                             # optional vector step: 4 x float64 -> 4 bytes: 0 if x == 0.0, else +1/-1 taken from the sign bit of x
 37795  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37796  	je	.LBB4_1546
 37797  # %bb.1545:
 37798  	movupd	xmm3, xmmword ptr [rcx + 8*rsi]
 37799  	movupd	xmm4, xmmword ptr [rcx + 8*rsi + 16]
 37800  	xorpd	xmm2, xmm2                      # xmm2 = 0.0
 37801  	movapd	xmm0, xmm3
 37802  	cmpeqpd	xmm0, xmm2                      # x == 0.0 mask, first pair
 37803  	packssdw	xmm0, xmm0                      # three saturating packs narrow each 64-bit lane mask to one byte
 37804  	packssdw	xmm0, xmm0
 37805  	packsswb	xmm0, xmm0
 37806  	movapd	xmm1, xmm4
 37807  	cmpeqpd	xmm1, xmm2                      # x == 0.0 mask, second pair
 37808  	packssdw	xmm1, xmm1
 37809  	packssdw	xmm1, xmm1
 37810  	packsswb	xmm1, xmm1
 37811  	movapd	xmm5, xmmword ptr [rip + .LCPI4_0] # xmm5 = [-0.0E+0,-0.0E+0]
 37812  	andpd	xmm3, xmm5                      # keep only the sign bit of x
 37813  	movapd	xmm6, xmmword ptr [rip + .LCPI4_1] # xmm6 = [1.0E+0,1.0E+0]
 37814  	orpd	xmm3, xmm6                      # 1.0 carrying x's sign bit: +1.0 or -1.0
 37815  	andpd	xmm4, xmm5
 37816  	orpd	xmm4, xmm6
 37817  	cvttpd2dq	xmm3, xmm3                      # +/-1.0 -> int32 +/-1 in the low two dwords
 37818  	movdqa	xmm5, xmmword ptr [rip + .LCPI4_7] # xmm5 = <0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 37819  	pshufb	xmm3, xmm5                      # gather the low byte of each int32 into bytes 0-1
 37820  	cvttpd2dq	xmm4, xmm4
 37821  	pshufb	xmm4, xmm5
 37822  	pblendvb	xmm3, xmm2, xmm0                # implicit mask xmm0: force 0 where x == 0.0
 37823  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 37824  	pblendvb	xmm4, xmm2, xmm0
 37825  	pextrw	word ptr [r8 + rsi], xmm3, 0    # store result bytes 0-1
 37826  	pextrw	word ptr [r8 + rsi + 2], xmm4, 0 # store result bytes 2-3
 37827  .LBB4_1546:                             # rejoin point after the optional step
 37828  	cmp	rdx, rax                        # both registers are set outside this view
 37829  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37830  	jmp	.LBB4_1547                      # otherwise continue at .LBB4_1547 (outside this view)
 37831  .LBB4_1552:                             # entry that starts the element index (rax) at 0
 37832  	xor	eax, eax
 37833  .LBB4_1553:                             # optional vector step: 32 x int8 -> 32 x int8: 1 if x > 0, 0 if x == 0, else -1
 37834  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37835  	je	.LBB4_1555
 37836  # %bb.1554:
 37837  	movdqu	xmm1, xmmword ptr [rcx + rax]
 37838  	movdqu	xmm2, xmmword ptr [rcx + rax + 16]
 37839  	pxor	xmm3, xmm3                      # xmm3 = 0
 37840  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_22] # xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 37841  	movdqa	xmm0, xmm4
 37842  	pcmpgtb	xmm0, xmm1                      # mask where 1 > x, i.e. x <= 0 (signed)
 37843  	movdqa	xmm5, xmm1
 37844  	pcmpeqb	xmm5, xmm3                      # x == 0 mask
 37845  	pcmpeqd	xmm1, xmm1                      # xmm1 = all-ones (input copy no longer needed)
 37846  	pxor	xmm5, xmm1                      # invert: -1 where x != 0, 0 where x == 0
 37847  	pcmpeqb	xmm3, xmm2                      # same for the second 16 inputs
 37848  	pxor	xmm3, xmm1
 37849  	movdqa	xmm1, xmm4
 37850  	pcmpgtb	xmm1, xmm2                      # x <= 0 mask for the second 16 inputs
 37851  	movdqa	xmm2, xmm4                      # start from 1 in every lane
 37852  	pblendvb	xmm2, xmm5, xmm0                # implicit mask xmm0: where x <= 0 take -1/0, else keep 1
 37853  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 37854  	pblendvb	xmm4, xmm3, xmm0
 37855  	movdqu	xmmword ptr [r8 + rax], xmm2
 37856  	movdqu	xmmword ptr [r8 + rax + 16], xmm4
 37857  .LBB4_1555:                             # rejoin point after the optional step
 37858  	cmp	rsi, r10                        # both registers are set outside this view
 37859  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37860  	jmp	.LBB4_1556                      # otherwise continue at .LBB4_1556 (outside this view)
 37861  .LBB4_1561:                             # entry that starts the element index (rsi) at 0
 37862  	xor	esi, esi
 37863  .LBB4_1562:                             # optional vector step: 4 x 64-bit inputs -> 4 bytes, each (x != 0) ? 1 : 0
 37864  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37865  	je	.LBB4_1564
 37866  # %bb.1563:
 37867  	movdqu	xmm0, xmmword ptr [rcx + 8*rsi]
 37868  	movdqu	xmm1, xmmword ptr [rcx + 8*rsi + 16]
 37869  	pxor	xmm2, xmm2                      # xmm2 = 0
 37870  	pcmpeqq	xmm0, xmm2                      # all-ones in lanes where x == 0
 37871  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones
 37872  	pxor	xmm0, xmm3                      # invert: all-ones where x != 0
 37873  	packssdw	xmm0, xmm0                      # three saturating packs narrow each 64-bit lane mask to one byte
 37874  	packssdw	xmm0, xmm0
 37875  	packsswb	xmm0, xmm0
 37876  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_18] # xmm4 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 37877  	pand	xmm0, xmm4                      # 0xFF/0x00 mask -> 1/0
 37878  	pcmpeqq	xmm1, xmm2                      # same sequence for the second pair of inputs
 37879  	pxor	xmm1, xmm3
 37880  	packssdw	xmm1, xmm1
 37881  	packssdw	xmm1, xmm1
 37882  	packsswb	xmm1, xmm1
 37883  	pextrw	word ptr [r8 + rsi], xmm0, 0    # store result bytes 0-1
 37884  	pand	xmm1, xmm4
 37885  	pextrw	word ptr [r8 + rsi + 2], xmm1, 0 # store result bytes 2-3
 37886  .LBB4_1564:                             # rejoin point after the optional step
 37887  	cmp	rdx, rax                        # both registers are set outside this view
 37888  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37889  	jmp	.LBB4_1565                      # otherwise continue at .LBB4_1565 (outside this view)
 37890  .LBB4_1569:                             # entry that starts the element index (rsi) at 0
 37891  	xor	esi, esi
 37892  .LBB4_1570:                             # optional vector step: 16 x 16-bit inputs -> 16 bytes, each (x != 0) ? 1 : 0
 37893  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37894  	je	.LBB4_1572
 37895  # %bb.1571:
 37896  	movdqu	xmm0, xmmword ptr [rcx + 2*rsi]
 37897  	movdqu	xmm1, xmmword ptr [rcx + 2*rsi + 16]
 37898  	pxor	xmm2, xmm2                      # xmm2 = 0
 37899  	pcmpeqw	xmm0, xmm2                      # all-ones in lanes where x == 0
 37900  	pcmpeqd	xmm3, xmm3                      # xmm3 = all-ones
 37901  	pxor	xmm0, xmm3                      # invert: all-ones where x != 0
 37902  	packsswb	xmm0, xmm0                      # narrow the 16-bit lane masks to bytes
 37903  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_21] # xmm4 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u>
 37904  	pand	xmm0, xmm4                      # 0xFF/0x00 mask -> 1/0
 37905  	pcmpeqw	xmm1, xmm2                      # same sequence for the second 8 inputs
 37906  	pxor	xmm1, xmm3
 37907  	packsswb	xmm1, xmm1
 37908  	pand	xmm1, xmm4
 37909  	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
 37910  	movdqu	xmmword ptr [r8 + rsi], xmm0    # store all 16 result bytes
 37911  .LBB4_1572:                             # rejoin point after the optional step
 37912  	cmp	rdx, rax                        # both registers are set outside this view
 37913  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37914  	jmp	.LBB4_1573                      # otherwise continue at .LBB4_1573 (outside this view)
 37915  .LBB4_1577:                             # entry that starts the element index (rax) at 0
 37916  	xor	eax, eax
 37917  .LBB4_1578:                             # optional vector step: 16 x int16 -> 16 bytes: 1 if x > 0, 0 if x == 0, else 0xFF (-1)
 37918  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37919  	je	.LBB4_1580
 37920  # %bb.1579:
 37921  	movdqu	xmm2, xmmword ptr [rcx + 2*rax]
 37922  	movdqu	xmm3, xmmword ptr [rcx + 2*rax + 16]
 37923  	pxor	xmm4, xmm4                      # xmm4 = 0
 37924  	movdqa	xmm0, xmm2
 37925  	pcmpgtw	xmm0, xmm4                      # signed x > 0 mask, first 8 inputs
 37926  	packsswb	xmm0, xmm0                      # narrow mask to bytes
 37927  	movdqa	xmm1, xmm3
 37928  	pcmpgtw	xmm1, xmm4                      # signed x > 0 mask, second 8 inputs
 37929  	packsswb	xmm1, xmm1
 37930  	pcmpeqw	xmm2, xmm4                      # x == 0 mask
 37931  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones
 37932  	pxor	xmm2, xmm5                      # invert: 0xFFFF where x != 0, 0 where x == 0
 37933  	packsswb	xmm2, xmm2                      # bytes: 0xFF (-1) or 0
 37934  	pcmpeqw	xmm3, xmm4
 37935  	pxor	xmm3, xmm5
 37936  	packsswb	xmm3, xmm3
 37937  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_21] # xmm4 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u>
 37938  	pblendvb	xmm2, xmm4, xmm0                # implicit mask xmm0: take 1 where x > 0, else keep -1/0
 37939  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 37940  	pblendvb	xmm3, xmm4, xmm0
 37941  	punpcklqdq	xmm2, xmm3              # xmm2 = xmm2[0],xmm3[0]
 37942  	movdqu	xmmword ptr [r8 + rax], xmm2    # store all 16 result bytes
 37943  .LBB4_1580:                             # rejoin point after the optional step
 37944  	cmp	rsi, r10                        # both registers are set outside this view
 37945  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37946  	jmp	.LBB4_1581                      # otherwise continue at .LBB4_1581 (outside this view)
 37947  .LBB4_1586:                             # entry that starts the element index (rax) at 0
 37948  	xor	eax, eax
 37949  .LBB4_1587:                             # optional vector step: 4 x int64 -> 4 bytes: 1 if x > 0, 0 if x == 0, else 0xFF (-1)
 37950  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37951  	je	.LBB4_1589
 37952  # %bb.1588:
 37953  	movdqu	xmm2, xmmword ptr [rcx + 8*rax]
 37954  	movdqu	xmm3, xmmword ptr [rcx + 8*rax + 16]
 37955  	pxor	xmm4, xmm4                      # xmm4 = 0
 37956  	movdqa	xmm0, xmm2
 37957  	pcmpgtq	xmm0, xmm4                      # signed x > 0 mask, first pair
 37958  	packssdw	xmm0, xmm0                      # three saturating packs narrow each 64-bit lane mask to one byte
 37959  	packssdw	xmm0, xmm0
 37960  	packsswb	xmm0, xmm0
 37961  	movdqa	xmm1, xmm3
 37962  	pcmpgtq	xmm1, xmm4                      # signed x > 0 mask, second pair
 37963  	packssdw	xmm1, xmm1
 37964  	packssdw	xmm1, xmm1
 37965  	packsswb	xmm1, xmm1
 37966  	pcmpeqq	xmm2, xmm4                      # x == 0 mask
 37967  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones
 37968  	pxor	xmm2, xmm5                      # invert: all-ones where x != 0
 37969  	packssdw	xmm2, xmm2                      # narrow to bytes: 0xFF (-1) or 0
 37970  	packssdw	xmm2, xmm2
 37971  	packsswb	xmm2, xmm2
 37972  	pcmpeqq	xmm3, xmm4
 37973  	pxor	xmm3, xmm5
 37974  	packssdw	xmm3, xmm3
 37975  	packssdw	xmm3, xmm3
 37976  	packsswb	xmm3, xmm3
 37977  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_18] # xmm4 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 37978  	pblendvb	xmm2, xmm4, xmm0                # implicit mask xmm0: take 1 where x > 0, else keep -1/0
 37979  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 37980  	pblendvb	xmm3, xmm4, xmm0
 37981  	pextrw	word ptr [r8 + rax], xmm2, 0    # store result bytes 0-1
 37982  	pextrw	word ptr [r8 + rax + 2], xmm3, 0 # store result bytes 2-3
 37983  .LBB4_1589:                             # rejoin point after the optional step
 37984  	cmp	rsi, r10                        # both registers are set outside this view
 37985  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 37986  	jmp	.LBB4_1590                      # otherwise continue at .LBB4_1590 (outside this view)
 37987  .LBB4_1595:                             # entry that starts the element index (rsi) at 0
 37988  	xor	esi, esi
 37989  .LBB4_1596:                             # optional vector step: 8 x float32 -> 8 bytes: 0 if x == 0.0, else 1 (sign bit clear) or 0xFF (sign bit set)
 37990  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 37991  	je	.LBB4_1598
 37992  # %bb.1597:
 37993  	movups	xmm0, xmmword ptr [rcx + 4*rsi]
 37994  	movups	xmm1, xmmword ptr [rcx + 4*rsi + 16]
 37995  	xorps	xmm4, xmm4                      # xmm4 = 0.0
 37996  	movaps	xmm2, xmm0
 37997  	cmpeqps	xmm2, xmm4                      # x == 0.0 mask (first 4 inputs)
 37998  	packssdw	xmm2, xmm2                      # narrow 32-bit lane masks to bytes
 37999  	packsswb	xmm2, xmm2
 38000  	movaps	xmm3, xmm1
 38001  	cmpeqps	xmm3, xmm4                      # x == 0.0 mask (second 4 inputs)
 38002  	packssdw	xmm3, xmm3
 38003  	packsswb	xmm3, xmm3
 38004  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones (-1 in every lane)
 38005  	pcmpgtd	xmm0, xmm5                      # raw bits > -1 as signed int32, i.e. sign bit clear
 38006  	packssdw	xmm0, xmm0
 38007  	packsswb	xmm0, xmm0
 38008  	pcmpgtd	xmm1, xmm5
 38009  	packssdw	xmm1, xmm1
 38010  	packsswb	xmm1, xmm1
 38011  	movdqa	xmm6, xmmword ptr [rip + .LCPI4_12] # xmm6 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u>
 38012  	pcmpeqd	xmm7, xmm7                      # xmm7 = all-ones: default result byte 0xFF
 38013  	pblendvb	xmm7, xmm6, xmm0                # implicit mask xmm0: 1 where sign bit clear, else keep 0xFF
 38014  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 38015  	pblendvb	xmm5, xmm6, xmm0                # xmm5 (still all-ones) plays the same role for the second half
 38016  	movdqa	xmm0, xmm2
 38017  	pblendvb	xmm7, xmm4, xmm0                # force 0 where x == 0.0
 38018  	movdqa	xmm0, xmm3
 38019  	pblendvb	xmm5, xmm4, xmm0
 38020  	movd	dword ptr [r8 + rsi], xmm7      # store result bytes 0-3
 38021  	movd	dword ptr [r8 + rsi + 4], xmm5  # store result bytes 4-7
 38022  .LBB4_1598:                             # rejoin point after the optional step
 38023  	cmp	rdx, r10                        # both registers are set outside this view
 38024  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 38025  	jmp	.LBB4_1599                      # otherwise continue at .LBB4_1599 (outside this view)
 38026  .LBB4_1604:                             # entry that starts the element index (rsi) at 0
 38027  	xor	esi, esi
 38028  .LBB4_1605:                             # optional vector step: 32 input bytes -> 32 bytes, each (x != 0) ? 1 : 0
 38029  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 38030  	je	.LBB4_1607
 38031  # %bb.1606:
 38032  	movdqu	xmm0, xmmword ptr [rcx + rsi]
 38033  	movdqu	xmm1, xmmword ptr [rcx + rsi + 16]
 38034  	pxor	xmm2, xmm2                      # xmm2 = 0
 38035  	pcmpeqb	xmm0, xmm2                      # 0xFF where x == 0
 38036  	movdqa	xmm3, xmmword ptr [rip + .LCPI4_22] # xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 38037  	pandn	xmm0, xmm3                      # (~mask) & 1 -> 1 where x != 0, else 0
 38038  	pcmpeqb	xmm1, xmm2
 38039  	pandn	xmm1, xmm3
 38040  	movdqu	xmmword ptr [r8 + rsi], xmm0
 38041  	movdqu	xmmword ptr [r8 + rsi + 16], xmm1
 38042  .LBB4_1607:                             # rejoin point after the optional step
 38043  	cmp	rdx, rax                        # both registers are set outside this view
 38044  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 38045  	jmp	.LBB4_1608                      # otherwise continue at .LBB4_1608 (outside this view)
 38046  .LBB4_1612:                             # entry that starts the element index (rax) at 0
 38047  	xor	eax, eax
 38048  .LBB4_1613:                             # optional vector step: 8 x int32 -> 8 bytes: 1 if x > 0, 0 if x == 0, else 0xFF (-1)
 38049  	test	r9b, 1                          # r9 is set outside this view; bit 0 clear skips the step
 38050  	je	.LBB4_1615
 38051  # %bb.1614:
 38052  	movdqu	xmm2, xmmword ptr [rcx + 4*rax]
 38053  	movdqu	xmm3, xmmword ptr [rcx + 4*rax + 16]
 38054  	pxor	xmm4, xmm4                      # xmm4 = 0
 38055  	movdqa	xmm0, xmm2
 38056  	pcmpgtd	xmm0, xmm4                      # signed x > 0 mask, first 4 inputs
 38057  	packssdw	xmm0, xmm0                      # narrow 32-bit lane masks to bytes
 38058  	packsswb	xmm0, xmm0
 38059  	movdqa	xmm1, xmm3
 38060  	pcmpgtd	xmm1, xmm4                      # signed x > 0 mask, second 4 inputs
 38061  	packssdw	xmm1, xmm1
 38062  	packsswb	xmm1, xmm1
 38063  	pcmpeqd	xmm2, xmm4                      # x == 0 mask
 38064  	pcmpeqd	xmm5, xmm5                      # xmm5 = all-ones
 38065  	pxor	xmm2, xmm5                      # invert: all-ones where x != 0
 38066  	packssdw	xmm2, xmm2                      # bytes: 0xFF (-1) or 0
 38067  	packsswb	xmm2, xmm2
 38068  	pcmpeqd	xmm3, xmm4
 38069  	pxor	xmm3, xmm5
 38070  	packssdw	xmm3, xmm3
 38071  	packsswb	xmm3, xmm3
 38072  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_12] # xmm4 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u>
 38073  	pblendvb	xmm2, xmm4, xmm0                # implicit mask xmm0: take 1 where x > 0, else keep -1/0
 38074  	movdqa	xmm0, xmm1                      # pblendvb reads its mask from xmm0
 38075  	pblendvb	xmm3, xmm4, xmm0
 38076  	movd	dword ptr [r8 + rax], xmm2      # store result bytes 0-3
 38077  	movd	dword ptr [r8 + rax + 4], xmm3  # store result bytes 4-7
 38078  .LBB4_1615:                             # rejoin point after the optional step
 38079  	cmp	rsi, r10                        # both registers are set outside this view
 38080  	je	.LBB4_1655                      # equal: go to .LBB4_1655 (outside this view; presumably nothing left to do)
 38081  	jmp	.LBB4_1616                      # otherwise continue at .LBB4_1616 (outside this view)
 38082  .LBB4_1621:                             # entry that starts this step at element index 0
 38083  	xor	esi, esi                        # rsi = 0, dword index into both rcx (input) and r8 (output)
 38084  .LBB4_1622:                             # optional single step over 8 dwords: out dword = 1 if in dword is nonzero, else 0
 38085  	test	r9b, 1                          # step runs only when bit 0 of r9 is set - presumably the odd leftover of a 2x-unrolled vector loop (TODO confirm)
 38086  	je	.LBB4_1624
 38087  # %bb.1623:
 38088  	movdqu	xmm0, xmmword ptr [rcx + 4*rsi] # first 4 input dwords
 38089  	movdqu	xmm1, xmmword ptr [rcx + 4*rsi + 16] # next 4 input dwords
 38090  	pxor	xmm2, xmm2                      # xmm2 = all zero, the comparison operand
 38091  	pcmpeqd	xmm0, xmm2                      # all-ones in each lane whose input dword equals zero
 38092  	movdqa	xmm3, xmmword ptr [rip + .LCPI4_8] # xmm3 = [1,1,1,1]
 38093  	pandn	xmm0, xmm3                      # (NOT is_zero) AND 1 -> 1 for nonzero dwords, 0 for zero dwords
 38094  	pcmpeqd	xmm1, xmm2                      # same computation for the second vector
 38095  	pandn	xmm1, xmm3
 38096  	movdqu	xmmword ptr [r8 + 4*rsi], xmm0  # store the 8 result dwords at the same index in r8
 38097  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm1
 38098  .LBB4_1624:
 38099  	cmp	rdx, r10                        # NOTE(review): looks like "elements covered by vector code == total" - both registers are set outside this view
 38100  	je	.LBB4_1655                      # equal -> leave via .LBB4_1655
 38101  	jmp	.LBB4_1625                      # otherwise continue at .LBB4_1625 (presumably the element-wise remainder, TODO confirm)
 38102  .LBB4_1629:                             # entry that starts this step at element index 0
 38103  	xor	esi, esi                        # rsi = 0, element index (unscaled for byte input, scaled by 4 for dword output)
 38104  .LBB4_1630:                             # optional single step: 8 signed bytes in -> 8 sign dwords out (1, 0, or -1)
 38105  	test	r9b, 1                          # step runs only when bit 0 of r9 is set - presumably the odd leftover of a 2x-unrolled vector loop (TODO confirm)
 38106  	je	.LBB4_1632
 38107  # %bb.1631:
 38108  	movd	xmm2, dword ptr [rcx + rsi]     # xmm2 = mem[0],zero,zero,zero
 38109  	movd	xmm3, dword ptr [rcx + rsi + 4] # xmm3 = mem[0],zero,zero,zero
 38110  	xorps	xmm4, xmm4                      # xmm4 = all zero
 38111  	movdqa	xmm0, xmm2
 38112  	pcmpgtb	xmm0, xmm4                      # signed compare: 0xFF in byte lanes where input > 0
 38113  	pmovsxbd	xmm0, xmm0                      # sign-extend the low 4 mask bytes to dword masks (0 or -1)
 38114  	movdqa	xmm1, xmm3
 38115  	pcmpgtb	xmm1, xmm4                      # "positive" mask for the second group of 4 bytes
 38116  	pmovsxbd	xmm1, xmm1
 38117  	pcmpeqb	xmm2, xmm4                      # 0xFF where input byte == 0
 38118  	pcmpeqd	xmm5, xmm5                      # xmm5 = all ones
 38119  	pxor	xmm2, xmm5                      # invert -> 0xFF where input byte != 0
 38120  	pmovsxbd	xmm2, xmm2                      # widen the low 4 bytes: -1 for nonzero input, 0 for zero input
 38121  	pcmpeqb	xmm3, xmm4                      # same "nonzero" computation for the second group
 38122  	pxor	xmm3, xmm5
 38123  	pmovsxbd	xmm3, xmm3
 38124  	movaps	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 38125  	blendvps	xmm2, xmm4, xmm0                # positive lanes take 1, other lanes keep -1 (negative) or 0 (zero)
 38126  	movdqa	xmm0, xmm1                      # blendvps reads its mask from xmm0, so move the second mask there
 38127  	blendvps	xmm3, xmm4, xmm0
 38128  	movups	xmmword ptr [r8 + 4*rsi], xmm2  # store result dwords for elements rsi .. rsi+3
 38129  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm3 # store result dwords for elements rsi+4 .. rsi+7
 38130  .LBB4_1632:
 38131  	cmp	rdx, r10                        # NOTE(review): looks like "elements covered by vector code == total" - both registers are set outside this view
 38132  	je	.LBB4_1655                      # equal -> leave via .LBB4_1655
 38133  	jmp	.LBB4_1633                      # otherwise continue at .LBB4_1633 (presumably the element-wise remainder, TODO confirm)
 38134  .LBB4_1638:                             # entry that starts this step at element index 0
 38135  	xor	esi, esi                        # rsi = 0, element index (unscaled for byte input, scaled by 4 for dword output)
 38136  .LBB4_1639:                             # optional single step: 8 bytes in -> 8 dwords out, 1 if the byte is nonzero, else 0
 38137  	test	r9b, 1                          # step runs only when bit 0 of r9 is set - presumably the odd leftover of a 2x-unrolled vector loop (TODO confirm)
 38138  	je	.LBB4_1641
 38139  # %bb.1640:
 38140  	movd	xmm0, dword ptr [rcx + rsi]     # xmm0 = mem[0],zero,zero,zero
 38141  	movd	xmm1, dword ptr [rcx + rsi + 4] # xmm1 = mem[0],zero,zero,zero
 38142  	pxor	xmm2, xmm2                      # xmm2 = all zero, the comparison operand
 38143  	pcmpeqb	xmm0, xmm2                      # 0xFF where input byte == 0
 38144  	pcmpeqd	xmm3, xmm3                      # xmm3 = all ones
 38145  	pxor	xmm0, xmm3                      # invert -> 0xFF where input byte != 0
 38146  	pmovzxbd	xmm0, xmm0                      # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 38147  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 38148  	pand	xmm0, xmm4                      # reduce 0xFF / 0 to 1 / 0 in each dword
 38149  	pcmpeqb	xmm1, xmm2                      # same computation for the second group of 4 bytes
 38150  	pxor	xmm1, xmm3
 38151  	pmovzxbd	xmm1, xmm1                      # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 38152  	pand	xmm1, xmm4
 38153  	movdqu	xmmword ptr [r8 + 4*rsi], xmm0  # store result dwords for elements rsi .. rsi+3
 38154  	movdqu	xmmword ptr [r8 + 4*rsi + 16], xmm1 # store result dwords for elements rsi+4 .. rsi+7
 38155  .LBB4_1641:
 38156  	cmp	rdx, r10                        # NOTE(review): looks like "elements covered by vector code == total" - both registers are set outside this view
 38157  	je	.LBB4_1655                      # equal -> leave via .LBB4_1655
 38158  	jmp	.LBB4_1642                      # otherwise continue at .LBB4_1642 (presumably the element-wise remainder, TODO confirm)
 38159  .LBB4_1646:                             # entry that starts this step at element index 0
 38160  	xor	esi, esi                        # rsi = 0, dword index into both rcx (input) and r8 (output)
 38161  .LBB4_1647:                             # optional single step: 8 signed dwords in -> 8 sign dwords out (1, 0, or -1)
 38162  	test	r9b, 1                          # step runs only when bit 0 of r9 is set - presumably the odd leftover of a 2x-unrolled vector loop (TODO confirm)
 38163  	je	.LBB4_1649
 38164  # %bb.1648:
 38165  	movdqu	xmm1, xmmword ptr [rcx + 4*rsi] # first 4 input dwords
 38166  	movdqu	xmm2, xmmword ptr [rcx + 4*rsi + 16] # next 4 input dwords
 38167  	pxor	xmm3, xmm3                      # xmm3 = all zero
 38168  	movdqa	xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1]
 38169  	movdqa	xmm0, xmm4
 38170  	pcmpgtd	xmm0, xmm1                      # signed compare 1 > input: mask of lanes where input <= 0
 38171  	movdqa	xmm5, xmm1
 38172  	pcmpeqd	xmm5, xmm3                      # all-ones where input == 0
 38173  	pcmpeqd	xmm1, xmm1                      # xmm1 = all ones (the first input vector is no longer needed)
 38174  	pxor	xmm5, xmm1                      # invert -> -1 where input != 0, 0 where input == 0
 38175  	pcmpeqd	xmm3, xmm2                      # all-ones where the second vector's input == 0
 38176  	pxor	xmm3, xmm1                      # invert -> -1 where input != 0, 0 where input == 0
 38177  	movdqa	xmm1, xmm4
 38178  	pcmpgtd	xmm1, xmm2                      # mask of lanes where the second vector's input <= 0
 38179  	movdqa	xmm2, xmm4                      # start the first result from all 1s
 38180  	blendvps	xmm2, xmm5, xmm0                # lanes with input <= 0 take -1 (negative) or 0 (zero), positive lanes keep 1
 38181  	movdqa	xmm0, xmm1                      # blendvps reads its mask from xmm0, so move the second mask there
 38182  	blendvps	xmm4, xmm3, xmm0                # same selection for the second vector, reusing xmm4 = 1s as the base
 38183  	movups	xmmword ptr [r8 + 4*rsi], xmm2  # store result dwords for elements rsi .. rsi+3
 38184  	movups	xmmword ptr [r8 + 4*rsi + 16], xmm4 # store result dwords for elements rsi+4 .. rsi+7
 38185  .LBB4_1649:
 38186  	cmp	rdx, r11                        # NOTE(review): looks like "elements covered by vector code == total" - both registers are set outside this view
 38187  	je	.LBB4_1655                      # equal -> leave via .LBB4_1655
 38188  	jmp	.LBB4_1650                      # otherwise continue at .LBB4_1650 (presumably the element-wise remainder, TODO confirm)
 38189  .Lfunc_end4:
 38190  	.size	arithmetic_unary_diff_type_sse4, .Lfunc_end4-arithmetic_unary_diff_type_sse4
 38191                                          # -- End function
 38192  	.ident	"Ubuntu clang version 11.1.0-6"
 38193  	.section	".note.GNU-stack","",@progbits
 38194  	.addrsig