github.com/apache/arrow/go/v16@v16.1.0/arrow/compute/internal/kernels/_lib/base_arithmetic_sse4_amd64.s (about) 1 .text 2 .intel_syntax noprefix 3 .file "base_arithmetic.cc" 4 .section .rodata.cst16,"aM",@progbits,16 5 .p2align 4 # -- Begin function arithmetic_binary_sse4 6 .LCPI0_0: 7 .short 255 # 0xff 8 .short 255 # 0xff 9 .short 255 # 0xff 10 .short 255 # 0xff 11 .short 255 # 0xff 12 .short 255 # 0xff 13 .short 255 # 0xff 14 .short 255 # 0xff 15 .text 16 .globl arithmetic_binary_sse4 17 .p2align 4, 0x90 18 .type arithmetic_binary_sse4,@function 19 arithmetic_binary_sse4: # @arithmetic_binary_sse4 20 # %bb.0: 21 push rbp 22 mov rbp, rsp 23 and rsp, -8 24 cmp sil, 20 25 jg .LBB0_11 26 # %bb.1: 27 test sil, sil 28 je .LBB0_21 29 # %bb.2: 30 cmp sil, 1 31 je .LBB0_367 32 # %bb.3: 33 cmp sil, 2 34 jne .LBB0_1013 35 # %bb.4: 36 cmp edi, 6 37 jg .LBB0_719 38 # %bb.5: 39 cmp edi, 3 40 jle .LBB0_6 41 # %bb.713: 42 cmp edi, 4 43 je .LBB0_760 44 # %bb.714: 45 cmp edi, 5 46 je .LBB0_776 47 # %bb.715: 48 cmp edi, 6 49 jne .LBB0_1013 50 # %bb.716: 51 test r9d, r9d 52 jle .LBB0_1013 53 # %bb.717: 54 mov r10d, r9d 55 cmp r9d, 8 56 jae .LBB0_792 57 # %bb.718: 58 xor esi, esi 59 .LBB0_801: 60 mov r9, rsi 61 not r9 62 add r9, r10 63 mov rdi, r10 64 and rdi, 3 65 je .LBB0_803 66 .LBB0_802: # =>This Inner Loop Header: Depth=1 67 mov eax, dword ptr [rcx + 4*rsi] 68 imul eax, dword ptr [rdx + 4*rsi] 69 mov dword ptr [r8 + 4*rsi], eax 70 add rsi, 1 71 add rdi, -1 72 jne .LBB0_802 73 .LBB0_803: 74 cmp r9, 3 75 jb .LBB0_1013 76 .LBB0_804: # =>This Inner Loop Header: Depth=1 77 mov eax, dword ptr [rcx + 4*rsi] 78 imul eax, dword ptr [rdx + 4*rsi] 79 mov dword ptr [r8 + 4*rsi], eax 80 mov eax, dword ptr [rcx + 4*rsi + 4] 81 imul eax, dword ptr [rdx + 4*rsi + 4] 82 mov dword ptr [r8 + 4*rsi + 4], eax 83 mov eax, dword ptr [rcx + 4*rsi + 8] 84 imul eax, dword ptr [rdx + 4*rsi + 8] 85 mov dword ptr [r8 + 4*rsi + 8], eax 86 mov eax, dword ptr [rcx + 4*rsi + 12] 87 imul eax, dword ptr [rdx + 
4*rsi + 12] 88 mov dword ptr [r8 + 4*rsi + 12], eax 89 add rsi, 4 90 cmp r10, rsi 91 jne .LBB0_804 92 jmp .LBB0_1013 93 .LBB0_11: 94 cmp sil, 21 95 je .LBB0_194 96 # %bb.12: 97 cmp sil, 22 98 je .LBB0_540 99 # %bb.13: 100 cmp sil, 23 101 jne .LBB0_1013 102 # %bb.14: 103 cmp edi, 6 104 jg .LBB0_869 105 # %bb.15: 106 cmp edi, 3 107 jle .LBB0_16 108 # %bb.863: 109 cmp edi, 4 110 je .LBB0_910 111 # %bb.864: 112 cmp edi, 5 113 je .LBB0_926 114 # %bb.865: 115 cmp edi, 6 116 jne .LBB0_1013 117 # %bb.866: 118 test r9d, r9d 119 jle .LBB0_1013 120 # %bb.867: 121 mov r10d, r9d 122 cmp r9d, 8 123 jae .LBB0_942 124 # %bb.868: 125 xor esi, esi 126 .LBB0_951: 127 mov r9, rsi 128 not r9 129 add r9, r10 130 mov rdi, r10 131 and rdi, 3 132 je .LBB0_953 133 .LBB0_952: # =>This Inner Loop Header: Depth=1 134 mov eax, dword ptr [rcx + 4*rsi] 135 imul eax, dword ptr [rdx + 4*rsi] 136 mov dword ptr [r8 + 4*rsi], eax 137 add rsi, 1 138 add rdi, -1 139 jne .LBB0_952 140 .LBB0_953: 141 cmp r9, 3 142 jb .LBB0_1013 143 .LBB0_954: # =>This Inner Loop Header: Depth=1 144 mov eax, dword ptr [rcx + 4*rsi] 145 imul eax, dword ptr [rdx + 4*rsi] 146 mov dword ptr [r8 + 4*rsi], eax 147 mov eax, dword ptr [rcx + 4*rsi + 4] 148 imul eax, dword ptr [rdx + 4*rsi + 4] 149 mov dword ptr [r8 + 4*rsi + 4], eax 150 mov eax, dword ptr [rcx + 4*rsi + 8] 151 imul eax, dword ptr [rdx + 4*rsi + 8] 152 mov dword ptr [r8 + 4*rsi + 8], eax 153 mov eax, dword ptr [rcx + 4*rsi + 12] 154 imul eax, dword ptr [rdx + 4*rsi + 12] 155 mov dword ptr [r8 + 4*rsi + 12], eax 156 add rsi, 4 157 cmp r10, rsi 158 jne .LBB0_954 159 jmp .LBB0_1013 160 .LBB0_21: 161 cmp edi, 6 162 jg .LBB0_34 163 # %bb.22: 164 cmp edi, 3 165 jle .LBB0_23 166 # %bb.28: 167 cmp edi, 4 168 je .LBB0_75 169 # %bb.29: 170 cmp edi, 5 171 je .LBB0_91 172 # %bb.30: 173 cmp edi, 6 174 jne .LBB0_1013 175 # %bb.31: 176 test r9d, r9d 177 jle .LBB0_1013 178 # %bb.32: 179 mov r10d, r9d 180 cmp r9d, 8 181 jae .LBB0_107 182 # %bb.33: 183 xor esi, esi 184 .LBB0_116: 
185 mov r9, rsi 186 not r9 187 add r9, r10 188 mov rdi, r10 189 and rdi, 3 190 je .LBB0_118 191 .LBB0_117: # =>This Inner Loop Header: Depth=1 192 mov eax, dword ptr [rcx + 4*rsi] 193 add eax, dword ptr [rdx + 4*rsi] 194 mov dword ptr [r8 + 4*rsi], eax 195 add rsi, 1 196 add rdi, -1 197 jne .LBB0_117 198 .LBB0_118: 199 cmp r9, 3 200 jb .LBB0_1013 201 .LBB0_119: # =>This Inner Loop Header: Depth=1 202 mov eax, dword ptr [rcx + 4*rsi] 203 add eax, dword ptr [rdx + 4*rsi] 204 mov dword ptr [r8 + 4*rsi], eax 205 mov eax, dword ptr [rcx + 4*rsi + 4] 206 add eax, dword ptr [rdx + 4*rsi + 4] 207 mov dword ptr [r8 + 4*rsi + 4], eax 208 mov eax, dword ptr [rcx + 4*rsi + 8] 209 add eax, dword ptr [rdx + 4*rsi + 8] 210 mov dword ptr [r8 + 4*rsi + 8], eax 211 mov eax, dword ptr [rcx + 4*rsi + 12] 212 add eax, dword ptr [rdx + 4*rsi + 12] 213 mov dword ptr [r8 + 4*rsi + 12], eax 214 add rsi, 4 215 cmp r10, rsi 216 jne .LBB0_119 217 jmp .LBB0_1013 218 .LBB0_367: 219 cmp edi, 6 220 jg .LBB0_380 221 # %bb.368: 222 cmp edi, 3 223 jle .LBB0_369 224 # %bb.374: 225 cmp edi, 4 226 je .LBB0_421 227 # %bb.375: 228 cmp edi, 5 229 je .LBB0_437 230 # %bb.376: 231 cmp edi, 6 232 jne .LBB0_1013 233 # %bb.377: 234 test r9d, r9d 235 jle .LBB0_1013 236 # %bb.378: 237 mov r10d, r9d 238 cmp r9d, 8 239 jae .LBB0_453 240 # %bb.379: 241 xor esi, esi 242 .LBB0_462: 243 mov r9, rsi 244 not r9 245 add r9, r10 246 mov rdi, r10 247 and rdi, 3 248 je .LBB0_464 249 .LBB0_463: # =>This Inner Loop Header: Depth=1 250 mov eax, dword ptr [rdx + 4*rsi] 251 sub eax, dword ptr [rcx + 4*rsi] 252 mov dword ptr [r8 + 4*rsi], eax 253 add rsi, 1 254 add rdi, -1 255 jne .LBB0_463 256 .LBB0_464: 257 cmp r9, 3 258 jb .LBB0_1013 259 .LBB0_465: # =>This Inner Loop Header: Depth=1 260 mov eax, dword ptr [rdx + 4*rsi] 261 sub eax, dword ptr [rcx + 4*rsi] 262 mov dword ptr [r8 + 4*rsi], eax 263 mov eax, dword ptr [rdx + 4*rsi + 4] 264 sub eax, dword ptr [rcx + 4*rsi + 4] 265 mov dword ptr [r8 + 4*rsi + 4], eax 266 mov eax, 
dword ptr [rdx + 4*rsi + 8] 267 sub eax, dword ptr [rcx + 4*rsi + 8] 268 mov dword ptr [r8 + 4*rsi + 8], eax 269 mov eax, dword ptr [rdx + 4*rsi + 12] 270 sub eax, dword ptr [rcx + 4*rsi + 12] 271 mov dword ptr [r8 + 4*rsi + 12], eax 272 add rsi, 4 273 cmp r10, rsi 274 jne .LBB0_465 275 jmp .LBB0_1013 276 .LBB0_194: 277 cmp edi, 6 278 jg .LBB0_207 279 # %bb.195: 280 cmp edi, 3 281 jle .LBB0_196 282 # %bb.201: 283 cmp edi, 4 284 je .LBB0_248 285 # %bb.202: 286 cmp edi, 5 287 je .LBB0_264 288 # %bb.203: 289 cmp edi, 6 290 jne .LBB0_1013 291 # %bb.204: 292 test r9d, r9d 293 jle .LBB0_1013 294 # %bb.205: 295 mov r10d, r9d 296 cmp r9d, 8 297 jae .LBB0_280 298 # %bb.206: 299 xor esi, esi 300 .LBB0_289: 301 mov r9, rsi 302 not r9 303 add r9, r10 304 mov rdi, r10 305 and rdi, 3 306 je .LBB0_291 307 .LBB0_290: # =>This Inner Loop Header: Depth=1 308 mov eax, dword ptr [rcx + 4*rsi] 309 add eax, dword ptr [rdx + 4*rsi] 310 mov dword ptr [r8 + 4*rsi], eax 311 add rsi, 1 312 add rdi, -1 313 jne .LBB0_290 314 .LBB0_291: 315 cmp r9, 3 316 jb .LBB0_1013 317 .LBB0_292: # =>This Inner Loop Header: Depth=1 318 mov eax, dword ptr [rcx + 4*rsi] 319 add eax, dword ptr [rdx + 4*rsi] 320 mov dword ptr [r8 + 4*rsi], eax 321 mov eax, dword ptr [rcx + 4*rsi + 4] 322 add eax, dword ptr [rdx + 4*rsi + 4] 323 mov dword ptr [r8 + 4*rsi + 4], eax 324 mov eax, dword ptr [rcx + 4*rsi + 8] 325 add eax, dword ptr [rdx + 4*rsi + 8] 326 mov dword ptr [r8 + 4*rsi + 8], eax 327 mov eax, dword ptr [rcx + 4*rsi + 12] 328 add eax, dword ptr [rdx + 4*rsi + 12] 329 mov dword ptr [r8 + 4*rsi + 12], eax 330 add rsi, 4 331 cmp r10, rsi 332 jne .LBB0_292 333 jmp .LBB0_1013 334 .LBB0_540: 335 cmp edi, 6 336 jg .LBB0_553 337 # %bb.541: 338 cmp edi, 3 339 jle .LBB0_542 340 # %bb.547: 341 cmp edi, 4 342 je .LBB0_594 343 # %bb.548: 344 cmp edi, 5 345 je .LBB0_610 346 # %bb.549: 347 cmp edi, 6 348 jne .LBB0_1013 349 # %bb.550: 350 test r9d, r9d 351 jle .LBB0_1013 352 # %bb.551: 353 mov r10d, r9d 354 cmp r9d, 8 355 jae 
.LBB0_626 356 # %bb.552: 357 xor esi, esi 358 .LBB0_635: 359 mov r9, rsi 360 not r9 361 add r9, r10 362 mov rdi, r10 363 and rdi, 3 364 je .LBB0_637 365 .LBB0_636: # =>This Inner Loop Header: Depth=1 366 mov eax, dword ptr [rdx + 4*rsi] 367 sub eax, dword ptr [rcx + 4*rsi] 368 mov dword ptr [r8 + 4*rsi], eax 369 add rsi, 1 370 add rdi, -1 371 jne .LBB0_636 372 .LBB0_637: 373 cmp r9, 3 374 jb .LBB0_1013 375 .LBB0_638: # =>This Inner Loop Header: Depth=1 376 mov eax, dword ptr [rdx + 4*rsi] 377 sub eax, dword ptr [rcx + 4*rsi] 378 mov dword ptr [r8 + 4*rsi], eax 379 mov eax, dword ptr [rdx + 4*rsi + 4] 380 sub eax, dword ptr [rcx + 4*rsi + 4] 381 mov dword ptr [r8 + 4*rsi + 4], eax 382 mov eax, dword ptr [rdx + 4*rsi + 8] 383 sub eax, dword ptr [rcx + 4*rsi + 8] 384 mov dword ptr [r8 + 4*rsi + 8], eax 385 mov eax, dword ptr [rdx + 4*rsi + 12] 386 sub eax, dword ptr [rcx + 4*rsi + 12] 387 mov dword ptr [r8 + 4*rsi + 12], eax 388 add rsi, 4 389 cmp r10, rsi 390 jne .LBB0_638 391 jmp .LBB0_1013 392 .LBB0_719: 393 cmp edi, 8 394 jle .LBB0_720 395 # %bb.725: 396 cmp edi, 9 397 je .LBB0_826 398 # %bb.726: 399 cmp edi, 11 400 je .LBB0_834 401 # %bb.727: 402 cmp edi, 12 403 jne .LBB0_1013 404 # %bb.728: 405 test r9d, r9d 406 jle .LBB0_1013 407 # %bb.729: 408 mov r10d, r9d 409 cmp r9d, 4 410 jae .LBB0_850 411 # %bb.730: 412 xor esi, esi 413 .LBB0_859: 414 mov rax, rsi 415 not rax 416 add rax, r10 417 mov rdi, r10 418 and rdi, 3 419 je .LBB0_861 420 .LBB0_860: # =>This Inner Loop Header: Depth=1 421 movsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero 422 mulsd xmm0, qword ptr [rdx + 8*rsi] 423 movsd qword ptr [r8 + 8*rsi], xmm0 424 add rsi, 1 425 add rdi, -1 426 jne .LBB0_860 427 .LBB0_861: 428 cmp rax, 3 429 jb .LBB0_1013 430 .LBB0_862: # =>This Inner Loop Header: Depth=1 431 movsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero 432 mulsd xmm0, qword ptr [rdx + 8*rsi] 433 movsd qword ptr [r8 + 8*rsi], xmm0 434 movsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = 
mem[0],zero 435 mulsd xmm0, qword ptr [rdx + 8*rsi + 8] 436 movsd qword ptr [r8 + 8*rsi + 8], xmm0 437 movsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero 438 mulsd xmm0, qword ptr [rdx + 8*rsi + 16] 439 movsd qword ptr [r8 + 8*rsi + 16], xmm0 440 movsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero 441 mulsd xmm0, qword ptr [rdx + 8*rsi + 24] 442 movsd qword ptr [r8 + 8*rsi + 24], xmm0 443 add rsi, 4 444 cmp r10, rsi 445 jne .LBB0_862 446 jmp .LBB0_1013 447 .LBB0_869: 448 cmp edi, 8 449 jle .LBB0_870 450 # %bb.875: 451 cmp edi, 9 452 je .LBB0_976 453 # %bb.876: 454 cmp edi, 11 455 je .LBB0_984 456 # %bb.877: 457 cmp edi, 12 458 jne .LBB0_1013 459 # %bb.878: 460 test r9d, r9d 461 jle .LBB0_1013 462 # %bb.879: 463 mov r10d, r9d 464 cmp r9d, 4 465 jae .LBB0_1000 466 # %bb.880: 467 xor esi, esi 468 .LBB0_1009: 469 mov rax, rsi 470 not rax 471 add rax, r10 472 mov rdi, r10 473 and rdi, 3 474 je .LBB0_1011 475 .LBB0_1010: # =>This Inner Loop Header: Depth=1 476 movsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero 477 mulsd xmm0, qword ptr [rdx + 8*rsi] 478 movsd qword ptr [r8 + 8*rsi], xmm0 479 add rsi, 1 480 add rdi, -1 481 jne .LBB0_1010 482 .LBB0_1011: 483 cmp rax, 3 484 jb .LBB0_1013 485 .LBB0_1012: # =>This Inner Loop Header: Depth=1 486 movsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero 487 mulsd xmm0, qword ptr [rdx + 8*rsi] 488 movsd qword ptr [r8 + 8*rsi], xmm0 489 movsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero 490 mulsd xmm0, qword ptr [rdx + 8*rsi + 8] 491 movsd qword ptr [r8 + 8*rsi + 8], xmm0 492 movsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero 493 mulsd xmm0, qword ptr [rdx + 8*rsi + 16] 494 movsd qword ptr [r8 + 8*rsi + 16], xmm0 495 movsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero 496 mulsd xmm0, qword ptr [rdx + 8*rsi + 24] 497 movsd qword ptr [r8 + 8*rsi + 24], xmm0 498 add rsi, 4 499 cmp r10, rsi 500 jne .LBB0_1012 501 jmp .LBB0_1013 502 .LBB0_34: 503 cmp edi, 8 504 jle .LBB0_35 505 
# %bb.40: 506 cmp edi, 9 507 je .LBB0_149 508 # %bb.41: 509 cmp edi, 11 510 je .LBB0_165 511 # %bb.42: 512 cmp edi, 12 513 jne .LBB0_1013 514 # %bb.43: 515 test r9d, r9d 516 jle .LBB0_1013 517 # %bb.44: 518 mov r10d, r9d 519 cmp r9d, 4 520 jae .LBB0_181 521 # %bb.45: 522 xor esi, esi 523 .LBB0_190: 524 mov rax, rsi 525 not rax 526 add rax, r10 527 mov rdi, r10 528 and rdi, 3 529 je .LBB0_192 530 .LBB0_191: # =>This Inner Loop Header: Depth=1 531 movsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero 532 addsd xmm0, qword ptr [rdx + 8*rsi] 533 movsd qword ptr [r8 + 8*rsi], xmm0 534 add rsi, 1 535 add rdi, -1 536 jne .LBB0_191 537 .LBB0_192: 538 cmp rax, 3 539 jb .LBB0_1013 540 .LBB0_193: # =>This Inner Loop Header: Depth=1 541 movsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero 542 addsd xmm0, qword ptr [rdx + 8*rsi] 543 movsd qword ptr [r8 + 8*rsi], xmm0 544 movsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero 545 addsd xmm0, qword ptr [rdx + 8*rsi + 8] 546 movsd qword ptr [r8 + 8*rsi + 8], xmm0 547 movsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero 548 addsd xmm0, qword ptr [rdx + 8*rsi + 16] 549 movsd qword ptr [r8 + 8*rsi + 16], xmm0 550 movsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero 551 addsd xmm0, qword ptr [rdx + 8*rsi + 24] 552 movsd qword ptr [r8 + 8*rsi + 24], xmm0 553 add rsi, 4 554 cmp r10, rsi 555 jne .LBB0_193 556 jmp .LBB0_1013 557 .LBB0_380: 558 cmp edi, 8 559 jle .LBB0_381 560 # %bb.386: 561 cmp edi, 9 562 je .LBB0_495 563 # %bb.387: 564 cmp edi, 11 565 je .LBB0_511 566 # %bb.388: 567 cmp edi, 12 568 jne .LBB0_1013 569 # %bb.389: 570 test r9d, r9d 571 jle .LBB0_1013 572 # %bb.390: 573 mov r10d, r9d 574 cmp r9d, 4 575 jae .LBB0_527 576 # %bb.391: 577 xor esi, esi 578 .LBB0_536: 579 mov rax, rsi 580 not rax 581 add rax, r10 582 mov rdi, r10 583 and rdi, 3 584 je .LBB0_538 585 .LBB0_537: # =>This Inner Loop Header: Depth=1 586 movsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero 587 subsd xmm0, qword ptr 
[rcx + 8*rsi] 588 movsd qword ptr [r8 + 8*rsi], xmm0 589 add rsi, 1 590 add rdi, -1 591 jne .LBB0_537 592 .LBB0_538: 593 cmp rax, 3 594 jb .LBB0_1013 595 .LBB0_539: # =>This Inner Loop Header: Depth=1 596 movsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero 597 subsd xmm0, qword ptr [rcx + 8*rsi] 598 movsd qword ptr [r8 + 8*rsi], xmm0 599 movsd xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero 600 subsd xmm0, qword ptr [rcx + 8*rsi + 8] 601 movsd qword ptr [r8 + 8*rsi + 8], xmm0 602 movsd xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero 603 subsd xmm0, qword ptr [rcx + 8*rsi + 16] 604 movsd qword ptr [r8 + 8*rsi + 16], xmm0 605 movsd xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero 606 subsd xmm0, qword ptr [rcx + 8*rsi + 24] 607 movsd qword ptr [r8 + 8*rsi + 24], xmm0 608 add rsi, 4 609 cmp r10, rsi 610 jne .LBB0_539 611 jmp .LBB0_1013 612 .LBB0_207: 613 cmp edi, 8 614 jle .LBB0_208 615 # %bb.213: 616 cmp edi, 9 617 je .LBB0_322 618 # %bb.214: 619 cmp edi, 11 620 je .LBB0_338 621 # %bb.215: 622 cmp edi, 12 623 jne .LBB0_1013 624 # %bb.216: 625 test r9d, r9d 626 jle .LBB0_1013 627 # %bb.217: 628 mov r10d, r9d 629 cmp r9d, 4 630 jae .LBB0_354 631 # %bb.218: 632 xor esi, esi 633 .LBB0_363: 634 mov rax, rsi 635 not rax 636 add rax, r10 637 mov rdi, r10 638 and rdi, 3 639 je .LBB0_365 640 .LBB0_364: # =>This Inner Loop Header: Depth=1 641 movsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero 642 addsd xmm0, qword ptr [rdx + 8*rsi] 643 movsd qword ptr [r8 + 8*rsi], xmm0 644 add rsi, 1 645 add rdi, -1 646 jne .LBB0_364 647 .LBB0_365: 648 cmp rax, 3 649 jb .LBB0_1013 650 .LBB0_366: # =>This Inner Loop Header: Depth=1 651 movsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero 652 addsd xmm0, qword ptr [rdx + 8*rsi] 653 movsd qword ptr [r8 + 8*rsi], xmm0 654 movsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero 655 addsd xmm0, qword ptr [rdx + 8*rsi + 8] 656 movsd qword ptr [r8 + 8*rsi + 8], xmm0 657 movsd xmm0, qword ptr [rcx + 8*rsi + 
16] # xmm0 = mem[0],zero 658 addsd xmm0, qword ptr [rdx + 8*rsi + 16] 659 movsd qword ptr [r8 + 8*rsi + 16], xmm0 660 movsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero 661 addsd xmm0, qword ptr [rdx + 8*rsi + 24] 662 movsd qword ptr [r8 + 8*rsi + 24], xmm0 663 add rsi, 4 664 cmp r10, rsi 665 jne .LBB0_366 666 jmp .LBB0_1013 667 .LBB0_553: 668 cmp edi, 8 669 jle .LBB0_554 670 # %bb.559: 671 cmp edi, 9 672 je .LBB0_668 673 # %bb.560: 674 cmp edi, 11 675 je .LBB0_684 676 # %bb.561: 677 cmp edi, 12 678 jne .LBB0_1013 679 # %bb.562: 680 test r9d, r9d 681 jle .LBB0_1013 682 # %bb.563: 683 mov r10d, r9d 684 cmp r9d, 4 685 jae .LBB0_700 686 # %bb.564: 687 xor esi, esi 688 .LBB0_709: 689 mov rax, rsi 690 not rax 691 add rax, r10 692 mov rdi, r10 693 and rdi, 3 694 je .LBB0_711 695 .LBB0_710: # =>This Inner Loop Header: Depth=1 696 movsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero 697 subsd xmm0, qword ptr [rcx + 8*rsi] 698 movsd qword ptr [r8 + 8*rsi], xmm0 699 add rsi, 1 700 add rdi, -1 701 jne .LBB0_710 702 .LBB0_711: 703 cmp rax, 3 704 jb .LBB0_1013 705 .LBB0_712: # =>This Inner Loop Header: Depth=1 706 movsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero 707 subsd xmm0, qword ptr [rcx + 8*rsi] 708 movsd qword ptr [r8 + 8*rsi], xmm0 709 movsd xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero 710 subsd xmm0, qword ptr [rcx + 8*rsi + 8] 711 movsd qword ptr [r8 + 8*rsi + 8], xmm0 712 movsd xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero 713 subsd xmm0, qword ptr [rcx + 8*rsi + 16] 714 movsd qword ptr [r8 + 8*rsi + 16], xmm0 715 movsd xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero 716 subsd xmm0, qword ptr [rcx + 8*rsi + 24] 717 movsd qword ptr [r8 + 8*rsi + 24], xmm0 718 add rsi, 4 719 cmp r10, rsi 720 jne .LBB0_712 721 jmp .LBB0_1013 722 .LBB0_6: 723 cmp edi, 2 724 je .LBB0_731 725 # %bb.7: 726 cmp edi, 3 727 jne .LBB0_1013 728 # %bb.8: 729 test r9d, r9d 730 jle .LBB0_1013 731 # %bb.9: 732 mov r10d, r9d 733 cmp r9d, 32 734 jae 
.LBB0_747 735 # %bb.10: 736 xor edi, edi 737 .LBB0_756: 738 mov r9, rdi 739 not r9 740 add r9, r10 741 mov rsi, r10 742 and rsi, 3 743 je .LBB0_758 744 .LBB0_757: # =>This Inner Loop Header: Depth=1 745 movzx eax, byte ptr [rcx + rdi] 746 mul byte ptr [rdx + rdi] 747 mov byte ptr [r8 + rdi], al 748 add rdi, 1 749 add rsi, -1 750 jne .LBB0_757 751 .LBB0_758: 752 cmp r9, 3 753 jb .LBB0_1013 754 .LBB0_759: # =>This Inner Loop Header: Depth=1 755 movzx eax, byte ptr [rcx + rdi] 756 mul byte ptr [rdx + rdi] 757 mov byte ptr [r8 + rdi], al 758 movzx eax, byte ptr [rcx + rdi + 1] 759 mul byte ptr [rdx + rdi + 1] 760 mov byte ptr [r8 + rdi + 1], al 761 movzx eax, byte ptr [rcx + rdi + 2] 762 mul byte ptr [rdx + rdi + 2] 763 mov byte ptr [r8 + rdi + 2], al 764 movzx eax, byte ptr [rcx + rdi + 3] 765 mul byte ptr [rdx + rdi + 3] 766 mov byte ptr [r8 + rdi + 3], al 767 add rdi, 4 768 cmp r10, rdi 769 jne .LBB0_759 770 jmp .LBB0_1013 771 .LBB0_16: 772 cmp edi, 2 773 je .LBB0_881 774 # %bb.17: 775 cmp edi, 3 776 jne .LBB0_1013 777 # %bb.18: 778 test r9d, r9d 779 jle .LBB0_1013 780 # %bb.19: 781 mov r10d, r9d 782 cmp r9d, 32 783 jae .LBB0_897 784 # %bb.20: 785 xor edi, edi 786 .LBB0_906: 787 mov r9, rdi 788 not r9 789 add r9, r10 790 mov rsi, r10 791 and rsi, 3 792 je .LBB0_908 793 .LBB0_907: # =>This Inner Loop Header: Depth=1 794 movzx eax, byte ptr [rcx + rdi] 795 mul byte ptr [rdx + rdi] 796 mov byte ptr [r8 + rdi], al 797 add rdi, 1 798 add rsi, -1 799 jne .LBB0_907 800 .LBB0_908: 801 cmp r9, 3 802 jb .LBB0_1013 803 .LBB0_909: # =>This Inner Loop Header: Depth=1 804 movzx eax, byte ptr [rcx + rdi] 805 mul byte ptr [rdx + rdi] 806 mov byte ptr [r8 + rdi], al 807 movzx eax, byte ptr [rcx + rdi + 1] 808 mul byte ptr [rdx + rdi + 1] 809 mov byte ptr [r8 + rdi + 1], al 810 movzx eax, byte ptr [rcx + rdi + 2] 811 mul byte ptr [rdx + rdi + 2] 812 mov byte ptr [r8 + rdi + 2], al 813 movzx eax, byte ptr [rcx + rdi + 3] 814 mul byte ptr [rdx + rdi + 3] 815 mov byte ptr [r8 + rdi + 
3], al 816 add rdi, 4 817 cmp r10, rdi 818 jne .LBB0_909 819 jmp .LBB0_1013 820 .LBB0_23: 821 cmp edi, 2 822 je .LBB0_46 823 # %bb.24: 824 cmp edi, 3 825 jne .LBB0_1013 826 # %bb.25: 827 test r9d, r9d 828 jle .LBB0_1013 829 # %bb.26: 830 mov r10d, r9d 831 cmp r9d, 32 832 jae .LBB0_62 833 # %bb.27: 834 xor esi, esi 835 .LBB0_71: 836 mov r9, rsi 837 not r9 838 add r9, r10 839 mov rdi, r10 840 and rdi, 3 841 je .LBB0_73 842 .LBB0_72: # =>This Inner Loop Header: Depth=1 843 movzx eax, byte ptr [rcx + rsi] 844 add al, byte ptr [rdx + rsi] 845 mov byte ptr [r8 + rsi], al 846 add rsi, 1 847 add rdi, -1 848 jne .LBB0_72 849 .LBB0_73: 850 cmp r9, 3 851 jb .LBB0_1013 852 .LBB0_74: # =>This Inner Loop Header: Depth=1 853 movzx eax, byte ptr [rcx + rsi] 854 add al, byte ptr [rdx + rsi] 855 mov byte ptr [r8 + rsi], al 856 movzx eax, byte ptr [rcx + rsi + 1] 857 add al, byte ptr [rdx + rsi + 1] 858 mov byte ptr [r8 + rsi + 1], al 859 movzx eax, byte ptr [rcx + rsi + 2] 860 add al, byte ptr [rdx + rsi + 2] 861 mov byte ptr [r8 + rsi + 2], al 862 movzx eax, byte ptr [rcx + rsi + 3] 863 add al, byte ptr [rdx + rsi + 3] 864 mov byte ptr [r8 + rsi + 3], al 865 add rsi, 4 866 cmp r10, rsi 867 jne .LBB0_74 868 jmp .LBB0_1013 869 .LBB0_369: 870 cmp edi, 2 871 je .LBB0_392 872 # %bb.370: 873 cmp edi, 3 874 jne .LBB0_1013 875 # %bb.371: 876 test r9d, r9d 877 jle .LBB0_1013 878 # %bb.372: 879 mov r10d, r9d 880 cmp r9d, 32 881 jae .LBB0_408 882 # %bb.373: 883 xor esi, esi 884 .LBB0_417: 885 mov r9, rsi 886 not r9 887 add r9, r10 888 mov rdi, r10 889 and rdi, 3 890 je .LBB0_419 891 .LBB0_418: # =>This Inner Loop Header: Depth=1 892 movzx eax, byte ptr [rdx + rsi] 893 sub al, byte ptr [rcx + rsi] 894 mov byte ptr [r8 + rsi], al 895 add rsi, 1 896 add rdi, -1 897 jne .LBB0_418 898 .LBB0_419: 899 cmp r9, 3 900 jb .LBB0_1013 901 .LBB0_420: # =>This Inner Loop Header: Depth=1 902 movzx eax, byte ptr [rdx + rsi] 903 sub al, byte ptr [rcx + rsi] 904 mov byte ptr [r8 + rsi], al 905 movzx eax, byte 
ptr [rdx + rsi + 1] 906 sub al, byte ptr [rcx + rsi + 1] 907 mov byte ptr [r8 + rsi + 1], al 908 movzx eax, byte ptr [rdx + rsi + 2] 909 sub al, byte ptr [rcx + rsi + 2] 910 mov byte ptr [r8 + rsi + 2], al 911 movzx eax, byte ptr [rdx + rsi + 3] 912 sub al, byte ptr [rcx + rsi + 3] 913 mov byte ptr [r8 + rsi + 3], al 914 add rsi, 4 915 cmp r10, rsi 916 jne .LBB0_420 917 jmp .LBB0_1013 918 .LBB0_196: 919 cmp edi, 2 920 je .LBB0_219 921 # %bb.197: 922 cmp edi, 3 923 jne .LBB0_1013 924 # %bb.198: 925 test r9d, r9d 926 jle .LBB0_1013 927 # %bb.199: 928 mov r10d, r9d 929 cmp r9d, 32 930 jae .LBB0_235 931 # %bb.200: 932 xor esi, esi 933 .LBB0_244: 934 mov r9, rsi 935 not r9 936 add r9, r10 937 mov rdi, r10 938 and rdi, 3 939 je .LBB0_246 940 .LBB0_245: # =>This Inner Loop Header: Depth=1 941 movzx eax, byte ptr [rcx + rsi] 942 add al, byte ptr [rdx + rsi] 943 mov byte ptr [r8 + rsi], al 944 add rsi, 1 945 add rdi, -1 946 jne .LBB0_245 947 .LBB0_246: 948 cmp r9, 3 949 jb .LBB0_1013 950 .LBB0_247: # =>This Inner Loop Header: Depth=1 951 movzx eax, byte ptr [rcx + rsi] 952 add al, byte ptr [rdx + rsi] 953 mov byte ptr [r8 + rsi], al 954 movzx eax, byte ptr [rcx + rsi + 1] 955 add al, byte ptr [rdx + rsi + 1] 956 mov byte ptr [r8 + rsi + 1], al 957 movzx eax, byte ptr [rcx + rsi + 2] 958 add al, byte ptr [rdx + rsi + 2] 959 mov byte ptr [r8 + rsi + 2], al 960 movzx eax, byte ptr [rcx + rsi + 3] 961 add al, byte ptr [rdx + rsi + 3] 962 mov byte ptr [r8 + rsi + 3], al 963 add rsi, 4 964 cmp r10, rsi 965 jne .LBB0_247 966 jmp .LBB0_1013 967 .LBB0_542: 968 cmp edi, 2 969 je .LBB0_565 970 # %bb.543: 971 cmp edi, 3 972 jne .LBB0_1013 973 # %bb.544: 974 test r9d, r9d 975 jle .LBB0_1013 976 # %bb.545: 977 mov r10d, r9d 978 cmp r9d, 32 979 jae .LBB0_581 980 # %bb.546: 981 xor esi, esi 982 .LBB0_590: 983 mov r9, rsi 984 not r9 985 add r9, r10 986 mov rdi, r10 987 and rdi, 3 988 je .LBB0_592 989 .LBB0_591: # =>This Inner Loop Header: Depth=1 990 movzx eax, byte ptr [rdx + rsi] 991 sub 
al, byte ptr [rcx + rsi] 992 mov byte ptr [r8 + rsi], al 993 add rsi, 1 994 add rdi, -1 995 jne .LBB0_591 996 .LBB0_592: 997 cmp r9, 3 998 jb .LBB0_1013 999 .LBB0_593: # =>This Inner Loop Header: Depth=1 1000 movzx eax, byte ptr [rdx + rsi] 1001 sub al, byte ptr [rcx + rsi] 1002 mov byte ptr [r8 + rsi], al 1003 movzx eax, byte ptr [rdx + rsi + 1] 1004 sub al, byte ptr [rcx + rsi + 1] 1005 mov byte ptr [r8 + rsi + 1], al 1006 movzx eax, byte ptr [rdx + rsi + 2] 1007 sub al, byte ptr [rcx + rsi + 2] 1008 mov byte ptr [r8 + rsi + 2], al 1009 movzx eax, byte ptr [rdx + rsi + 3] 1010 sub al, byte ptr [rcx + rsi + 3] 1011 mov byte ptr [r8 + rsi + 3], al 1012 add rsi, 4 1013 cmp r10, rsi 1014 jne .LBB0_593 1015 jmp .LBB0_1013 1016 .LBB0_720: 1017 cmp edi, 7 1018 je .LBB0_805 1019 # %bb.721: 1020 cmp edi, 8 1021 jne .LBB0_1013 1022 # %bb.722: 1023 test r9d, r9d 1024 jle .LBB0_1013 1025 # %bb.723: 1026 mov esi, r9d 1027 lea rdi, [rsi - 1] 1028 mov r9d, esi 1029 and r9d, 3 1030 cmp rdi, 3 1031 jae .LBB0_821 1032 # %bb.724: 1033 xor edi, edi 1034 jmp .LBB0_823 1035 .LBB0_870: 1036 cmp edi, 7 1037 je .LBB0_955 1038 # %bb.871: 1039 cmp edi, 8 1040 jne .LBB0_1013 1041 # %bb.872: 1042 test r9d, r9d 1043 jle .LBB0_1013 1044 # %bb.873: 1045 mov esi, r9d 1046 lea rdi, [rsi - 1] 1047 mov r9d, esi 1048 and r9d, 3 1049 cmp rdi, 3 1050 jae .LBB0_971 1051 # %bb.874: 1052 xor edi, edi 1053 jmp .LBB0_973 1054 .LBB0_35: 1055 cmp edi, 7 1056 je .LBB0_120 1057 # %bb.36: 1058 cmp edi, 8 1059 jne .LBB0_1013 1060 # %bb.37: 1061 test r9d, r9d 1062 jle .LBB0_1013 1063 # %bb.38: 1064 mov r10d, r9d 1065 cmp r9d, 4 1066 jae .LBB0_136 1067 # %bb.39: 1068 xor esi, esi 1069 .LBB0_145: 1070 mov r9, rsi 1071 not r9 1072 add r9, r10 1073 mov rdi, r10 1074 and rdi, 3 1075 je .LBB0_147 1076 .LBB0_146: # =>This Inner Loop Header: Depth=1 1077 mov rax, qword ptr [rcx + 8*rsi] 1078 add rax, qword ptr [rdx + 8*rsi] 1079 mov qword ptr [r8 + 8*rsi], rax 1080 add rsi, 1 1081 add rdi, -1 1082 jne .LBB0_146 1083 
.LBB0_147: 1084 cmp r9, 3 1085 jb .LBB0_1013 1086 .LBB0_148: # =>This Inner Loop Header: Depth=1 1087 mov rax, qword ptr [rcx + 8*rsi] 1088 add rax, qword ptr [rdx + 8*rsi] 1089 mov qword ptr [r8 + 8*rsi], rax 1090 mov rax, qword ptr [rcx + 8*rsi + 8] 1091 add rax, qword ptr [rdx + 8*rsi + 8] 1092 mov qword ptr [r8 + 8*rsi + 8], rax 1093 mov rax, qword ptr [rcx + 8*rsi + 16] 1094 add rax, qword ptr [rdx + 8*rsi + 16] 1095 mov qword ptr [r8 + 8*rsi + 16], rax 1096 mov rax, qword ptr [rcx + 8*rsi + 24] 1097 add rax, qword ptr [rdx + 8*rsi + 24] 1098 mov qword ptr [r8 + 8*rsi + 24], rax 1099 add rsi, 4 1100 cmp r10, rsi 1101 jne .LBB0_148 1102 jmp .LBB0_1013 1103 .LBB0_381: 1104 cmp edi, 7 1105 je .LBB0_466 1106 # %bb.382: 1107 cmp edi, 8 1108 jne .LBB0_1013 1109 # %bb.383: 1110 test r9d, r9d 1111 jle .LBB0_1013 1112 # %bb.384: 1113 mov r10d, r9d 1114 cmp r9d, 4 1115 jae .LBB0_482 1116 # %bb.385: 1117 xor esi, esi 1118 .LBB0_491: 1119 mov r9, rsi 1120 not r9 1121 add r9, r10 1122 mov rdi, r10 1123 and rdi, 3 1124 je .LBB0_493 1125 .LBB0_492: # =>This Inner Loop Header: Depth=1 1126 mov rax, qword ptr [rdx + 8*rsi] 1127 sub rax, qword ptr [rcx + 8*rsi] 1128 mov qword ptr [r8 + 8*rsi], rax 1129 add rsi, 1 1130 add rdi, -1 1131 jne .LBB0_492 1132 .LBB0_493: 1133 cmp r9, 3 1134 jb .LBB0_1013 1135 .LBB0_494: # =>This Inner Loop Header: Depth=1 1136 mov rax, qword ptr [rdx + 8*rsi] 1137 sub rax, qword ptr [rcx + 8*rsi] 1138 mov qword ptr [r8 + 8*rsi], rax 1139 mov rax, qword ptr [rdx + 8*rsi + 8] 1140 sub rax, qword ptr [rcx + 8*rsi + 8] 1141 mov qword ptr [r8 + 8*rsi + 8], rax 1142 mov rax, qword ptr [rdx + 8*rsi + 16] 1143 sub rax, qword ptr [rcx + 8*rsi + 16] 1144 mov qword ptr [r8 + 8*rsi + 16], rax 1145 mov rax, qword ptr [rdx + 8*rsi + 24] 1146 sub rax, qword ptr [rcx + 8*rsi + 24] 1147 mov qword ptr [r8 + 8*rsi + 24], rax 1148 add rsi, 4 1149 cmp r10, rsi 1150 jne .LBB0_494 1151 jmp .LBB0_1013 1152 .LBB0_208: 1153 cmp edi, 7 1154 je .LBB0_293 1155 # %bb.209: 1156 cmp 
edi, 8 1157 jne .LBB0_1013 1158 # %bb.210: 1159 test r9d, r9d 1160 jle .LBB0_1013 1161 # %bb.211: 1162 mov r10d, r9d 1163 cmp r9d, 4 1164 jae .LBB0_309 1165 # %bb.212: 1166 xor esi, esi 1167 .LBB0_318: 1168 mov r9, rsi 1169 not r9 1170 add r9, r10 1171 mov rdi, r10 1172 and rdi, 3 1173 je .LBB0_320 1174 .LBB0_319: # =>This Inner Loop Header: Depth=1 1175 mov rax, qword ptr [rcx + 8*rsi] 1176 add rax, qword ptr [rdx + 8*rsi] 1177 mov qword ptr [r8 + 8*rsi], rax 1178 add rsi, 1 1179 add rdi, -1 1180 jne .LBB0_319 1181 .LBB0_320: 1182 cmp r9, 3 1183 jb .LBB0_1013 1184 .LBB0_321: # =>This Inner Loop Header: Depth=1 1185 mov rax, qword ptr [rcx + 8*rsi] 1186 add rax, qword ptr [rdx + 8*rsi] 1187 mov qword ptr [r8 + 8*rsi], rax 1188 mov rax, qword ptr [rcx + 8*rsi + 8] 1189 add rax, qword ptr [rdx + 8*rsi + 8] 1190 mov qword ptr [r8 + 8*rsi + 8], rax 1191 mov rax, qword ptr [rcx + 8*rsi + 16] 1192 add rax, qword ptr [rdx + 8*rsi + 16] 1193 mov qword ptr [r8 + 8*rsi + 16], rax 1194 mov rax, qword ptr [rcx + 8*rsi + 24] 1195 add rax, qword ptr [rdx + 8*rsi + 24] 1196 mov qword ptr [r8 + 8*rsi + 24], rax 1197 add rsi, 4 1198 cmp r10, rsi 1199 jne .LBB0_321 1200 jmp .LBB0_1013 1201 .LBB0_554: 1202 cmp edi, 7 1203 je .LBB0_639 1204 # %bb.555: 1205 cmp edi, 8 1206 jne .LBB0_1013 1207 # %bb.556: 1208 test r9d, r9d 1209 jle .LBB0_1013 1210 # %bb.557: 1211 mov r10d, r9d 1212 cmp r9d, 4 1213 jae .LBB0_655 1214 # %bb.558: 1215 xor esi, esi 1216 .LBB0_664: 1217 mov r9, rsi 1218 not r9 1219 add r9, r10 1220 mov rdi, r10 1221 and rdi, 3 1222 je .LBB0_666 1223 .LBB0_665: # =>This Inner Loop Header: Depth=1 1224 mov rax, qword ptr [rdx + 8*rsi] 1225 sub rax, qword ptr [rcx + 8*rsi] 1226 mov qword ptr [r8 + 8*rsi], rax 1227 add rsi, 1 1228 add rdi, -1 1229 jne .LBB0_665 1230 .LBB0_666: 1231 cmp r9, 3 1232 jb .LBB0_1013 1233 .LBB0_667: # =>This Inner Loop Header: Depth=1 1234 mov rax, qword ptr [rdx + 8*rsi] 1235 sub rax, qword ptr [rcx + 8*rsi] 1236 mov qword ptr [r8 + 8*rsi], rax 1237 
mov rax, qword ptr [rdx + 8*rsi + 8] 1238 sub rax, qword ptr [rcx + 8*rsi + 8] 1239 mov qword ptr [r8 + 8*rsi + 8], rax 1240 mov rax, qword ptr [rdx + 8*rsi + 16] 1241 sub rax, qword ptr [rcx + 8*rsi + 16] 1242 mov qword ptr [r8 + 8*rsi + 16], rax 1243 mov rax, qword ptr [rdx + 8*rsi + 24] 1244 sub rax, qword ptr [rcx + 8*rsi + 24] 1245 mov qword ptr [r8 + 8*rsi + 24], rax 1246 add rsi, 4 1247 cmp r10, rsi 1248 jne .LBB0_667 1249 jmp .LBB0_1013 1250 .LBB0_760: 1251 test r9d, r9d 1252 jle .LBB0_1013 1253 # %bb.761: 1254 mov r10d, r9d 1255 cmp r9d, 16 1256 jae .LBB0_763 1257 # %bb.762: 1258 xor esi, esi 1259 .LBB0_772: 1260 mov r9, rsi 1261 not r9 1262 add r9, r10 1263 mov rdi, r10 1264 and rdi, 3 1265 je .LBB0_774 1266 .LBB0_773: # =>This Inner Loop Header: Depth=1 1267 movzx eax, word ptr [rcx + 2*rsi] 1268 imul ax, word ptr [rdx + 2*rsi] 1269 mov word ptr [r8 + 2*rsi], ax 1270 add rsi, 1 1271 add rdi, -1 1272 jne .LBB0_773 1273 .LBB0_774: 1274 cmp r9, 3 1275 jb .LBB0_1013 1276 .LBB0_775: # =>This Inner Loop Header: Depth=1 1277 movzx eax, word ptr [rcx + 2*rsi] 1278 imul ax, word ptr [rdx + 2*rsi] 1279 mov word ptr [r8 + 2*rsi], ax 1280 movzx eax, word ptr [rcx + 2*rsi + 2] 1281 imul ax, word ptr [rdx + 2*rsi + 2] 1282 mov word ptr [r8 + 2*rsi + 2], ax 1283 movzx eax, word ptr [rcx + 2*rsi + 4] 1284 imul ax, word ptr [rdx + 2*rsi + 4] 1285 mov word ptr [r8 + 2*rsi + 4], ax 1286 movzx eax, word ptr [rcx + 2*rsi + 6] 1287 imul ax, word ptr [rdx + 2*rsi + 6] 1288 mov word ptr [r8 + 2*rsi + 6], ax 1289 add rsi, 4 1290 cmp r10, rsi 1291 jne .LBB0_775 1292 jmp .LBB0_1013 1293 .LBB0_776: 1294 test r9d, r9d 1295 jle .LBB0_1013 1296 # %bb.777: 1297 mov r10d, r9d 1298 cmp r9d, 16 1299 jae .LBB0_779 1300 # %bb.778: 1301 xor esi, esi 1302 .LBB0_788: 1303 mov r9, rsi 1304 not r9 1305 add r9, r10 1306 mov rdi, r10 1307 and rdi, 3 1308 je .LBB0_790 1309 .LBB0_789: # =>This Inner Loop Header: Depth=1 1310 movzx eax, word ptr [rcx + 2*rsi] 1311 imul ax, word ptr [rdx + 2*rsi] 1312 
mov word ptr [r8 + 2*rsi], ax 1313 add rsi, 1 1314 add rdi, -1 1315 jne .LBB0_789 1316 .LBB0_790: 1317 cmp r9, 3 1318 jb .LBB0_1013 1319 .LBB0_791: # =>This Inner Loop Header: Depth=1 1320 movzx eax, word ptr [rcx + 2*rsi] 1321 imul ax, word ptr [rdx + 2*rsi] 1322 mov word ptr [r8 + 2*rsi], ax 1323 movzx eax, word ptr [rcx + 2*rsi + 2] 1324 imul ax, word ptr [rdx + 2*rsi + 2] 1325 mov word ptr [r8 + 2*rsi + 2], ax 1326 movzx eax, word ptr [rcx + 2*rsi + 4] 1327 imul ax, word ptr [rdx + 2*rsi + 4] 1328 mov word ptr [r8 + 2*rsi + 4], ax 1329 movzx eax, word ptr [rcx + 2*rsi + 6] 1330 imul ax, word ptr [rdx + 2*rsi + 6] 1331 mov word ptr [r8 + 2*rsi + 6], ax 1332 add rsi, 4 1333 cmp r10, rsi 1334 jne .LBB0_791 1335 jmp .LBB0_1013 1336 .LBB0_910: 1337 test r9d, r9d 1338 jle .LBB0_1013 1339 # %bb.911: 1340 mov r10d, r9d 1341 cmp r9d, 16 1342 jae .LBB0_913 1343 # %bb.912: 1344 xor esi, esi 1345 .LBB0_922: 1346 mov r9, rsi 1347 not r9 1348 add r9, r10 1349 mov rdi, r10 1350 and rdi, 3 1351 je .LBB0_924 1352 .LBB0_923: # =>This Inner Loop Header: Depth=1 1353 movzx eax, word ptr [rcx + 2*rsi] 1354 imul ax, word ptr [rdx + 2*rsi] 1355 mov word ptr [r8 + 2*rsi], ax 1356 add rsi, 1 1357 add rdi, -1 1358 jne .LBB0_923 1359 .LBB0_924: 1360 cmp r9, 3 1361 jb .LBB0_1013 1362 .LBB0_925: # =>This Inner Loop Header: Depth=1 1363 movzx eax, word ptr [rcx + 2*rsi] 1364 imul ax, word ptr [rdx + 2*rsi] 1365 mov word ptr [r8 + 2*rsi], ax 1366 movzx eax, word ptr [rcx + 2*rsi + 2] 1367 imul ax, word ptr [rdx + 2*rsi + 2] 1368 mov word ptr [r8 + 2*rsi + 2], ax 1369 movzx eax, word ptr [rcx + 2*rsi + 4] 1370 imul ax, word ptr [rdx + 2*rsi + 4] 1371 mov word ptr [r8 + 2*rsi + 4], ax 1372 movzx eax, word ptr [rcx + 2*rsi + 6] 1373 imul ax, word ptr [rdx + 2*rsi + 6] 1374 mov word ptr [r8 + 2*rsi + 6], ax 1375 add rsi, 4 1376 cmp r10, rsi 1377 jne .LBB0_925 1378 jmp .LBB0_1013 1379 .LBB0_926: 1380 test r9d, r9d 1381 jle .LBB0_1013 1382 # %bb.927: 1383 mov r10d, r9d 1384 cmp r9d, 16 1385 jae 
.LBB0_929 1386 # %bb.928: 1387 xor esi, esi 1388 .LBB0_938: 1389 mov r9, rsi 1390 not r9 1391 add r9, r10 1392 mov rdi, r10 1393 and rdi, 3 1394 je .LBB0_940 1395 .LBB0_939: # =>This Inner Loop Header: Depth=1 1396 movzx eax, word ptr [rcx + 2*rsi] 1397 imul ax, word ptr [rdx + 2*rsi] 1398 mov word ptr [r8 + 2*rsi], ax 1399 add rsi, 1 1400 add rdi, -1 1401 jne .LBB0_939 1402 .LBB0_940: 1403 cmp r9, 3 1404 jb .LBB0_1013 1405 .LBB0_941: # =>This Inner Loop Header: Depth=1 1406 movzx eax, word ptr [rcx + 2*rsi] 1407 imul ax, word ptr [rdx + 2*rsi] 1408 mov word ptr [r8 + 2*rsi], ax 1409 movzx eax, word ptr [rcx + 2*rsi + 2] 1410 imul ax, word ptr [rdx + 2*rsi + 2] 1411 mov word ptr [r8 + 2*rsi + 2], ax 1412 movzx eax, word ptr [rcx + 2*rsi + 4] 1413 imul ax, word ptr [rdx + 2*rsi + 4] 1414 mov word ptr [r8 + 2*rsi + 4], ax 1415 movzx eax, word ptr [rcx + 2*rsi + 6] 1416 imul ax, word ptr [rdx + 2*rsi + 6] 1417 mov word ptr [r8 + 2*rsi + 6], ax 1418 add rsi, 4 1419 cmp r10, rsi 1420 jne .LBB0_941 1421 jmp .LBB0_1013 1422 .LBB0_75: 1423 test r9d, r9d 1424 jle .LBB0_1013 1425 # %bb.76: 1426 mov r10d, r9d 1427 cmp r9d, 16 1428 jae .LBB0_78 1429 # %bb.77: 1430 xor esi, esi 1431 .LBB0_87: 1432 mov r9, rsi 1433 not r9 1434 add r9, r10 1435 mov rdi, r10 1436 and rdi, 3 1437 je .LBB0_89 1438 .LBB0_88: # =>This Inner Loop Header: Depth=1 1439 movzx eax, word ptr [rcx + 2*rsi] 1440 add ax, word ptr [rdx + 2*rsi] 1441 mov word ptr [r8 + 2*rsi], ax 1442 add rsi, 1 1443 add rdi, -1 1444 jne .LBB0_88 1445 .LBB0_89: 1446 cmp r9, 3 1447 jb .LBB0_1013 1448 .LBB0_90: # =>This Inner Loop Header: Depth=1 1449 movzx eax, word ptr [rcx + 2*rsi] 1450 add ax, word ptr [rdx + 2*rsi] 1451 mov word ptr [r8 + 2*rsi], ax 1452 movzx eax, word ptr [rcx + 2*rsi + 2] 1453 add ax, word ptr [rdx + 2*rsi + 2] 1454 mov word ptr [r8 + 2*rsi + 2], ax 1455 movzx eax, word ptr [rcx + 2*rsi + 4] 1456 add ax, word ptr [rdx + 2*rsi + 4] 1457 mov word ptr [r8 + 2*rsi + 4], ax 1458 movzx eax, word ptr [rcx + 2*rsi 
+ 6] 1459 add ax, word ptr [rdx + 2*rsi + 6] 1460 mov word ptr [r8 + 2*rsi + 6], ax 1461 add rsi, 4 1462 cmp r10, rsi 1463 jne .LBB0_90 1464 jmp .LBB0_1013 1465 .LBB0_91: 1466 test r9d, r9d 1467 jle .LBB0_1013 1468 # %bb.92: 1469 mov r10d, r9d 1470 cmp r9d, 16 1471 jae .LBB0_94 1472 # %bb.93: 1473 xor esi, esi 1474 .LBB0_103: 1475 mov r9, rsi 1476 not r9 1477 add r9, r10 1478 mov rdi, r10 1479 and rdi, 3 1480 je .LBB0_105 1481 .LBB0_104: # =>This Inner Loop Header: Depth=1 1482 movzx eax, word ptr [rcx + 2*rsi] 1483 add ax, word ptr [rdx + 2*rsi] 1484 mov word ptr [r8 + 2*rsi], ax 1485 add rsi, 1 1486 add rdi, -1 1487 jne .LBB0_104 1488 .LBB0_105: 1489 cmp r9, 3 1490 jb .LBB0_1013 1491 .LBB0_106: # =>This Inner Loop Header: Depth=1 1492 movzx eax, word ptr [rcx + 2*rsi] 1493 add ax, word ptr [rdx + 2*rsi] 1494 mov word ptr [r8 + 2*rsi], ax 1495 movzx eax, word ptr [rcx + 2*rsi + 2] 1496 add ax, word ptr [rdx + 2*rsi + 2] 1497 mov word ptr [r8 + 2*rsi + 2], ax 1498 movzx eax, word ptr [rcx + 2*rsi + 4] 1499 add ax, word ptr [rdx + 2*rsi + 4] 1500 mov word ptr [r8 + 2*rsi + 4], ax 1501 movzx eax, word ptr [rcx + 2*rsi + 6] 1502 add ax, word ptr [rdx + 2*rsi + 6] 1503 mov word ptr [r8 + 2*rsi + 6], ax 1504 add rsi, 4 1505 cmp r10, rsi 1506 jne .LBB0_106 1507 jmp .LBB0_1013 1508 .LBB0_421: 1509 test r9d, r9d 1510 jle .LBB0_1013 1511 # %bb.422: 1512 mov r10d, r9d 1513 cmp r9d, 16 1514 jae .LBB0_424 1515 # %bb.423: 1516 xor esi, esi 1517 .LBB0_433: 1518 mov r9, rsi 1519 not r9 1520 add r9, r10 1521 mov rdi, r10 1522 and rdi, 3 1523 je .LBB0_435 1524 .LBB0_434: # =>This Inner Loop Header: Depth=1 1525 movzx eax, word ptr [rdx + 2*rsi] 1526 sub ax, word ptr [rcx + 2*rsi] 1527 mov word ptr [r8 + 2*rsi], ax 1528 add rsi, 1 1529 add rdi, -1 1530 jne .LBB0_434 1531 .LBB0_435: 1532 cmp r9, 3 1533 jb .LBB0_1013 1534 .LBB0_436: # =>This Inner Loop Header: Depth=1 1535 movzx eax, word ptr [rdx + 2*rsi] 1536 sub ax, word ptr [rcx + 2*rsi] 1537 mov word ptr [r8 + 2*rsi], ax 1538 
movzx eax, word ptr [rdx + 2*rsi + 2] 1539 sub ax, word ptr [rcx + 2*rsi + 2] 1540 mov word ptr [r8 + 2*rsi + 2], ax 1541 movzx eax, word ptr [rdx + 2*rsi + 4] 1542 sub ax, word ptr [rcx + 2*rsi + 4] 1543 mov word ptr [r8 + 2*rsi + 4], ax 1544 movzx eax, word ptr [rdx + 2*rsi + 6] 1545 sub ax, word ptr [rcx + 2*rsi + 6] 1546 mov word ptr [r8 + 2*rsi + 6], ax 1547 add rsi, 4 1548 cmp r10, rsi 1549 jne .LBB0_436 1550 jmp .LBB0_1013 1551 .LBB0_437: 1552 test r9d, r9d 1553 jle .LBB0_1013 1554 # %bb.438: 1555 mov r10d, r9d 1556 cmp r9d, 16 1557 jae .LBB0_440 1558 # %bb.439: 1559 xor esi, esi 1560 .LBB0_449: 1561 mov r9, rsi 1562 not r9 1563 add r9, r10 1564 mov rdi, r10 1565 and rdi, 3 1566 je .LBB0_451 1567 .LBB0_450: # =>This Inner Loop Header: Depth=1 1568 movzx eax, word ptr [rdx + 2*rsi] 1569 sub ax, word ptr [rcx + 2*rsi] 1570 mov word ptr [r8 + 2*rsi], ax 1571 add rsi, 1 1572 add rdi, -1 1573 jne .LBB0_450 1574 .LBB0_451: 1575 cmp r9, 3 1576 jb .LBB0_1013 1577 .LBB0_452: # =>This Inner Loop Header: Depth=1 1578 movzx eax, word ptr [rdx + 2*rsi] 1579 sub ax, word ptr [rcx + 2*rsi] 1580 mov word ptr [r8 + 2*rsi], ax 1581 movzx eax, word ptr [rdx + 2*rsi + 2] 1582 sub ax, word ptr [rcx + 2*rsi + 2] 1583 mov word ptr [r8 + 2*rsi + 2], ax 1584 movzx eax, word ptr [rdx + 2*rsi + 4] 1585 sub ax, word ptr [rcx + 2*rsi + 4] 1586 mov word ptr [r8 + 2*rsi + 4], ax 1587 movzx eax, word ptr [rdx + 2*rsi + 6] 1588 sub ax, word ptr [rcx + 2*rsi + 6] 1589 mov word ptr [r8 + 2*rsi + 6], ax 1590 add rsi, 4 1591 cmp r10, rsi 1592 jne .LBB0_452 1593 jmp .LBB0_1013 1594 .LBB0_248: 1595 test r9d, r9d 1596 jle .LBB0_1013 1597 # %bb.249: 1598 mov r10d, r9d 1599 cmp r9d, 16 1600 jae .LBB0_251 1601 # %bb.250: 1602 xor esi, esi 1603 .LBB0_260: 1604 mov r9, rsi 1605 not r9 1606 add r9, r10 1607 mov rdi, r10 1608 and rdi, 3 1609 je .LBB0_262 1610 .LBB0_261: # =>This Inner Loop Header: Depth=1 1611 movzx eax, word ptr [rcx + 2*rsi] 1612 add ax, word ptr [rdx + 2*rsi] 1613 mov word ptr [r8 + 
2*rsi], ax 1614 add rsi, 1 1615 add rdi, -1 1616 jne .LBB0_261 1617 .LBB0_262: 1618 cmp r9, 3 1619 jb .LBB0_1013 1620 .LBB0_263: # =>This Inner Loop Header: Depth=1 1621 movzx eax, word ptr [rcx + 2*rsi] 1622 add ax, word ptr [rdx + 2*rsi] 1623 mov word ptr [r8 + 2*rsi], ax 1624 movzx eax, word ptr [rcx + 2*rsi + 2] 1625 add ax, word ptr [rdx + 2*rsi + 2] 1626 mov word ptr [r8 + 2*rsi + 2], ax 1627 movzx eax, word ptr [rcx + 2*rsi + 4] 1628 add ax, word ptr [rdx + 2*rsi + 4] 1629 mov word ptr [r8 + 2*rsi + 4], ax 1630 movzx eax, word ptr [rcx + 2*rsi + 6] 1631 add ax, word ptr [rdx + 2*rsi + 6] 1632 mov word ptr [r8 + 2*rsi + 6], ax 1633 add rsi, 4 1634 cmp r10, rsi 1635 jne .LBB0_263 1636 jmp .LBB0_1013 1637 .LBB0_264: 1638 test r9d, r9d 1639 jle .LBB0_1013 1640 # %bb.265: 1641 mov r10d, r9d 1642 cmp r9d, 16 1643 jae .LBB0_267 1644 # %bb.266: 1645 xor esi, esi 1646 .LBB0_276: 1647 mov r9, rsi 1648 not r9 1649 add r9, r10 1650 mov rdi, r10 1651 and rdi, 3 1652 je .LBB0_278 1653 .LBB0_277: # =>This Inner Loop Header: Depth=1 1654 movzx eax, word ptr [rcx + 2*rsi] 1655 add ax, word ptr [rdx + 2*rsi] 1656 mov word ptr [r8 + 2*rsi], ax 1657 add rsi, 1 1658 add rdi, -1 1659 jne .LBB0_277 1660 .LBB0_278: 1661 cmp r9, 3 1662 jb .LBB0_1013 1663 .LBB0_279: # =>This Inner Loop Header: Depth=1 1664 movzx eax, word ptr [rcx + 2*rsi] 1665 add ax, word ptr [rdx + 2*rsi] 1666 mov word ptr [r8 + 2*rsi], ax 1667 movzx eax, word ptr [rcx + 2*rsi + 2] 1668 add ax, word ptr [rdx + 2*rsi + 2] 1669 mov word ptr [r8 + 2*rsi + 2], ax 1670 movzx eax, word ptr [rcx + 2*rsi + 4] 1671 add ax, word ptr [rdx + 2*rsi + 4] 1672 mov word ptr [r8 + 2*rsi + 4], ax 1673 movzx eax, word ptr [rcx + 2*rsi + 6] 1674 add ax, word ptr [rdx + 2*rsi + 6] 1675 mov word ptr [r8 + 2*rsi + 6], ax 1676 add rsi, 4 1677 cmp r10, rsi 1678 jne .LBB0_279 1679 jmp .LBB0_1013 1680 .LBB0_594: 1681 test r9d, r9d 1682 jle .LBB0_1013 1683 # %bb.595: 1684 mov r10d, r9d 1685 cmp r9d, 16 1686 jae .LBB0_597 1687 # %bb.596: 1688 
xor esi, esi 1689 .LBB0_606: 1690 mov r9, rsi 1691 not r9 1692 add r9, r10 1693 mov rdi, r10 1694 and rdi, 3 1695 je .LBB0_608 1696 .LBB0_607: # =>This Inner Loop Header: Depth=1 1697 movzx eax, word ptr [rdx + 2*rsi] 1698 sub ax, word ptr [rcx + 2*rsi] 1699 mov word ptr [r8 + 2*rsi], ax 1700 add rsi, 1 1701 add rdi, -1 1702 jne .LBB0_607 1703 .LBB0_608: 1704 cmp r9, 3 1705 jb .LBB0_1013 1706 .LBB0_609: # =>This Inner Loop Header: Depth=1 1707 movzx eax, word ptr [rdx + 2*rsi] 1708 sub ax, word ptr [rcx + 2*rsi] 1709 mov word ptr [r8 + 2*rsi], ax 1710 movzx eax, word ptr [rdx + 2*rsi + 2] 1711 sub ax, word ptr [rcx + 2*rsi + 2] 1712 mov word ptr [r8 + 2*rsi + 2], ax 1713 movzx eax, word ptr [rdx + 2*rsi + 4] 1714 sub ax, word ptr [rcx + 2*rsi + 4] 1715 mov word ptr [r8 + 2*rsi + 4], ax 1716 movzx eax, word ptr [rdx + 2*rsi + 6] 1717 sub ax, word ptr [rcx + 2*rsi + 6] 1718 mov word ptr [r8 + 2*rsi + 6], ax 1719 add rsi, 4 1720 cmp r10, rsi 1721 jne .LBB0_609 1722 jmp .LBB0_1013 1723 .LBB0_610: 1724 test r9d, r9d 1725 jle .LBB0_1013 1726 # %bb.611: 1727 mov r10d, r9d 1728 cmp r9d, 16 1729 jae .LBB0_613 1730 # %bb.612: 1731 xor esi, esi 1732 .LBB0_622: 1733 mov r9, rsi 1734 not r9 1735 add r9, r10 1736 mov rdi, r10 1737 and rdi, 3 1738 je .LBB0_624 1739 .LBB0_623: # =>This Inner Loop Header: Depth=1 1740 movzx eax, word ptr [rdx + 2*rsi] 1741 sub ax, word ptr [rcx + 2*rsi] 1742 mov word ptr [r8 + 2*rsi], ax 1743 add rsi, 1 1744 add rdi, -1 1745 jne .LBB0_623 1746 .LBB0_624: 1747 cmp r9, 3 1748 jb .LBB0_1013 1749 .LBB0_625: # =>This Inner Loop Header: Depth=1 1750 movzx eax, word ptr [rdx + 2*rsi] 1751 sub ax, word ptr [rcx + 2*rsi] 1752 mov word ptr [r8 + 2*rsi], ax 1753 movzx eax, word ptr [rdx + 2*rsi + 2] 1754 sub ax, word ptr [rcx + 2*rsi + 2] 1755 mov word ptr [r8 + 2*rsi + 2], ax 1756 movzx eax, word ptr [rdx + 2*rsi + 4] 1757 sub ax, word ptr [rcx + 2*rsi + 4] 1758 mov word ptr [r8 + 2*rsi + 4], ax 1759 movzx eax, word ptr [rdx + 2*rsi + 6] 1760 sub ax, word 
ptr [rcx + 2*rsi + 6] 1761 mov word ptr [r8 + 2*rsi + 6], ax 1762 add rsi, 4 1763 cmp r10, rsi 1764 jne .LBB0_625 1765 jmp .LBB0_1013 1766 .LBB0_826: 1767 test r9d, r9d 1768 jle .LBB0_1013 1769 # %bb.827: 1770 mov esi, r9d 1771 lea rdi, [rsi - 1] 1772 mov r9d, esi 1773 and r9d, 3 1774 cmp rdi, 3 1775 jae .LBB0_829 1776 # %bb.828: 1777 xor edi, edi 1778 jmp .LBB0_831 1779 .LBB0_834: 1780 test r9d, r9d 1781 jle .LBB0_1013 1782 # %bb.835: 1783 mov r10d, r9d 1784 cmp r9d, 8 1785 jae .LBB0_837 1786 # %bb.836: 1787 xor esi, esi 1788 .LBB0_846: 1789 mov rax, rsi 1790 not rax 1791 add rax, r10 1792 mov rdi, r10 1793 and rdi, 3 1794 je .LBB0_848 1795 .LBB0_847: # =>This Inner Loop Header: Depth=1 1796 movss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 1797 mulss xmm0, dword ptr [rdx + 4*rsi] 1798 movss dword ptr [r8 + 4*rsi], xmm0 1799 add rsi, 1 1800 add rdi, -1 1801 jne .LBB0_847 1802 .LBB0_848: 1803 cmp rax, 3 1804 jb .LBB0_1013 1805 .LBB0_849: # =>This Inner Loop Header: Depth=1 1806 movss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 1807 mulss xmm0, dword ptr [rdx + 4*rsi] 1808 movss dword ptr [r8 + 4*rsi], xmm0 1809 movss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero 1810 mulss xmm0, dword ptr [rdx + 4*rsi + 4] 1811 movss dword ptr [r8 + 4*rsi + 4], xmm0 1812 movss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero 1813 mulss xmm0, dword ptr [rdx + 4*rsi + 8] 1814 movss dword ptr [r8 + 4*rsi + 8], xmm0 1815 movss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero 1816 mulss xmm0, dword ptr [rdx + 4*rsi + 12] 1817 movss dword ptr [r8 + 4*rsi + 12], xmm0 1818 add rsi, 4 1819 cmp r10, rsi 1820 jne .LBB0_849 1821 jmp .LBB0_1013 1822 .LBB0_976: 1823 test r9d, r9d 1824 jle .LBB0_1013 1825 # %bb.977: 1826 mov esi, r9d 1827 lea rdi, [rsi - 1] 1828 mov r9d, esi 1829 and r9d, 3 1830 cmp rdi, 3 1831 jae .LBB0_979 1832 # %bb.978: 1833 xor edi, edi 1834 jmp .LBB0_981 1835 .LBB0_984: 1836 test r9d, r9d 
1837 jle .LBB0_1013 1838 # %bb.985: 1839 mov r10d, r9d 1840 cmp r9d, 8 1841 jae .LBB0_987 1842 # %bb.986: 1843 xor esi, esi 1844 .LBB0_996: 1845 mov rax, rsi 1846 not rax 1847 add rax, r10 1848 mov rdi, r10 1849 and rdi, 3 1850 je .LBB0_998 1851 .LBB0_997: # =>This Inner Loop Header: Depth=1 1852 movss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 1853 mulss xmm0, dword ptr [rdx + 4*rsi] 1854 movss dword ptr [r8 + 4*rsi], xmm0 1855 add rsi, 1 1856 add rdi, -1 1857 jne .LBB0_997 1858 .LBB0_998: 1859 cmp rax, 3 1860 jb .LBB0_1013 1861 .LBB0_999: # =>This Inner Loop Header: Depth=1 1862 movss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 1863 mulss xmm0, dword ptr [rdx + 4*rsi] 1864 movss dword ptr [r8 + 4*rsi], xmm0 1865 movss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero 1866 mulss xmm0, dword ptr [rdx + 4*rsi + 4] 1867 movss dword ptr [r8 + 4*rsi + 4], xmm0 1868 movss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero 1869 mulss xmm0, dword ptr [rdx + 4*rsi + 8] 1870 movss dword ptr [r8 + 4*rsi + 8], xmm0 1871 movss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero 1872 mulss xmm0, dword ptr [rdx + 4*rsi + 12] 1873 movss dword ptr [r8 + 4*rsi + 12], xmm0 1874 add rsi, 4 1875 cmp r10, rsi 1876 jne .LBB0_999 1877 jmp .LBB0_1013 1878 .LBB0_149: 1879 test r9d, r9d 1880 jle .LBB0_1013 1881 # %bb.150: 1882 mov r10d, r9d 1883 cmp r9d, 4 1884 jae .LBB0_152 1885 # %bb.151: 1886 xor esi, esi 1887 .LBB0_161: 1888 mov r9, rsi 1889 not r9 1890 add r9, r10 1891 mov rdi, r10 1892 and rdi, 3 1893 je .LBB0_163 1894 .LBB0_162: # =>This Inner Loop Header: Depth=1 1895 mov rax, qword ptr [rcx + 8*rsi] 1896 add rax, qword ptr [rdx + 8*rsi] 1897 mov qword ptr [r8 + 8*rsi], rax 1898 add rsi, 1 1899 add rdi, -1 1900 jne .LBB0_162 1901 .LBB0_163: 1902 cmp r9, 3 1903 jb .LBB0_1013 1904 .LBB0_164: # =>This Inner Loop Header: Depth=1 1905 mov rax, qword ptr [rcx + 8*rsi] 1906 add rax, qword ptr [rdx + 8*rsi] 1907 
mov qword ptr [r8 + 8*rsi], rax 1908 mov rax, qword ptr [rcx + 8*rsi + 8] 1909 add rax, qword ptr [rdx + 8*rsi + 8] 1910 mov qword ptr [r8 + 8*rsi + 8], rax 1911 mov rax, qword ptr [rcx + 8*rsi + 16] 1912 add rax, qword ptr [rdx + 8*rsi + 16] 1913 mov qword ptr [r8 + 8*rsi + 16], rax 1914 mov rax, qword ptr [rcx + 8*rsi + 24] 1915 add rax, qword ptr [rdx + 8*rsi + 24] 1916 mov qword ptr [r8 + 8*rsi + 24], rax 1917 add rsi, 4 1918 cmp r10, rsi 1919 jne .LBB0_164 1920 jmp .LBB0_1013 1921 .LBB0_165: 1922 test r9d, r9d 1923 jle .LBB0_1013 1924 # %bb.166: 1925 mov r10d, r9d 1926 cmp r9d, 8 1927 jae .LBB0_168 1928 # %bb.167: 1929 xor esi, esi 1930 .LBB0_177: 1931 mov rax, rsi 1932 not rax 1933 add rax, r10 1934 mov rdi, r10 1935 and rdi, 3 1936 je .LBB0_179 1937 .LBB0_178: # =>This Inner Loop Header: Depth=1 1938 movss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 1939 addss xmm0, dword ptr [rdx + 4*rsi] 1940 movss dword ptr [r8 + 4*rsi], xmm0 1941 add rsi, 1 1942 add rdi, -1 1943 jne .LBB0_178 1944 .LBB0_179: 1945 cmp rax, 3 1946 jb .LBB0_1013 1947 .LBB0_180: # =>This Inner Loop Header: Depth=1 1948 movss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 1949 addss xmm0, dword ptr [rdx + 4*rsi] 1950 movss dword ptr [r8 + 4*rsi], xmm0 1951 movss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero 1952 addss xmm0, dword ptr [rdx + 4*rsi + 4] 1953 movss dword ptr [r8 + 4*rsi + 4], xmm0 1954 movss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero 1955 addss xmm0, dword ptr [rdx + 4*rsi + 8] 1956 movss dword ptr [r8 + 4*rsi + 8], xmm0 1957 movss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero 1958 addss xmm0, dword ptr [rdx + 4*rsi + 12] 1959 movss dword ptr [r8 + 4*rsi + 12], xmm0 1960 add rsi, 4 1961 cmp r10, rsi 1962 jne .LBB0_180 1963 jmp .LBB0_1013 1964 .LBB0_495: 1965 test r9d, r9d 1966 jle .LBB0_1013 1967 # %bb.496: 1968 mov r10d, r9d 1969 cmp r9d, 4 1970 jae .LBB0_498 1971 # %bb.497: 1972 xor 
esi, esi 1973 .LBB0_507: 1974 mov r9, rsi 1975 not r9 1976 add r9, r10 1977 mov rdi, r10 1978 and rdi, 3 1979 je .LBB0_509 1980 .LBB0_508: # =>This Inner Loop Header: Depth=1 1981 mov rax, qword ptr [rdx + 8*rsi] 1982 sub rax, qword ptr [rcx + 8*rsi] 1983 mov qword ptr [r8 + 8*rsi], rax 1984 add rsi, 1 1985 add rdi, -1 1986 jne .LBB0_508 1987 .LBB0_509: 1988 cmp r9, 3 1989 jb .LBB0_1013 1990 .LBB0_510: # =>This Inner Loop Header: Depth=1 1991 mov rax, qword ptr [rdx + 8*rsi] 1992 sub rax, qword ptr [rcx + 8*rsi] 1993 mov qword ptr [r8 + 8*rsi], rax 1994 mov rax, qword ptr [rdx + 8*rsi + 8] 1995 sub rax, qword ptr [rcx + 8*rsi + 8] 1996 mov qword ptr [r8 + 8*rsi + 8], rax 1997 mov rax, qword ptr [rdx + 8*rsi + 16] 1998 sub rax, qword ptr [rcx + 8*rsi + 16] 1999 mov qword ptr [r8 + 8*rsi + 16], rax 2000 mov rax, qword ptr [rdx + 8*rsi + 24] 2001 sub rax, qword ptr [rcx + 8*rsi + 24] 2002 mov qword ptr [r8 + 8*rsi + 24], rax 2003 add rsi, 4 2004 cmp r10, rsi 2005 jne .LBB0_510 2006 jmp .LBB0_1013 2007 .LBB0_511: 2008 test r9d, r9d 2009 jle .LBB0_1013 2010 # %bb.512: 2011 mov r10d, r9d 2012 cmp r9d, 8 2013 jae .LBB0_514 2014 # %bb.513: 2015 xor esi, esi 2016 .LBB0_523: 2017 mov rax, rsi 2018 not rax 2019 add rax, r10 2020 mov rdi, r10 2021 and rdi, 3 2022 je .LBB0_525 2023 .LBB0_524: # =>This Inner Loop Header: Depth=1 2024 movss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 2025 subss xmm0, dword ptr [rcx + 4*rsi] 2026 movss dword ptr [r8 + 4*rsi], xmm0 2027 add rsi, 1 2028 add rdi, -1 2029 jne .LBB0_524 2030 .LBB0_525: 2031 cmp rax, 3 2032 jb .LBB0_1013 2033 .LBB0_526: # =>This Inner Loop Header: Depth=1 2034 movss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 2035 subss xmm0, dword ptr [rcx + 4*rsi] 2036 movss dword ptr [r8 + 4*rsi], xmm0 2037 movss xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero 2038 subss xmm0, dword ptr [rcx + 4*rsi + 4] 2039 movss dword ptr [r8 + 4*rsi + 4], xmm0 2040 movss xmm0, dword ptr [rdx + 
4*rsi + 8] # xmm0 = mem[0],zero,zero,zero 2041 subss xmm0, dword ptr [rcx + 4*rsi + 8] 2042 movss dword ptr [r8 + 4*rsi + 8], xmm0 2043 movss xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero 2044 subss xmm0, dword ptr [rcx + 4*rsi + 12] 2045 movss dword ptr [r8 + 4*rsi + 12], xmm0 2046 add rsi, 4 2047 cmp r10, rsi 2048 jne .LBB0_526 2049 jmp .LBB0_1013 2050 .LBB0_322: 2051 test r9d, r9d 2052 jle .LBB0_1013 2053 # %bb.323: 2054 mov r10d, r9d 2055 cmp r9d, 4 2056 jae .LBB0_325 2057 # %bb.324: 2058 xor esi, esi 2059 .LBB0_334: 2060 mov r9, rsi 2061 not r9 2062 add r9, r10 2063 mov rdi, r10 2064 and rdi, 3 2065 je .LBB0_336 2066 .LBB0_335: # =>This Inner Loop Header: Depth=1 2067 mov rax, qword ptr [rcx + 8*rsi] 2068 add rax, qword ptr [rdx + 8*rsi] 2069 mov qword ptr [r8 + 8*rsi], rax 2070 add rsi, 1 2071 add rdi, -1 2072 jne .LBB0_335 2073 .LBB0_336: 2074 cmp r9, 3 2075 jb .LBB0_1013 2076 .LBB0_337: # =>This Inner Loop Header: Depth=1 2077 mov rax, qword ptr [rcx + 8*rsi] 2078 add rax, qword ptr [rdx + 8*rsi] 2079 mov qword ptr [r8 + 8*rsi], rax 2080 mov rax, qword ptr [rcx + 8*rsi + 8] 2081 add rax, qword ptr [rdx + 8*rsi + 8] 2082 mov qword ptr [r8 + 8*rsi + 8], rax 2083 mov rax, qword ptr [rcx + 8*rsi + 16] 2084 add rax, qword ptr [rdx + 8*rsi + 16] 2085 mov qword ptr [r8 + 8*rsi + 16], rax 2086 mov rax, qword ptr [rcx + 8*rsi + 24] 2087 add rax, qword ptr [rdx + 8*rsi + 24] 2088 mov qword ptr [r8 + 8*rsi + 24], rax 2089 add rsi, 4 2090 cmp r10, rsi 2091 jne .LBB0_337 2092 jmp .LBB0_1013 2093 .LBB0_338: 2094 test r9d, r9d 2095 jle .LBB0_1013 2096 # %bb.339: 2097 mov r10d, r9d 2098 cmp r9d, 8 2099 jae .LBB0_341 2100 # %bb.340: 2101 xor esi, esi 2102 .LBB0_350: 2103 mov rax, rsi 2104 not rax 2105 add rax, r10 2106 mov rdi, r10 2107 and rdi, 3 2108 je .LBB0_352 2109 .LBB0_351: # =>This Inner Loop Header: Depth=1 2110 movss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 2111 addss xmm0, dword ptr [rdx + 4*rsi] 2112 movss dword ptr [r8 + 
4*rsi], xmm0 2113 add rsi, 1 2114 add rdi, -1 2115 jne .LBB0_351 2116 .LBB0_352: 2117 cmp rax, 3 2118 jb .LBB0_1013 2119 .LBB0_353: # =>This Inner Loop Header: Depth=1 2120 movss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 2121 addss xmm0, dword ptr [rdx + 4*rsi] 2122 movss dword ptr [r8 + 4*rsi], xmm0 2123 movss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero 2124 addss xmm0, dword ptr [rdx + 4*rsi + 4] 2125 movss dword ptr [r8 + 4*rsi + 4], xmm0 2126 movss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero 2127 addss xmm0, dword ptr [rdx + 4*rsi + 8] 2128 movss dword ptr [r8 + 4*rsi + 8], xmm0 2129 movss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero 2130 addss xmm0, dword ptr [rdx + 4*rsi + 12] 2131 movss dword ptr [r8 + 4*rsi + 12], xmm0 2132 add rsi, 4 2133 cmp r10, rsi 2134 jne .LBB0_353 2135 jmp .LBB0_1013 2136 .LBB0_668: 2137 test r9d, r9d 2138 jle .LBB0_1013 2139 # %bb.669: 2140 mov r10d, r9d 2141 cmp r9d, 4 2142 jae .LBB0_671 2143 # %bb.670: 2144 xor esi, esi 2145 .LBB0_680: 2146 mov r9, rsi 2147 not r9 2148 add r9, r10 2149 mov rdi, r10 2150 and rdi, 3 2151 je .LBB0_682 2152 .LBB0_681: # =>This Inner Loop Header: Depth=1 2153 mov rax, qword ptr [rdx + 8*rsi] 2154 sub rax, qword ptr [rcx + 8*rsi] 2155 mov qword ptr [r8 + 8*rsi], rax 2156 add rsi, 1 2157 add rdi, -1 2158 jne .LBB0_681 2159 .LBB0_682: 2160 cmp r9, 3 2161 jb .LBB0_1013 2162 .LBB0_683: # =>This Inner Loop Header: Depth=1 2163 mov rax, qword ptr [rdx + 8*rsi] 2164 sub rax, qword ptr [rcx + 8*rsi] 2165 mov qword ptr [r8 + 8*rsi], rax 2166 mov rax, qword ptr [rdx + 8*rsi + 8] 2167 sub rax, qword ptr [rcx + 8*rsi + 8] 2168 mov qword ptr [r8 + 8*rsi + 8], rax 2169 mov rax, qword ptr [rdx + 8*rsi + 16] 2170 sub rax, qword ptr [rcx + 8*rsi + 16] 2171 mov qword ptr [r8 + 8*rsi + 16], rax 2172 mov rax, qword ptr [rdx + 8*rsi + 24] 2173 sub rax, qword ptr [rcx + 8*rsi + 24] 2174 mov qword ptr [r8 + 8*rsi + 24], rax 2175 add rsi, 4 2176 
cmp r10, rsi 2177 jne .LBB0_683 2178 jmp .LBB0_1013 2179 .LBB0_684: 2180 test r9d, r9d 2181 jle .LBB0_1013 2182 # %bb.685: 2183 mov r10d, r9d 2184 cmp r9d, 8 2185 jae .LBB0_687 2186 # %bb.686: 2187 xor esi, esi 2188 .LBB0_696: 2189 mov rax, rsi 2190 not rax 2191 add rax, r10 2192 mov rdi, r10 2193 and rdi, 3 2194 je .LBB0_698 2195 .LBB0_697: # =>This Inner Loop Header: Depth=1 2196 movss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 2197 subss xmm0, dword ptr [rcx + 4*rsi] 2198 movss dword ptr [r8 + 4*rsi], xmm0 2199 add rsi, 1 2200 add rdi, -1 2201 jne .LBB0_697 2202 .LBB0_698: 2203 cmp rax, 3 2204 jb .LBB0_1013 2205 .LBB0_699: # =>This Inner Loop Header: Depth=1 2206 movss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 2207 subss xmm0, dword ptr [rcx + 4*rsi] 2208 movss dword ptr [r8 + 4*rsi], xmm0 2209 movss xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero 2210 subss xmm0, dword ptr [rcx + 4*rsi + 4] 2211 movss dword ptr [r8 + 4*rsi + 4], xmm0 2212 movss xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero 2213 subss xmm0, dword ptr [rcx + 4*rsi + 8] 2214 movss dword ptr [r8 + 4*rsi + 8], xmm0 2215 movss xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero 2216 subss xmm0, dword ptr [rcx + 4*rsi + 12] 2217 movss dword ptr [r8 + 4*rsi + 12], xmm0 2218 add rsi, 4 2219 cmp r10, rsi 2220 jne .LBB0_699 2221 jmp .LBB0_1013 2222 .LBB0_731: 2223 test r9d, r9d 2224 jle .LBB0_1013 2225 # %bb.732: 2226 mov r10d, r9d 2227 cmp r9d, 32 2228 jae .LBB0_734 2229 # %bb.733: 2230 xor edi, edi 2231 .LBB0_743: 2232 mov r9, rdi 2233 not r9 2234 add r9, r10 2235 mov rsi, r10 2236 and rsi, 3 2237 je .LBB0_745 2238 .LBB0_744: # =>This Inner Loop Header: Depth=1 2239 movzx eax, byte ptr [rcx + rdi] 2240 mul byte ptr [rdx + rdi] 2241 mov byte ptr [r8 + rdi], al 2242 add rdi, 1 2243 add rsi, -1 2244 jne .LBB0_744 2245 .LBB0_745: 2246 cmp r9, 3 2247 jb .LBB0_1013 2248 .LBB0_746: # =>This Inner Loop Header: Depth=1 2249 
movzx eax, byte ptr [rcx + rdi] 2250 mul byte ptr [rdx + rdi] 2251 mov byte ptr [r8 + rdi], al 2252 movzx eax, byte ptr [rcx + rdi + 1] 2253 mul byte ptr [rdx + rdi + 1] 2254 mov byte ptr [r8 + rdi + 1], al 2255 movzx eax, byte ptr [rcx + rdi + 2] 2256 mul byte ptr [rdx + rdi + 2] 2257 mov byte ptr [r8 + rdi + 2], al 2258 movzx eax, byte ptr [rcx + rdi + 3] 2259 mul byte ptr [rdx + rdi + 3] 2260 mov byte ptr [r8 + rdi + 3], al 2261 add rdi, 4 2262 cmp r10, rdi 2263 jne .LBB0_746 2264 jmp .LBB0_1013 2265 .LBB0_881: 2266 test r9d, r9d 2267 jle .LBB0_1013 2268 # %bb.882: 2269 mov r10d, r9d 2270 cmp r9d, 32 2271 jae .LBB0_884 2272 # %bb.883: 2273 xor edi, edi 2274 .LBB0_893: 2275 mov r9, rdi 2276 not r9 2277 add r9, r10 2278 mov rsi, r10 2279 and rsi, 3 2280 je .LBB0_895 2281 .LBB0_894: # =>This Inner Loop Header: Depth=1 2282 movzx eax, byte ptr [rcx + rdi] 2283 mul byte ptr [rdx + rdi] 2284 mov byte ptr [r8 + rdi], al 2285 add rdi, 1 2286 add rsi, -1 2287 jne .LBB0_894 2288 .LBB0_895: 2289 cmp r9, 3 2290 jb .LBB0_1013 2291 .LBB0_896: # =>This Inner Loop Header: Depth=1 2292 movzx eax, byte ptr [rcx + rdi] 2293 mul byte ptr [rdx + rdi] 2294 mov byte ptr [r8 + rdi], al 2295 movzx eax, byte ptr [rcx + rdi + 1] 2296 mul byte ptr [rdx + rdi + 1] 2297 mov byte ptr [r8 + rdi + 1], al 2298 movzx eax, byte ptr [rcx + rdi + 2] 2299 mul byte ptr [rdx + rdi + 2] 2300 mov byte ptr [r8 + rdi + 2], al 2301 movzx eax, byte ptr [rcx + rdi + 3] 2302 mul byte ptr [rdx + rdi + 3] 2303 mov byte ptr [r8 + rdi + 3], al 2304 add rdi, 4 2305 cmp r10, rdi 2306 jne .LBB0_896 2307 jmp .LBB0_1013 2308 .LBB0_46: 2309 test r9d, r9d 2310 jle .LBB0_1013 2311 # %bb.47: 2312 mov r10d, r9d 2313 cmp r9d, 32 2314 jae .LBB0_49 2315 # %bb.48: 2316 xor esi, esi 2317 .LBB0_58: 2318 mov r9, rsi 2319 not r9 2320 add r9, r10 2321 mov rdi, r10 2322 and rdi, 3 2323 je .LBB0_60 2324 .LBB0_59: # =>This Inner Loop Header: Depth=1 2325 movzx eax, byte ptr [rcx + rsi] 2326 add al, byte ptr [rdx + rsi] 2327 mov byte 
ptr [r8 + rsi], al 2328 add rsi, 1 2329 add rdi, -1 2330 jne .LBB0_59 2331 .LBB0_60: 2332 cmp r9, 3 2333 jb .LBB0_1013 2334 .LBB0_61: # =>This Inner Loop Header: Depth=1 2335 movzx eax, byte ptr [rcx + rsi] 2336 add al, byte ptr [rdx + rsi] 2337 mov byte ptr [r8 + rsi], al 2338 movzx eax, byte ptr [rcx + rsi + 1] 2339 add al, byte ptr [rdx + rsi + 1] 2340 mov byte ptr [r8 + rsi + 1], al 2341 movzx eax, byte ptr [rcx + rsi + 2] 2342 add al, byte ptr [rdx + rsi + 2] 2343 mov byte ptr [r8 + rsi + 2], al 2344 movzx eax, byte ptr [rcx + rsi + 3] 2345 add al, byte ptr [rdx + rsi + 3] 2346 mov byte ptr [r8 + rsi + 3], al 2347 add rsi, 4 2348 cmp r10, rsi 2349 jne .LBB0_61 2350 jmp .LBB0_1013 2351 .LBB0_392: 2352 test r9d, r9d 2353 jle .LBB0_1013 2354 # %bb.393: 2355 mov r10d, r9d 2356 cmp r9d, 32 2357 jae .LBB0_395 2358 # %bb.394: 2359 xor esi, esi 2360 .LBB0_404: 2361 mov r9, rsi 2362 not r9 2363 add r9, r10 2364 mov rdi, r10 2365 and rdi, 3 2366 je .LBB0_406 2367 .LBB0_405: # =>This Inner Loop Header: Depth=1 2368 movzx eax, byte ptr [rdx + rsi] 2369 sub al, byte ptr [rcx + rsi] 2370 mov byte ptr [r8 + rsi], al 2371 add rsi, 1 2372 add rdi, -1 2373 jne .LBB0_405 2374 .LBB0_406: 2375 cmp r9, 3 2376 jb .LBB0_1013 2377 .LBB0_407: # =>This Inner Loop Header: Depth=1 2378 movzx eax, byte ptr [rdx + rsi] 2379 sub al, byte ptr [rcx + rsi] 2380 mov byte ptr [r8 + rsi], al 2381 movzx eax, byte ptr [rdx + rsi + 1] 2382 sub al, byte ptr [rcx + rsi + 1] 2383 mov byte ptr [r8 + rsi + 1], al 2384 movzx eax, byte ptr [rdx + rsi + 2] 2385 sub al, byte ptr [rcx + rsi + 2] 2386 mov byte ptr [r8 + rsi + 2], al 2387 movzx eax, byte ptr [rdx + rsi + 3] 2388 sub al, byte ptr [rcx + rsi + 3] 2389 mov byte ptr [r8 + rsi + 3], al 2390 add rsi, 4 2391 cmp r10, rsi 2392 jne .LBB0_407 2393 jmp .LBB0_1013 2394 .LBB0_219: 2395 test r9d, r9d 2396 jle .LBB0_1013 2397 # %bb.220: 2398 mov r10d, r9d 2399 cmp r9d, 32 2400 jae .LBB0_222 2401 # %bb.221: 2402 xor esi, esi 2403 .LBB0_231: 2404 mov r9, rsi 
2405 not r9 2406 add r9, r10 2407 mov rdi, r10 2408 and rdi, 3 2409 je .LBB0_233 2410 .LBB0_232: # =>This Inner Loop Header: Depth=1 2411 movzx eax, byte ptr [rcx + rsi] 2412 add al, byte ptr [rdx + rsi] 2413 mov byte ptr [r8 + rsi], al 2414 add rsi, 1 2415 add rdi, -1 2416 jne .LBB0_232 2417 .LBB0_233: 2418 cmp r9, 3 2419 jb .LBB0_1013 2420 .LBB0_234: # =>This Inner Loop Header: Depth=1 2421 movzx eax, byte ptr [rcx + rsi] 2422 add al, byte ptr [rdx + rsi] 2423 mov byte ptr [r8 + rsi], al 2424 movzx eax, byte ptr [rcx + rsi + 1] 2425 add al, byte ptr [rdx + rsi + 1] 2426 mov byte ptr [r8 + rsi + 1], al 2427 movzx eax, byte ptr [rcx + rsi + 2] 2428 add al, byte ptr [rdx + rsi + 2] 2429 mov byte ptr [r8 + rsi + 2], al 2430 movzx eax, byte ptr [rcx + rsi + 3] 2431 add al, byte ptr [rdx + rsi + 3] 2432 mov byte ptr [r8 + rsi + 3], al 2433 add rsi, 4 2434 cmp r10, rsi 2435 jne .LBB0_234 2436 jmp .LBB0_1013 2437 .LBB0_565: 2438 test r9d, r9d 2439 jle .LBB0_1013 2440 # %bb.566: 2441 mov r10d, r9d 2442 cmp r9d, 32 2443 jae .LBB0_568 2444 # %bb.567: 2445 xor esi, esi 2446 .LBB0_577: 2447 mov r9, rsi 2448 not r9 2449 add r9, r10 2450 mov rdi, r10 2451 and rdi, 3 2452 je .LBB0_579 2453 .LBB0_578: # =>This Inner Loop Header: Depth=1 2454 movzx eax, byte ptr [rdx + rsi] 2455 sub al, byte ptr [rcx + rsi] 2456 mov byte ptr [r8 + rsi], al 2457 add rsi, 1 2458 add rdi, -1 2459 jne .LBB0_578 2460 .LBB0_579: 2461 cmp r9, 3 2462 jb .LBB0_1013 2463 .LBB0_580: # =>This Inner Loop Header: Depth=1 2464 movzx eax, byte ptr [rdx + rsi] 2465 sub al, byte ptr [rcx + rsi] 2466 mov byte ptr [r8 + rsi], al 2467 movzx eax, byte ptr [rdx + rsi + 1] 2468 sub al, byte ptr [rcx + rsi + 1] 2469 mov byte ptr [r8 + rsi + 1], al 2470 movzx eax, byte ptr [rdx + rsi + 2] 2471 sub al, byte ptr [rcx + rsi + 2] 2472 mov byte ptr [r8 + rsi + 2], al 2473 movzx eax, byte ptr [rdx + rsi + 3] 2474 sub al, byte ptr [rcx + rsi + 3] 2475 mov byte ptr [r8 + rsi + 3], al 2476 add rsi, 4 2477 cmp r10, rsi 2478 jne 
.LBB0_580 2479 jmp .LBB0_1013 2480 .LBB0_805: 2481 test r9d, r9d 2482 jle .LBB0_1013 2483 # %bb.806: 2484 mov r10d, r9d 2485 cmp r9d, 8 2486 jae .LBB0_808 2487 # %bb.807: 2488 xor esi, esi 2489 .LBB0_817: 2490 mov r9, rsi 2491 not r9 2492 add r9, r10 2493 mov rdi, r10 2494 and rdi, 3 2495 je .LBB0_819 2496 .LBB0_818: # =>This Inner Loop Header: Depth=1 2497 mov eax, dword ptr [rcx + 4*rsi] 2498 imul eax, dword ptr [rdx + 4*rsi] 2499 mov dword ptr [r8 + 4*rsi], eax 2500 add rsi, 1 2501 add rdi, -1 2502 jne .LBB0_818 2503 .LBB0_819: 2504 cmp r9, 3 2505 jb .LBB0_1013 2506 .LBB0_820: # =>This Inner Loop Header: Depth=1 2507 mov eax, dword ptr [rcx + 4*rsi] 2508 imul eax, dword ptr [rdx + 4*rsi] 2509 mov dword ptr [r8 + 4*rsi], eax 2510 mov eax, dword ptr [rcx + 4*rsi + 4] 2511 imul eax, dword ptr [rdx + 4*rsi + 4] 2512 mov dword ptr [r8 + 4*rsi + 4], eax 2513 mov eax, dword ptr [rcx + 4*rsi + 8] 2514 imul eax, dword ptr [rdx + 4*rsi + 8] 2515 mov dword ptr [r8 + 4*rsi + 8], eax 2516 mov eax, dword ptr [rcx + 4*rsi + 12] 2517 imul eax, dword ptr [rdx + 4*rsi + 12] 2518 mov dword ptr [r8 + 4*rsi + 12], eax 2519 add rsi, 4 2520 cmp r10, rsi 2521 jne .LBB0_820 2522 jmp .LBB0_1013 2523 .LBB0_955: 2524 test r9d, r9d 2525 jle .LBB0_1013 2526 # %bb.956: 2527 mov r10d, r9d 2528 cmp r9d, 8 2529 jae .LBB0_958 2530 # %bb.957: 2531 xor esi, esi 2532 .LBB0_967: 2533 mov r9, rsi 2534 not r9 2535 add r9, r10 2536 mov rdi, r10 2537 and rdi, 3 2538 je .LBB0_969 2539 .LBB0_968: # =>This Inner Loop Header: Depth=1 2540 mov eax, dword ptr [rcx + 4*rsi] 2541 imul eax, dword ptr [rdx + 4*rsi] 2542 mov dword ptr [r8 + 4*rsi], eax 2543 add rsi, 1 2544 add rdi, -1 2545 jne .LBB0_968 2546 .LBB0_969: 2547 cmp r9, 3 2548 jb .LBB0_1013 2549 .LBB0_970: # =>This Inner Loop Header: Depth=1 2550 mov eax, dword ptr [rcx + 4*rsi] 2551 imul eax, dword ptr [rdx + 4*rsi] 2552 mov dword ptr [r8 + 4*rsi], eax 2553 mov eax, dword ptr [rcx + 4*rsi + 4] 2554 imul eax, dword ptr [rdx + 4*rsi + 4] 2555 mov dword 
ptr [r8 + 4*rsi + 4], eax 2556 mov eax, dword ptr [rcx + 4*rsi + 8] 2557 imul eax, dword ptr [rdx + 4*rsi + 8] 2558 mov dword ptr [r8 + 4*rsi + 8], eax 2559 mov eax, dword ptr [rcx + 4*rsi + 12] 2560 imul eax, dword ptr [rdx + 4*rsi + 12] 2561 mov dword ptr [r8 + 4*rsi + 12], eax 2562 add rsi, 4 2563 cmp r10, rsi 2564 jne .LBB0_970 2565 jmp .LBB0_1013 2566 .LBB0_120: 2567 test r9d, r9d 2568 jle .LBB0_1013 2569 # %bb.121: 2570 mov r10d, r9d 2571 cmp r9d, 8 2572 jae .LBB0_123 2573 # %bb.122: 2574 xor esi, esi 2575 .LBB0_132: 2576 mov r9, rsi 2577 not r9 2578 add r9, r10 2579 mov rdi, r10 2580 and rdi, 3 2581 je .LBB0_134 2582 .LBB0_133: # =>This Inner Loop Header: Depth=1 2583 mov eax, dword ptr [rcx + 4*rsi] 2584 add eax, dword ptr [rdx + 4*rsi] 2585 mov dword ptr [r8 + 4*rsi], eax 2586 add rsi, 1 2587 add rdi, -1 2588 jne .LBB0_133 2589 .LBB0_134: 2590 cmp r9, 3 2591 jb .LBB0_1013 2592 .LBB0_135: # =>This Inner Loop Header: Depth=1 2593 mov eax, dword ptr [rcx + 4*rsi] 2594 add eax, dword ptr [rdx + 4*rsi] 2595 mov dword ptr [r8 + 4*rsi], eax 2596 mov eax, dword ptr [rcx + 4*rsi + 4] 2597 add eax, dword ptr [rdx + 4*rsi + 4] 2598 mov dword ptr [r8 + 4*rsi + 4], eax 2599 mov eax, dword ptr [rcx + 4*rsi + 8] 2600 add eax, dword ptr [rdx + 4*rsi + 8] 2601 mov dword ptr [r8 + 4*rsi + 8], eax 2602 mov eax, dword ptr [rcx + 4*rsi + 12] 2603 add eax, dword ptr [rdx + 4*rsi + 12] 2604 mov dword ptr [r8 + 4*rsi + 12], eax 2605 add rsi, 4 2606 cmp r10, rsi 2607 jne .LBB0_135 2608 jmp .LBB0_1013 2609 .LBB0_466: 2610 test r9d, r9d 2611 jle .LBB0_1013 2612 # %bb.467: 2613 mov r10d, r9d 2614 cmp r9d, 8 2615 jae .LBB0_469 2616 # %bb.468: 2617 xor esi, esi 2618 .LBB0_478: 2619 mov r9, rsi 2620 not r9 2621 add r9, r10 2622 mov rdi, r10 2623 and rdi, 3 2624 je .LBB0_480 2625 .LBB0_479: # =>This Inner Loop Header: Depth=1 2626 mov eax, dword ptr [rdx + 4*rsi] 2627 sub eax, dword ptr [rcx + 4*rsi] 2628 mov dword ptr [r8 + 4*rsi], eax 2629 add rsi, 1 2630 add rdi, -1 2631 jne .LBB0_479 
2632 .LBB0_480: 2633 cmp r9, 3 2634 jb .LBB0_1013 2635 .LBB0_481: # =>This Inner Loop Header: Depth=1 2636 mov eax, dword ptr [rdx + 4*rsi] 2637 sub eax, dword ptr [rcx + 4*rsi] 2638 mov dword ptr [r8 + 4*rsi], eax 2639 mov eax, dword ptr [rdx + 4*rsi + 4] 2640 sub eax, dword ptr [rcx + 4*rsi + 4] 2641 mov dword ptr [r8 + 4*rsi + 4], eax 2642 mov eax, dword ptr [rdx + 4*rsi + 8] 2643 sub eax, dword ptr [rcx + 4*rsi + 8] 2644 mov dword ptr [r8 + 4*rsi + 8], eax 2645 mov eax, dword ptr [rdx + 4*rsi + 12] 2646 sub eax, dword ptr [rcx + 4*rsi + 12] 2647 mov dword ptr [r8 + 4*rsi + 12], eax 2648 add rsi, 4 2649 cmp r10, rsi 2650 jne .LBB0_481 2651 jmp .LBB0_1013 2652 .LBB0_293: 2653 test r9d, r9d 2654 jle .LBB0_1013 2655 # %bb.294: 2656 mov r10d, r9d 2657 cmp r9d, 8 2658 jae .LBB0_296 2659 # %bb.295: 2660 xor esi, esi 2661 .LBB0_305: 2662 mov r9, rsi 2663 not r9 2664 add r9, r10 2665 mov rdi, r10 2666 and rdi, 3 2667 je .LBB0_307 2668 .LBB0_306: # =>This Inner Loop Header: Depth=1 2669 mov eax, dword ptr [rcx + 4*rsi] 2670 add eax, dword ptr [rdx + 4*rsi] 2671 mov dword ptr [r8 + 4*rsi], eax 2672 add rsi, 1 2673 add rdi, -1 2674 jne .LBB0_306 2675 .LBB0_307: 2676 cmp r9, 3 2677 jb .LBB0_1013 2678 .LBB0_308: # =>This Inner Loop Header: Depth=1 2679 mov eax, dword ptr [rcx + 4*rsi] 2680 add eax, dword ptr [rdx + 4*rsi] 2681 mov dword ptr [r8 + 4*rsi], eax 2682 mov eax, dword ptr [rcx + 4*rsi + 4] 2683 add eax, dword ptr [rdx + 4*rsi + 4] 2684 mov dword ptr [r8 + 4*rsi + 4], eax 2685 mov eax, dword ptr [rcx + 4*rsi + 8] 2686 add eax, dword ptr [rdx + 4*rsi + 8] 2687 mov dword ptr [r8 + 4*rsi + 8], eax 2688 mov eax, dword ptr [rcx + 4*rsi + 12] 2689 add eax, dword ptr [rdx + 4*rsi + 12] 2690 mov dword ptr [r8 + 4*rsi + 12], eax 2691 add rsi, 4 2692 cmp r10, rsi 2693 jne .LBB0_308 2694 jmp .LBB0_1013 2695 .LBB0_639: 2696 test r9d, r9d 2697 jle .LBB0_1013 2698 # %bb.640: 2699 mov r10d, r9d 2700 cmp r9d, 8 2701 jae .LBB0_642 2702 # %bb.641: 2703 xor esi, esi 2704 .LBB0_651: 
2705 mov r9, rsi 2706 not r9 2707 add r9, r10 2708 mov rdi, r10 2709 and rdi, 3 2710 je .LBB0_653 2711 .LBB0_652: # =>This Inner Loop Header: Depth=1 2712 mov eax, dword ptr [rdx + 4*rsi] 2713 sub eax, dword ptr [rcx + 4*rsi] 2714 mov dword ptr [r8 + 4*rsi], eax 2715 add rsi, 1 2716 add rdi, -1 2717 jne .LBB0_652 2718 .LBB0_653: 2719 cmp r9, 3 2720 jb .LBB0_1013 2721 .LBB0_654: # =>This Inner Loop Header: Depth=1 2722 mov eax, dword ptr [rdx + 4*rsi] 2723 sub eax, dword ptr [rcx + 4*rsi] 2724 mov dword ptr [r8 + 4*rsi], eax 2725 mov eax, dword ptr [rdx + 4*rsi + 4] 2726 sub eax, dword ptr [rcx + 4*rsi + 4] 2727 mov dword ptr [r8 + 4*rsi + 4], eax 2728 mov eax, dword ptr [rdx + 4*rsi + 8] 2729 sub eax, dword ptr [rcx + 4*rsi + 8] 2730 mov dword ptr [r8 + 4*rsi + 8], eax 2731 mov eax, dword ptr [rdx + 4*rsi + 12] 2732 sub eax, dword ptr [rcx + 4*rsi + 12] 2733 mov dword ptr [r8 + 4*rsi + 12], eax 2734 add rsi, 4 2735 cmp r10, rsi 2736 jne .LBB0_654 2737 jmp .LBB0_1013 2738 .LBB0_792: 2739 lea rsi, [r8 + 4*r10] 2740 lea rax, [rdx + 4*r10] 2741 cmp rax, r8 2742 seta r9b 2743 lea rax, [rcx + 4*r10] 2744 cmp rsi, rdx 2745 seta r11b 2746 cmp rax, r8 2747 seta al 2748 cmp rsi, rcx 2749 seta dil 2750 xor esi, esi 2751 test r9b, r11b 2752 jne .LBB0_801 2753 # %bb.793: 2754 and al, dil 2755 jne .LBB0_801 2756 # %bb.794: 2757 mov esi, r10d 2758 and esi, -8 2759 lea rax, [rsi - 8] 2760 mov r9, rax 2761 shr r9, 3 2762 add r9, 1 2763 test rax, rax 2764 je .LBB0_795 2765 # %bb.796: 2766 mov rax, r9 2767 and rax, -2 2768 neg rax 2769 xor edi, edi 2770 .LBB0_797: # =>This Inner Loop Header: Depth=1 2771 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 2772 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 2773 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 2774 pmulld xmm2, xmm0 2775 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 2776 pmulld xmm0, xmm1 2777 movdqu xmmword ptr [r8 + 4*rdi], xmm2 2778 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 2779 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 2780 movdqu xmm1, 
xmmword ptr [rdx + 4*rdi + 48] 2781 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32] 2782 pmulld xmm2, xmm0 2783 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 48] 2784 pmulld xmm0, xmm1 2785 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm2 2786 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm0 2787 add rdi, 16 2788 add rax, 2 2789 jne .LBB0_797 2790 jmp .LBB0_798 2791 .LBB0_942: 2792 lea rsi, [r8 + 4*r10] 2793 lea rax, [rdx + 4*r10] 2794 cmp rax, r8 2795 seta r9b 2796 lea rax, [rcx + 4*r10] 2797 cmp rsi, rdx 2798 seta r11b 2799 cmp rax, r8 2800 seta al 2801 cmp rsi, rcx 2802 seta dil 2803 xor esi, esi 2804 test r9b, r11b 2805 jne .LBB0_951 2806 # %bb.943: 2807 and al, dil 2808 jne .LBB0_951 2809 # %bb.944: 2810 mov esi, r10d 2811 and esi, -8 2812 lea rax, [rsi - 8] 2813 mov r9, rax 2814 shr r9, 3 2815 add r9, 1 2816 test rax, rax 2817 je .LBB0_945 2818 # %bb.946: 2819 mov rax, r9 2820 and rax, -2 2821 neg rax 2822 xor edi, edi 2823 .LBB0_947: # =>This Inner Loop Header: Depth=1 2824 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 2825 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 2826 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 2827 pmulld xmm2, xmm0 2828 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 2829 pmulld xmm0, xmm1 2830 movdqu xmmword ptr [r8 + 4*rdi], xmm2 2831 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 2832 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 2833 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 2834 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32] 2835 pmulld xmm2, xmm0 2836 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 48] 2837 pmulld xmm0, xmm1 2838 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm2 2839 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm0 2840 add rdi, 16 2841 add rax, 2 2842 jne .LBB0_947 2843 jmp .LBB0_948 2844 .LBB0_107: 2845 lea rsi, [r8 + 4*r10] 2846 lea rax, [rdx + 4*r10] 2847 cmp rax, r8 2848 seta r9b 2849 lea rax, [rcx + 4*r10] 2850 cmp rsi, rdx 2851 seta r11b 2852 cmp rax, r8 2853 seta al 2854 cmp rsi, rcx 2855 seta dil 2856 xor esi, esi 2857 test r9b, r11b 2858 jne .LBB0_116 2859 # %bb.108: 
2860 and al, dil 2861 jne .LBB0_116 2862 # %bb.109: 2863 mov esi, r10d 2864 and esi, -8 2865 lea rax, [rsi - 8] 2866 mov r9, rax 2867 shr r9, 3 2868 add r9, 1 2869 test rax, rax 2870 je .LBB0_110 2871 # %bb.111: 2872 mov rax, r9 2873 and rax, -2 2874 neg rax 2875 xor edi, edi 2876 .LBB0_112: # =>This Inner Loop Header: Depth=1 2877 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 2878 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 2879 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 2880 paddd xmm2, xmm0 2881 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 2882 paddd xmm0, xmm1 2883 movdqu xmmword ptr [r8 + 4*rdi], xmm2 2884 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 2885 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 2886 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 2887 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32] 2888 paddd xmm2, xmm0 2889 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 48] 2890 paddd xmm0, xmm1 2891 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm2 2892 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm0 2893 add rdi, 16 2894 add rax, 2 2895 jne .LBB0_112 2896 jmp .LBB0_113 2897 .LBB0_453: 2898 lea rsi, [r8 + 4*r10] 2899 lea rax, [rdx + 4*r10] 2900 cmp rax, r8 2901 seta r9b 2902 lea rax, [rcx + 4*r10] 2903 cmp rsi, rdx 2904 seta r11b 2905 cmp rax, r8 2906 seta al 2907 cmp rsi, rcx 2908 seta dil 2909 xor esi, esi 2910 test r9b, r11b 2911 jne .LBB0_462 2912 # %bb.454: 2913 and al, dil 2914 jne .LBB0_462 2915 # %bb.455: 2916 mov esi, r10d 2917 and esi, -8 2918 lea rax, [rsi - 8] 2919 mov r9, rax 2920 shr r9, 3 2921 add r9, 1 2922 test rax, rax 2923 je .LBB0_456 2924 # %bb.457: 2925 mov rax, r9 2926 and rax, -2 2927 neg rax 2928 xor edi, edi 2929 .LBB0_458: # =>This Inner Loop Header: Depth=1 2930 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 2931 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 2932 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 2933 psubd xmm0, xmm2 2934 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 2935 psubd xmm1, xmm2 2936 movdqu xmmword ptr [r8 + 4*rdi], xmm0 2937 movdqu xmmword ptr [r8 + 4*rdi + 16], 
xmm1 2938 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 2939 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 2940 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32] 2941 psubd xmm0, xmm2 2942 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 2943 psubd xmm1, xmm2 2944 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm0 2945 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1 2946 add rdi, 16 2947 add rax, 2 2948 jne .LBB0_458 2949 jmp .LBB0_459 2950 .LBB0_280: 2951 lea rsi, [r8 + 4*r10] 2952 lea rax, [rdx + 4*r10] 2953 cmp rax, r8 2954 seta r9b 2955 lea rax, [rcx + 4*r10] 2956 cmp rsi, rdx 2957 seta r11b 2958 cmp rax, r8 2959 seta al 2960 cmp rsi, rcx 2961 seta dil 2962 xor esi, esi 2963 test r9b, r11b 2964 jne .LBB0_289 2965 # %bb.281: 2966 and al, dil 2967 jne .LBB0_289 2968 # %bb.282: 2969 mov esi, r10d 2970 and esi, -8 2971 lea rax, [rsi - 8] 2972 mov r9, rax 2973 shr r9, 3 2974 add r9, 1 2975 test rax, rax 2976 je .LBB0_283 2977 # %bb.284: 2978 mov rax, r9 2979 and rax, -2 2980 neg rax 2981 xor edi, edi 2982 .LBB0_285: # =>This Inner Loop Header: Depth=1 2983 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 2984 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 2985 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 2986 paddd xmm2, xmm0 2987 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 2988 paddd xmm0, xmm1 2989 movdqu xmmword ptr [r8 + 4*rdi], xmm2 2990 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 2991 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 2992 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 2993 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32] 2994 paddd xmm2, xmm0 2995 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 48] 2996 paddd xmm0, xmm1 2997 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm2 2998 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm0 2999 add rdi, 16 3000 add rax, 2 3001 jne .LBB0_285 3002 jmp .LBB0_286 3003 .LBB0_626: 3004 lea rsi, [r8 + 4*r10] 3005 lea rax, [rdx + 4*r10] 3006 cmp rax, r8 3007 seta r9b 3008 lea rax, [rcx + 4*r10] 3009 cmp rsi, rdx 3010 seta r11b 3011 cmp rax, r8 3012 seta al 3013 cmp rsi, rcx 3014 seta dil 3015 
xor esi, esi 3016 test r9b, r11b 3017 jne .LBB0_635 3018 # %bb.627: 3019 and al, dil 3020 jne .LBB0_635 3021 # %bb.628: 3022 mov esi, r10d 3023 and esi, -8 3024 lea rax, [rsi - 8] 3025 mov r9, rax 3026 shr r9, 3 3027 add r9, 1 3028 test rax, rax 3029 je .LBB0_629 3030 # %bb.630: 3031 mov rax, r9 3032 and rax, -2 3033 neg rax 3034 xor edi, edi 3035 .LBB0_631: # =>This Inner Loop Header: Depth=1 3036 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 3037 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 3038 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 3039 psubd xmm0, xmm2 3040 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 3041 psubd xmm1, xmm2 3042 movdqu xmmword ptr [r8 + 4*rdi], xmm0 3043 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1 3044 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 3045 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 3046 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32] 3047 psubd xmm0, xmm2 3048 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 3049 psubd xmm1, xmm2 3050 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm0 3051 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1 3052 add rdi, 16 3053 add rax, 2 3054 jne .LBB0_631 3055 jmp .LBB0_632 3056 .LBB0_850: 3057 lea rsi, [r8 + 8*r10] 3058 lea rax, [rdx + 8*r10] 3059 cmp rax, r8 3060 seta r9b 3061 lea rax, [rcx + 8*r10] 3062 cmp rsi, rdx 3063 seta r11b 3064 cmp rax, r8 3065 seta al 3066 cmp rsi, rcx 3067 seta dil 3068 xor esi, esi 3069 test r9b, r11b 3070 jne .LBB0_859 3071 # %bb.851: 3072 and al, dil 3073 jne .LBB0_859 3074 # %bb.852: 3075 mov esi, r10d 3076 and esi, -4 3077 lea rax, [rsi - 4] 3078 mov r9, rax 3079 shr r9, 2 3080 add r9, 1 3081 test rax, rax 3082 je .LBB0_853 3083 # %bb.854: 3084 mov rax, r9 3085 and rax, -2 3086 neg rax 3087 xor edi, edi 3088 .LBB0_855: # =>This Inner Loop Header: Depth=1 3089 movupd xmm0, xmmword ptr [rdx + 8*rdi] 3090 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 3091 movupd xmm2, xmmword ptr [rcx + 8*rdi] 3092 mulpd xmm2, xmm0 3093 movupd xmm0, xmmword ptr [rcx + 8*rdi + 16] 3094 mulpd xmm0, xmm1 3095 movupd 
xmmword ptr [r8 + 8*rdi], xmm2 3096 movupd xmmword ptr [r8 + 8*rdi + 16], xmm0 3097 movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] 3098 movupd xmm1, xmmword ptr [rdx + 8*rdi + 48] 3099 movupd xmm2, xmmword ptr [rcx + 8*rdi + 32] 3100 mulpd xmm2, xmm0 3101 movupd xmm0, xmmword ptr [rcx + 8*rdi + 48] 3102 mulpd xmm0, xmm1 3103 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 3104 movupd xmmword ptr [r8 + 8*rdi + 48], xmm0 3105 add rdi, 8 3106 add rax, 2 3107 jne .LBB0_855 3108 jmp .LBB0_856 3109 .LBB0_1000: 3110 lea rsi, [r8 + 8*r10] 3111 lea rax, [rdx + 8*r10] 3112 cmp rax, r8 3113 seta r9b 3114 lea rax, [rcx + 8*r10] 3115 cmp rsi, rdx 3116 seta r11b 3117 cmp rax, r8 3118 seta al 3119 cmp rsi, rcx 3120 seta dil 3121 xor esi, esi 3122 test r9b, r11b 3123 jne .LBB0_1009 3124 # %bb.1001: 3125 and al, dil 3126 jne .LBB0_1009 3127 # %bb.1002: 3128 mov esi, r10d 3129 and esi, -4 3130 lea rax, [rsi - 4] 3131 mov r9, rax 3132 shr r9, 2 3133 add r9, 1 3134 test rax, rax 3135 je .LBB0_1003 3136 # %bb.1004: 3137 mov rax, r9 3138 and rax, -2 3139 neg rax 3140 xor edi, edi 3141 .LBB0_1005: # =>This Inner Loop Header: Depth=1 3142 movupd xmm0, xmmword ptr [rdx + 8*rdi] 3143 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 3144 movupd xmm2, xmmword ptr [rcx + 8*rdi] 3145 mulpd xmm2, xmm0 3146 movupd xmm0, xmmword ptr [rcx + 8*rdi + 16] 3147 mulpd xmm0, xmm1 3148 movupd xmmword ptr [r8 + 8*rdi], xmm2 3149 movupd xmmword ptr [r8 + 8*rdi + 16], xmm0 3150 movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] 3151 movupd xmm1, xmmword ptr [rdx + 8*rdi + 48] 3152 movupd xmm2, xmmword ptr [rcx + 8*rdi + 32] 3153 mulpd xmm2, xmm0 3154 movupd xmm0, xmmword ptr [rcx + 8*rdi + 48] 3155 mulpd xmm0, xmm1 3156 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 3157 movupd xmmword ptr [r8 + 8*rdi + 48], xmm0 3158 add rdi, 8 3159 add rax, 2 3160 jne .LBB0_1005 3161 jmp .LBB0_1006 3162 .LBB0_181: 3163 lea rsi, [r8 + 8*r10] 3164 lea rax, [rdx + 8*r10] 3165 cmp rax, r8 3166 seta r9b 3167 lea rax, [rcx + 8*r10] 3168 cmp rsi, rdx 
3169 seta r11b 3170 cmp rax, r8 3171 seta al 3172 cmp rsi, rcx 3173 seta dil 3174 xor esi, esi 3175 test r9b, r11b 3176 jne .LBB0_190 3177 # %bb.182: 3178 and al, dil 3179 jne .LBB0_190 3180 # %bb.183: 3181 mov esi, r10d 3182 and esi, -4 3183 lea rax, [rsi - 4] 3184 mov r9, rax 3185 shr r9, 2 3186 add r9, 1 3187 test rax, rax 3188 je .LBB0_184 3189 # %bb.185: 3190 mov rax, r9 3191 and rax, -2 3192 neg rax 3193 xor edi, edi 3194 .LBB0_186: # =>This Inner Loop Header: Depth=1 3195 movupd xmm0, xmmword ptr [rdx + 8*rdi] 3196 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 3197 movupd xmm2, xmmword ptr [rcx + 8*rdi] 3198 addpd xmm2, xmm0 3199 movupd xmm0, xmmword ptr [rcx + 8*rdi + 16] 3200 addpd xmm0, xmm1 3201 movupd xmmword ptr [r8 + 8*rdi], xmm2 3202 movupd xmmword ptr [r8 + 8*rdi + 16], xmm0 3203 movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] 3204 movupd xmm1, xmmword ptr [rdx + 8*rdi + 48] 3205 movupd xmm2, xmmword ptr [rcx + 8*rdi + 32] 3206 addpd xmm2, xmm0 3207 movupd xmm0, xmmword ptr [rcx + 8*rdi + 48] 3208 addpd xmm0, xmm1 3209 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 3210 movupd xmmword ptr [r8 + 8*rdi + 48], xmm0 3211 add rdi, 8 3212 add rax, 2 3213 jne .LBB0_186 3214 jmp .LBB0_187 3215 .LBB0_527: 3216 lea rsi, [r8 + 8*r10] 3217 lea rax, [rdx + 8*r10] 3218 cmp rax, r8 3219 seta r9b 3220 lea rax, [rcx + 8*r10] 3221 cmp rsi, rdx 3222 seta r11b 3223 cmp rax, r8 3224 seta al 3225 cmp rsi, rcx 3226 seta dil 3227 xor esi, esi 3228 test r9b, r11b 3229 jne .LBB0_536 3230 # %bb.528: 3231 and al, dil 3232 jne .LBB0_536 3233 # %bb.529: 3234 mov esi, r10d 3235 and esi, -4 3236 lea rax, [rsi - 4] 3237 mov r9, rax 3238 shr r9, 2 3239 add r9, 1 3240 test rax, rax 3241 je .LBB0_530 3242 # %bb.531: 3243 mov rax, r9 3244 and rax, -2 3245 neg rax 3246 xor edi, edi 3247 .LBB0_532: # =>This Inner Loop Header: Depth=1 3248 movupd xmm0, xmmword ptr [rdx + 8*rdi] 3249 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 3250 movupd xmm2, xmmword ptr [rcx + 8*rdi] 3251 subpd xmm0, xmm2 3252 
movupd xmm2, xmmword ptr [rcx + 8*rdi + 16] 3253 subpd xmm1, xmm2 3254 movupd xmmword ptr [r8 + 8*rdi], xmm0 3255 movupd xmmword ptr [r8 + 8*rdi + 16], xmm1 3256 movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] 3257 movupd xmm1, xmmword ptr [rdx + 8*rdi + 48] 3258 movupd xmm2, xmmword ptr [rcx + 8*rdi + 32] 3259 subpd xmm0, xmm2 3260 movupd xmm2, xmmword ptr [rcx + 8*rdi + 48] 3261 subpd xmm1, xmm2 3262 movupd xmmword ptr [r8 + 8*rdi + 32], xmm0 3263 movupd xmmword ptr [r8 + 8*rdi + 48], xmm1 3264 add rdi, 8 3265 add rax, 2 3266 jne .LBB0_532 3267 jmp .LBB0_533 3268 .LBB0_354: 3269 lea rsi, [r8 + 8*r10] 3270 lea rax, [rdx + 8*r10] 3271 cmp rax, r8 3272 seta r9b 3273 lea rax, [rcx + 8*r10] 3274 cmp rsi, rdx 3275 seta r11b 3276 cmp rax, r8 3277 seta al 3278 cmp rsi, rcx 3279 seta dil 3280 xor esi, esi 3281 test r9b, r11b 3282 jne .LBB0_363 3283 # %bb.355: 3284 and al, dil 3285 jne .LBB0_363 3286 # %bb.356: 3287 mov esi, r10d 3288 and esi, -4 3289 lea rax, [rsi - 4] 3290 mov r9, rax 3291 shr r9, 2 3292 add r9, 1 3293 test rax, rax 3294 je .LBB0_357 3295 # %bb.358: 3296 mov rax, r9 3297 and rax, -2 3298 neg rax 3299 xor edi, edi 3300 .LBB0_359: # =>This Inner Loop Header: Depth=1 3301 movupd xmm0, xmmword ptr [rdx + 8*rdi] 3302 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 3303 movupd xmm2, xmmword ptr [rcx + 8*rdi] 3304 addpd xmm2, xmm0 3305 movupd xmm0, xmmword ptr [rcx + 8*rdi + 16] 3306 addpd xmm0, xmm1 3307 movupd xmmword ptr [r8 + 8*rdi], xmm2 3308 movupd xmmword ptr [r8 + 8*rdi + 16], xmm0 3309 movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] 3310 movupd xmm1, xmmword ptr [rdx + 8*rdi + 48] 3311 movupd xmm2, xmmword ptr [rcx + 8*rdi + 32] 3312 addpd xmm2, xmm0 3313 movupd xmm0, xmmword ptr [rcx + 8*rdi + 48] 3314 addpd xmm0, xmm1 3315 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 3316 movupd xmmword ptr [r8 + 8*rdi + 48], xmm0 3317 add rdi, 8 3318 add rax, 2 3319 jne .LBB0_359 3320 jmp .LBB0_360 3321 .LBB0_700: 3322 lea rsi, [r8 + 8*r10] 3323 lea rax, [rdx + 8*r10] 3324 cmp 
rax, r8 3325 seta r9b 3326 lea rax, [rcx + 8*r10] 3327 cmp rsi, rdx 3328 seta r11b 3329 cmp rax, r8 3330 seta al 3331 cmp rsi, rcx 3332 seta dil 3333 xor esi, esi 3334 test r9b, r11b 3335 jne .LBB0_709 3336 # %bb.701: 3337 and al, dil 3338 jne .LBB0_709 3339 # %bb.702: 3340 mov esi, r10d 3341 and esi, -4 3342 lea rax, [rsi - 4] 3343 mov r9, rax 3344 shr r9, 2 3345 add r9, 1 3346 test rax, rax 3347 je .LBB0_703 3348 # %bb.704: 3349 mov rax, r9 3350 and rax, -2 3351 neg rax 3352 xor edi, edi 3353 .LBB0_705: # =>This Inner Loop Header: Depth=1 3354 movupd xmm0, xmmword ptr [rdx + 8*rdi] 3355 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 3356 movupd xmm2, xmmword ptr [rcx + 8*rdi] 3357 subpd xmm0, xmm2 3358 movupd xmm2, xmmword ptr [rcx + 8*rdi + 16] 3359 subpd xmm1, xmm2 3360 movupd xmmword ptr [r8 + 8*rdi], xmm0 3361 movupd xmmword ptr [r8 + 8*rdi + 16], xmm1 3362 movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] 3363 movupd xmm1, xmmword ptr [rdx + 8*rdi + 48] 3364 movupd xmm2, xmmword ptr [rcx + 8*rdi + 32] 3365 subpd xmm0, xmm2 3366 movupd xmm2, xmmword ptr [rcx + 8*rdi + 48] 3367 subpd xmm1, xmm2 3368 movupd xmmword ptr [r8 + 8*rdi + 32], xmm0 3369 movupd xmmword ptr [r8 + 8*rdi + 48], xmm1 3370 add rdi, 8 3371 add rax, 2 3372 jne .LBB0_705 3373 jmp .LBB0_706 3374 .LBB0_747: 3375 lea rsi, [r8 + r10] 3376 lea rax, [rdx + r10] 3377 cmp rax, r8 3378 seta r9b 3379 lea rax, [rcx + r10] 3380 cmp rsi, rdx 3381 seta r11b 3382 cmp rax, r8 3383 seta al 3384 cmp rsi, rcx 3385 seta sil 3386 xor edi, edi 3387 test r9b, r11b 3388 jne .LBB0_756 3389 # %bb.748: 3390 and al, sil 3391 jne .LBB0_756 3392 # %bb.749: 3393 mov edi, r10d 3394 and edi, -32 3395 lea rax, [rdi - 32] 3396 mov r9, rax 3397 shr r9, 5 3398 add r9, 1 3399 test rax, rax 3400 je .LBB0_750 3401 # %bb.751: 3402 mov rsi, r9 3403 and rsi, -2 3404 neg rsi 3405 xor eax, eax 3406 movdqa xmm0, xmmword ptr [rip + .LCPI0_0] # xmm0 = [255,255,255,255,255,255,255,255] 3407 .LBB0_752: # =>This Inner Loop Header: Depth=1 3408 movdqu 
xmm1, xmmword ptr [rdx + rax] 3409 movdqu xmm2, xmmword ptr [rdx + rax + 16] 3410 movdqu xmm3, xmmword ptr [rcx + rax] 3411 movdqu xmm4, xmmword ptr [rcx + rax + 16] 3412 pmovzxbw xmm5, xmm1 # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 3413 punpckhbw xmm1, xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3414 pmovzxbw xmm6, xmm3 # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 3415 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3416 pmullw xmm3, xmm1 3417 pand xmm3, xmm0 3418 pmullw xmm6, xmm5 3419 pand xmm6, xmm0 3420 packuswb xmm6, xmm3 3421 pmovzxbw xmm1, xmm2 # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 3422 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3423 pmovzxbw xmm3, xmm4 # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 3424 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3425 pmullw xmm4, xmm2 3426 pand xmm4, xmm0 3427 pmullw xmm3, xmm1 3428 pand xmm3, xmm0 3429 packuswb xmm3, xmm4 3430 movdqu xmmword ptr [r8 + rax], xmm6 3431 movdqu xmmword ptr [r8 + rax + 16], xmm3 3432 movdqu xmm1, xmmword ptr [rdx + rax + 32] 3433 movdqu xmm2, xmmword ptr [rdx + rax + 48] 3434 movdqu xmm3, xmmword ptr [rcx + rax + 32] 3435 movdqu xmm4, xmmword ptr [rcx + rax + 48] 3436 pmovzxbw xmm5, xmm1 # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 3437 punpckhbw xmm1, xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3438 pmovzxbw xmm6, xmm3 # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 3439 punpckhbw xmm3, xmm3 # xmm3 = 
xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3440 pmullw xmm3, xmm1 3441 pand xmm3, xmm0 3442 pmullw xmm6, xmm5 3443 pand xmm6, xmm0 3444 packuswb xmm6, xmm3 3445 pmovzxbw xmm1, xmm2 # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 3446 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3447 pmovzxbw xmm3, xmm4 # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 3448 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3449 pmullw xmm4, xmm2 3450 pand xmm4, xmm0 3451 pmullw xmm3, xmm1 3452 pand xmm3, xmm0 3453 packuswb xmm3, xmm4 3454 movdqu xmmword ptr [r8 + rax + 32], xmm6 3455 movdqu xmmword ptr [r8 + rax + 48], xmm3 3456 add rax, 64 3457 add rsi, 2 3458 jne .LBB0_752 3459 jmp .LBB0_753 3460 .LBB0_897: 3461 lea rsi, [r8 + r10] 3462 lea rax, [rdx + r10] 3463 cmp rax, r8 3464 seta r9b 3465 lea rax, [rcx + r10] 3466 cmp rsi, rdx 3467 seta r11b 3468 cmp rax, r8 3469 seta al 3470 cmp rsi, rcx 3471 seta sil 3472 xor edi, edi 3473 test r9b, r11b 3474 jne .LBB0_906 3475 # %bb.898: 3476 and al, sil 3477 jne .LBB0_906 3478 # %bb.899: 3479 mov edi, r10d 3480 and edi, -32 3481 lea rax, [rdi - 32] 3482 mov r9, rax 3483 shr r9, 5 3484 add r9, 1 3485 test rax, rax 3486 je .LBB0_900 3487 # %bb.901: 3488 mov rsi, r9 3489 and rsi, -2 3490 neg rsi 3491 xor eax, eax 3492 movdqa xmm0, xmmword ptr [rip + .LCPI0_0] # xmm0 = [255,255,255,255,255,255,255,255] 3493 .LBB0_902: # =>This Inner Loop Header: Depth=1 3494 movdqu xmm1, xmmword ptr [rdx + rax] 3495 movdqu xmm2, xmmword ptr [rdx + rax + 16] 3496 movdqu xmm3, xmmword ptr [rcx + rax] 3497 movdqu xmm4, xmmword ptr [rcx + rax + 16] 3498 pmovzxbw xmm5, xmm1 # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 3499 punpckhbw xmm1, xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 
3500 pmovzxbw xmm6, xmm3 # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 3501 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3502 pmullw xmm3, xmm1 3503 pand xmm3, xmm0 3504 pmullw xmm6, xmm5 3505 pand xmm6, xmm0 3506 packuswb xmm6, xmm3 3507 pmovzxbw xmm1, xmm2 # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 3508 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3509 pmovzxbw xmm3, xmm4 # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 3510 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3511 pmullw xmm4, xmm2 3512 pand xmm4, xmm0 3513 pmullw xmm3, xmm1 3514 pand xmm3, xmm0 3515 packuswb xmm3, xmm4 3516 movdqu xmmword ptr [r8 + rax], xmm6 3517 movdqu xmmword ptr [r8 + rax + 16], xmm3 3518 movdqu xmm1, xmmword ptr [rdx + rax + 32] 3519 movdqu xmm2, xmmword ptr [rdx + rax + 48] 3520 movdqu xmm3, xmmword ptr [rcx + rax + 32] 3521 movdqu xmm4, xmmword ptr [rcx + rax + 48] 3522 pmovzxbw xmm5, xmm1 # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 3523 punpckhbw xmm1, xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3524 pmovzxbw xmm6, xmm3 # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 3525 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3526 pmullw xmm3, xmm1 3527 pand xmm3, xmm0 3528 pmullw xmm6, xmm5 3529 pand xmm6, xmm0 3530 packuswb xmm6, xmm3 3531 pmovzxbw xmm1, xmm2 # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 3532 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3533 pmovzxbw xmm3, xmm4 # xmm3 = 
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 3534 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3535 pmullw xmm4, xmm2 3536 pand xmm4, xmm0 3537 pmullw xmm3, xmm1 3538 pand xmm3, xmm0 3539 packuswb xmm3, xmm4 3540 movdqu xmmword ptr [r8 + rax + 32], xmm6 3541 movdqu xmmword ptr [r8 + rax + 48], xmm3 3542 add rax, 64 3543 add rsi, 2 3544 jne .LBB0_902 3545 jmp .LBB0_903 3546 .LBB0_62: 3547 lea rsi, [r8 + r10] 3548 lea rax, [rdx + r10] 3549 cmp rax, r8 3550 seta r9b 3551 lea rax, [rcx + r10] 3552 cmp rsi, rdx 3553 seta r11b 3554 cmp rax, r8 3555 seta al 3556 cmp rsi, rcx 3557 seta dil 3558 xor esi, esi 3559 test r9b, r11b 3560 jne .LBB0_71 3561 # %bb.63: 3562 and al, dil 3563 jne .LBB0_71 3564 # %bb.64: 3565 mov esi, r10d 3566 and esi, -32 3567 lea rax, [rsi - 32] 3568 mov r9, rax 3569 shr r9, 5 3570 add r9, 1 3571 test rax, rax 3572 je .LBB0_65 3573 # %bb.66: 3574 mov rax, r9 3575 and rax, -2 3576 neg rax 3577 xor edi, edi 3578 .LBB0_67: # =>This Inner Loop Header: Depth=1 3579 movdqu xmm0, xmmword ptr [rdx + rdi] 3580 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 3581 movdqu xmm2, xmmword ptr [rcx + rdi] 3582 paddb xmm2, xmm0 3583 movdqu xmm0, xmmword ptr [rcx + rdi + 16] 3584 paddb xmm0, xmm1 3585 movdqu xmmword ptr [r8 + rdi], xmm2 3586 movdqu xmmword ptr [r8 + rdi + 16], xmm0 3587 movdqu xmm0, xmmword ptr [rdx + rdi + 32] 3588 movdqu xmm1, xmmword ptr [rdx + rdi + 48] 3589 movdqu xmm2, xmmword ptr [rcx + rdi + 32] 3590 paddb xmm2, xmm0 3591 movdqu xmm0, xmmword ptr [rcx + rdi + 48] 3592 paddb xmm0, xmm1 3593 movdqu xmmword ptr [r8 + rdi + 32], xmm2 3594 movdqu xmmword ptr [r8 + rdi + 48], xmm0 3595 add rdi, 64 3596 add rax, 2 3597 jne .LBB0_67 3598 jmp .LBB0_68 3599 .LBB0_408: 3600 lea rsi, [r8 + r10] 3601 lea rax, [rdx + r10] 3602 cmp rax, r8 3603 seta r9b 3604 lea rax, [rcx + r10] 3605 cmp rsi, rdx 3606 seta r11b 3607 cmp rax, r8 3608 seta al 3609 cmp rsi, rcx 3610 seta 
dil 3611 xor esi, esi 3612 test r9b, r11b 3613 jne .LBB0_417 3614 # %bb.409: 3615 and al, dil 3616 jne .LBB0_417 3617 # %bb.410: 3618 mov esi, r10d 3619 and esi, -32 3620 lea rax, [rsi - 32] 3621 mov r9, rax 3622 shr r9, 5 3623 add r9, 1 3624 test rax, rax 3625 je .LBB0_411 3626 # %bb.412: 3627 mov rax, r9 3628 and rax, -2 3629 neg rax 3630 xor edi, edi 3631 .LBB0_413: # =>This Inner Loop Header: Depth=1 3632 movdqu xmm0, xmmword ptr [rdx + rdi] 3633 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 3634 movdqu xmm2, xmmword ptr [rcx + rdi] 3635 psubb xmm0, xmm2 3636 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 3637 psubb xmm1, xmm2 3638 movdqu xmmword ptr [r8 + rdi], xmm0 3639 movdqu xmmword ptr [r8 + rdi + 16], xmm1 3640 movdqu xmm0, xmmword ptr [rdx + rdi + 32] 3641 movdqu xmm1, xmmword ptr [rdx + rdi + 48] 3642 movdqu xmm2, xmmword ptr [rcx + rdi + 32] 3643 psubb xmm0, xmm2 3644 movdqu xmm2, xmmword ptr [rcx + rdi + 48] 3645 psubb xmm1, xmm2 3646 movdqu xmmword ptr [r8 + rdi + 32], xmm0 3647 movdqu xmmword ptr [r8 + rdi + 48], xmm1 3648 add rdi, 64 3649 add rax, 2 3650 jne .LBB0_413 3651 jmp .LBB0_414 3652 .LBB0_235: 3653 lea rsi, [r8 + r10] 3654 lea rax, [rdx + r10] 3655 cmp rax, r8 3656 seta r9b 3657 lea rax, [rcx + r10] 3658 cmp rsi, rdx 3659 seta r11b 3660 cmp rax, r8 3661 seta al 3662 cmp rsi, rcx 3663 seta dil 3664 xor esi, esi 3665 test r9b, r11b 3666 jne .LBB0_244 3667 # %bb.236: 3668 and al, dil 3669 jne .LBB0_244 3670 # %bb.237: 3671 mov esi, r10d 3672 and esi, -32 3673 lea rax, [rsi - 32] 3674 mov r9, rax 3675 shr r9, 5 3676 add r9, 1 3677 test rax, rax 3678 je .LBB0_238 3679 # %bb.239: 3680 mov rax, r9 3681 and rax, -2 3682 neg rax 3683 xor edi, edi 3684 .LBB0_240: # =>This Inner Loop Header: Depth=1 3685 movdqu xmm0, xmmword ptr [rdx + rdi] 3686 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 3687 movdqu xmm2, xmmword ptr [rcx + rdi] 3688 paddb xmm2, xmm0 3689 movdqu xmm0, xmmword ptr [rcx + rdi + 16] 3690 paddb xmm0, xmm1 3691 movdqu xmmword ptr [r8 + rdi], xmm2 
3692 movdqu xmmword ptr [r8 + rdi + 16], xmm0 3693 movdqu xmm0, xmmword ptr [rdx + rdi + 32] 3694 movdqu xmm1, xmmword ptr [rdx + rdi + 48] 3695 movdqu xmm2, xmmword ptr [rcx + rdi + 32] 3696 paddb xmm2, xmm0 3697 movdqu xmm0, xmmword ptr [rcx + rdi + 48] 3698 paddb xmm0, xmm1 3699 movdqu xmmword ptr [r8 + rdi + 32], xmm2 3700 movdqu xmmword ptr [r8 + rdi + 48], xmm0 3701 add rdi, 64 3702 add rax, 2 3703 jne .LBB0_240 3704 jmp .LBB0_241 3705 .LBB0_581: 3706 lea rsi, [r8 + r10] 3707 lea rax, [rdx + r10] 3708 cmp rax, r8 3709 seta r9b 3710 lea rax, [rcx + r10] 3711 cmp rsi, rdx 3712 seta r11b 3713 cmp rax, r8 3714 seta al 3715 cmp rsi, rcx 3716 seta dil 3717 xor esi, esi 3718 test r9b, r11b 3719 jne .LBB0_590 3720 # %bb.582: 3721 and al, dil 3722 jne .LBB0_590 3723 # %bb.583: 3724 mov esi, r10d 3725 and esi, -32 3726 lea rax, [rsi - 32] 3727 mov r9, rax 3728 shr r9, 5 3729 add r9, 1 3730 test rax, rax 3731 je .LBB0_584 3732 # %bb.585: 3733 mov rax, r9 3734 and rax, -2 3735 neg rax 3736 xor edi, edi 3737 .LBB0_586: # =>This Inner Loop Header: Depth=1 3738 movdqu xmm0, xmmword ptr [rdx + rdi] 3739 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 3740 movdqu xmm2, xmmword ptr [rcx + rdi] 3741 psubb xmm0, xmm2 3742 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 3743 psubb xmm1, xmm2 3744 movdqu xmmword ptr [r8 + rdi], xmm0 3745 movdqu xmmword ptr [r8 + rdi + 16], xmm1 3746 movdqu xmm0, xmmword ptr [rdx + rdi + 32] 3747 movdqu xmm1, xmmword ptr [rdx + rdi + 48] 3748 movdqu xmm2, xmmword ptr [rcx + rdi + 32] 3749 psubb xmm0, xmm2 3750 movdqu xmm2, xmmword ptr [rcx + rdi + 48] 3751 psubb xmm1, xmm2 3752 movdqu xmmword ptr [r8 + rdi + 32], xmm0 3753 movdqu xmmword ptr [r8 + rdi + 48], xmm1 3754 add rdi, 64 3755 add rax, 2 3756 jne .LBB0_586 3757 jmp .LBB0_587 3758 .LBB0_821: 3759 and esi, -4 3760 xor edi, edi 3761 .LBB0_822: # =>This Inner Loop Header: Depth=1 3762 mov rax, qword ptr [rcx + 8*rdi] 3763 imul rax, qword ptr [rdx + 8*rdi] 3764 mov qword ptr [r8 + 8*rdi], rax 3765 mov rax, 
qword ptr [rcx + 8*rdi + 8] 3766 imul rax, qword ptr [rdx + 8*rdi + 8] 3767 mov qword ptr [r8 + 8*rdi + 8], rax 3768 mov rax, qword ptr [rcx + 8*rdi + 16] 3769 imul rax, qword ptr [rdx + 8*rdi + 16] 3770 mov qword ptr [r8 + 8*rdi + 16], rax 3771 mov rax, qword ptr [rcx + 8*rdi + 24] 3772 imul rax, qword ptr [rdx + 8*rdi + 24] 3773 mov qword ptr [r8 + 8*rdi + 24], rax 3774 add rdi, 4 3775 cmp rsi, rdi 3776 jne .LBB0_822 3777 .LBB0_823: 3778 test r9, r9 3779 je .LBB0_1013 3780 # %bb.824: 3781 lea rsi, [r8 + 8*rdi] 3782 lea rcx, [rcx + 8*rdi] 3783 lea rdx, [rdx + 8*rdi] 3784 xor edi, edi 3785 .LBB0_825: # =>This Inner Loop Header: Depth=1 3786 mov rax, qword ptr [rcx + 8*rdi] 3787 imul rax, qword ptr [rdx + 8*rdi] 3788 mov qword ptr [rsi + 8*rdi], rax 3789 add rdi, 1 3790 cmp r9, rdi 3791 jne .LBB0_825 3792 jmp .LBB0_1013 3793 .LBB0_971: 3794 and esi, -4 3795 xor edi, edi 3796 .LBB0_972: # =>This Inner Loop Header: Depth=1 3797 mov rax, qword ptr [rcx + 8*rdi] 3798 imul rax, qword ptr [rdx + 8*rdi] 3799 mov qword ptr [r8 + 8*rdi], rax 3800 mov rax, qword ptr [rcx + 8*rdi + 8] 3801 imul rax, qword ptr [rdx + 8*rdi + 8] 3802 mov qword ptr [r8 + 8*rdi + 8], rax 3803 mov rax, qword ptr [rcx + 8*rdi + 16] 3804 imul rax, qword ptr [rdx + 8*rdi + 16] 3805 mov qword ptr [r8 + 8*rdi + 16], rax 3806 mov rax, qword ptr [rcx + 8*rdi + 24] 3807 imul rax, qword ptr [rdx + 8*rdi + 24] 3808 mov qword ptr [r8 + 8*rdi + 24], rax 3809 add rdi, 4 3810 cmp rsi, rdi 3811 jne .LBB0_972 3812 .LBB0_973: 3813 test r9, r9 3814 je .LBB0_1013 3815 # %bb.974: 3816 lea rsi, [r8 + 8*rdi] 3817 lea rcx, [rcx + 8*rdi] 3818 lea rdx, [rdx + 8*rdi] 3819 xor edi, edi 3820 .LBB0_975: # =>This Inner Loop Header: Depth=1 3821 mov rax, qword ptr [rcx + 8*rdi] 3822 imul rax, qword ptr [rdx + 8*rdi] 3823 mov qword ptr [rsi + 8*rdi], rax 3824 add rdi, 1 3825 cmp r9, rdi 3826 jne .LBB0_975 3827 jmp .LBB0_1013 3828 .LBB0_136: 3829 lea rsi, [r8 + 8*r10] 3830 lea rax, [rdx + 8*r10] 3831 cmp rax, r8 3832 seta r9b 3833 
lea rax, [rcx + 8*r10] 3834 cmp rsi, rdx 3835 seta r11b 3836 cmp rax, r8 3837 seta al 3838 cmp rsi, rcx 3839 seta dil 3840 xor esi, esi 3841 test r9b, r11b 3842 jne .LBB0_145 3843 # %bb.137: 3844 and al, dil 3845 jne .LBB0_145 3846 # %bb.138: 3847 mov esi, r10d 3848 and esi, -4 3849 lea rax, [rsi - 4] 3850 mov r9, rax 3851 shr r9, 2 3852 add r9, 1 3853 test rax, rax 3854 je .LBB0_139 3855 # %bb.140: 3856 mov rax, r9 3857 and rax, -2 3858 neg rax 3859 xor edi, edi 3860 .LBB0_141: # =>This Inner Loop Header: Depth=1 3861 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 3862 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 3863 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 3864 paddq xmm2, xmm0 3865 movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16] 3866 paddq xmm0, xmm1 3867 movdqu xmmword ptr [r8 + 8*rdi], xmm2 3868 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0 3869 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] 3870 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] 3871 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32] 3872 paddq xmm2, xmm0 3873 movdqu xmm0, xmmword ptr [rcx + 8*rdi + 48] 3874 paddq xmm0, xmm1 3875 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm2 3876 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm0 3877 add rdi, 8 3878 add rax, 2 3879 jne .LBB0_141 3880 jmp .LBB0_142 3881 .LBB0_482: 3882 lea rsi, [r8 + 8*r10] 3883 lea rax, [rdx + 8*r10] 3884 cmp rax, r8 3885 seta r9b 3886 lea rax, [rcx + 8*r10] 3887 cmp rsi, rdx 3888 seta r11b 3889 cmp rax, r8 3890 seta al 3891 cmp rsi, rcx 3892 seta dil 3893 xor esi, esi 3894 test r9b, r11b 3895 jne .LBB0_491 3896 # %bb.483: 3897 and al, dil 3898 jne .LBB0_491 3899 # %bb.484: 3900 mov esi, r10d 3901 and esi, -4 3902 lea rax, [rsi - 4] 3903 mov r9, rax 3904 shr r9, 2 3905 add r9, 1 3906 test rax, rax 3907 je .LBB0_485 3908 # %bb.486: 3909 mov rax, r9 3910 and rax, -2 3911 neg rax 3912 xor edi, edi 3913 .LBB0_487: # =>This Inner Loop Header: Depth=1 3914 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 3915 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 3916 movdqu xmm2, xmmword ptr 
[rcx + 8*rdi] 3917 psubq xmm0, xmm2 3918 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 3919 psubq xmm1, xmm2 3920 movdqu xmmword ptr [r8 + 8*rdi], xmm0 3921 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1 3922 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] 3923 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] 3924 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32] 3925 psubq xmm0, xmm2 3926 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48] 3927 psubq xmm1, xmm2 3928 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm0 3929 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1 3930 add rdi, 8 3931 add rax, 2 3932 jne .LBB0_487 3933 jmp .LBB0_488 3934 .LBB0_309: 3935 lea rsi, [r8 + 8*r10] 3936 lea rax, [rdx + 8*r10] 3937 cmp rax, r8 3938 seta r9b 3939 lea rax, [rcx + 8*r10] 3940 cmp rsi, rdx 3941 seta r11b 3942 cmp rax, r8 3943 seta al 3944 cmp rsi, rcx 3945 seta dil 3946 xor esi, esi 3947 test r9b, r11b 3948 jne .LBB0_318 3949 # %bb.310: 3950 and al, dil 3951 jne .LBB0_318 3952 # %bb.311: 3953 mov esi, r10d 3954 and esi, -4 3955 lea rax, [rsi - 4] 3956 mov r9, rax 3957 shr r9, 2 3958 add r9, 1 3959 test rax, rax 3960 je .LBB0_312 3961 # %bb.313: 3962 mov rax, r9 3963 and rax, -2 3964 neg rax 3965 xor edi, edi 3966 .LBB0_314: # =>This Inner Loop Header: Depth=1 3967 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 3968 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 3969 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 3970 paddq xmm2, xmm0 3971 movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16] 3972 paddq xmm0, xmm1 3973 movdqu xmmword ptr [r8 + 8*rdi], xmm2 3974 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0 3975 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] 3976 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] 3977 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32] 3978 paddq xmm2, xmm0 3979 movdqu xmm0, xmmword ptr [rcx + 8*rdi + 48] 3980 paddq xmm0, xmm1 3981 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm2 3982 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm0 3983 add rdi, 8 3984 add rax, 2 3985 jne .LBB0_314 3986 jmp .LBB0_315 3987 .LBB0_655: 3988 lea rsi, [r8 + 
8*r10] 3989 lea rax, [rdx + 8*r10] 3990 cmp rax, r8 3991 seta r9b 3992 lea rax, [rcx + 8*r10] 3993 cmp rsi, rdx 3994 seta r11b 3995 cmp rax, r8 3996 seta al 3997 cmp rsi, rcx 3998 seta dil 3999 xor esi, esi 4000 test r9b, r11b 4001 jne .LBB0_664 4002 # %bb.656: 4003 and al, dil 4004 jne .LBB0_664 4005 # %bb.657: 4006 mov esi, r10d 4007 and esi, -4 4008 lea rax, [rsi - 4] 4009 mov r9, rax 4010 shr r9, 2 4011 add r9, 1 4012 test rax, rax 4013 je .LBB0_658 4014 # %bb.659: 4015 mov rax, r9 4016 and rax, -2 4017 neg rax 4018 xor edi, edi 4019 .LBB0_660: # =>This Inner Loop Header: Depth=1 4020 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 4021 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 4022 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 4023 psubq xmm0, xmm2 4024 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 4025 psubq xmm1, xmm2 4026 movdqu xmmword ptr [r8 + 8*rdi], xmm0 4027 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1 4028 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] 4029 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] 4030 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32] 4031 psubq xmm0, xmm2 4032 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48] 4033 psubq xmm1, xmm2 4034 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm0 4035 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1 4036 add rdi, 8 4037 add rax, 2 4038 jne .LBB0_660 4039 jmp .LBB0_661 4040 .LBB0_763: 4041 lea rsi, [r8 + 2*r10] 4042 lea rax, [rdx + 2*r10] 4043 cmp rax, r8 4044 seta r9b 4045 lea rax, [rcx + 2*r10] 4046 cmp rsi, rdx 4047 seta r11b 4048 cmp rax, r8 4049 seta al 4050 cmp rsi, rcx 4051 seta dil 4052 xor esi, esi 4053 test r9b, r11b 4054 jne .LBB0_772 4055 # %bb.764: 4056 and al, dil 4057 jne .LBB0_772 4058 # %bb.765: 4059 mov esi, r10d 4060 and esi, -16 4061 lea rax, [rsi - 16] 4062 mov r9, rax 4063 shr r9, 4 4064 add r9, 1 4065 test rax, rax 4066 je .LBB0_766 4067 # %bb.767: 4068 mov rax, r9 4069 and rax, -2 4070 neg rax 4071 xor edi, edi 4072 .LBB0_768: # =>This Inner Loop Header: Depth=1 4073 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 4074 
movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 4075 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 4076 pmullw xmm2, xmm0 4077 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 4078 pmullw xmm0, xmm1 4079 movdqu xmmword ptr [r8 + 2*rdi], xmm2 4080 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 4081 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 4082 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 4083 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32] 4084 pmullw xmm2, xmm0 4085 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 48] 4086 pmullw xmm0, xmm1 4087 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm2 4088 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm0 4089 add rdi, 32 4090 add rax, 2 4091 jne .LBB0_768 4092 jmp .LBB0_769 4093 .LBB0_779: 4094 lea rsi, [r8 + 2*r10] 4095 lea rax, [rdx + 2*r10] 4096 cmp rax, r8 4097 seta r9b 4098 lea rax, [rcx + 2*r10] 4099 cmp rsi, rdx 4100 seta r11b 4101 cmp rax, r8 4102 seta al 4103 cmp rsi, rcx 4104 seta dil 4105 xor esi, esi 4106 test r9b, r11b 4107 jne .LBB0_788 4108 # %bb.780: 4109 and al, dil 4110 jne .LBB0_788 4111 # %bb.781: 4112 mov esi, r10d 4113 and esi, -16 4114 lea rax, [rsi - 16] 4115 mov r9, rax 4116 shr r9, 4 4117 add r9, 1 4118 test rax, rax 4119 je .LBB0_782 4120 # %bb.783: 4121 mov rax, r9 4122 and rax, -2 4123 neg rax 4124 xor edi, edi 4125 .LBB0_784: # =>This Inner Loop Header: Depth=1 4126 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 4127 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 4128 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 4129 pmullw xmm2, xmm0 4130 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 4131 pmullw xmm0, xmm1 4132 movdqu xmmword ptr [r8 + 2*rdi], xmm2 4133 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 4134 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 4135 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 4136 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32] 4137 pmullw xmm2, xmm0 4138 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 48] 4139 pmullw xmm0, xmm1 4140 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm2 4141 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm0 4142 add rdi, 32 4143 add 
rax, 2 4144 jne .LBB0_784 4145 jmp .LBB0_785 4146 .LBB0_913: 4147 lea rsi, [r8 + 2*r10] 4148 lea rax, [rdx + 2*r10] 4149 cmp rax, r8 4150 seta r9b 4151 lea rax, [rcx + 2*r10] 4152 cmp rsi, rdx 4153 seta r11b 4154 cmp rax, r8 4155 seta al 4156 cmp rsi, rcx 4157 seta dil 4158 xor esi, esi 4159 test r9b, r11b 4160 jne .LBB0_922 4161 # %bb.914: 4162 and al, dil 4163 jne .LBB0_922 4164 # %bb.915: 4165 mov esi, r10d 4166 and esi, -16 4167 lea rax, [rsi - 16] 4168 mov r9, rax 4169 shr r9, 4 4170 add r9, 1 4171 test rax, rax 4172 je .LBB0_916 4173 # %bb.917: 4174 mov rax, r9 4175 and rax, -2 4176 neg rax 4177 xor edi, edi 4178 .LBB0_918: # =>This Inner Loop Header: Depth=1 4179 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 4180 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 4181 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 4182 pmullw xmm2, xmm0 4183 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 4184 pmullw xmm0, xmm1 4185 movdqu xmmword ptr [r8 + 2*rdi], xmm2 4186 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 4187 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 4188 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 4189 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32] 4190 pmullw xmm2, xmm0 4191 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 48] 4192 pmullw xmm0, xmm1 4193 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm2 4194 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm0 4195 add rdi, 32 4196 add rax, 2 4197 jne .LBB0_918 4198 jmp .LBB0_919 4199 .LBB0_929: 4200 lea rsi, [r8 + 2*r10] 4201 lea rax, [rdx + 2*r10] 4202 cmp rax, r8 4203 seta r9b 4204 lea rax, [rcx + 2*r10] 4205 cmp rsi, rdx 4206 seta r11b 4207 cmp rax, r8 4208 seta al 4209 cmp rsi, rcx 4210 seta dil 4211 xor esi, esi 4212 test r9b, r11b 4213 jne .LBB0_938 4214 # %bb.930: 4215 and al, dil 4216 jne .LBB0_938 4217 # %bb.931: 4218 mov esi, r10d 4219 and esi, -16 4220 lea rax, [rsi - 16] 4221 mov r9, rax 4222 shr r9, 4 4223 add r9, 1 4224 test rax, rax 4225 je .LBB0_932 4226 # %bb.933: 4227 mov rax, r9 4228 and rax, -2 4229 neg rax 4230 xor edi, edi 4231 .LBB0_934: 
# =>This Inner Loop Header: Depth=1 4232 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 4233 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 4234 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 4235 pmullw xmm2, xmm0 4236 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 4237 pmullw xmm0, xmm1 4238 movdqu xmmword ptr [r8 + 2*rdi], xmm2 4239 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 4240 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 4241 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 4242 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32] 4243 pmullw xmm2, xmm0 4244 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 48] 4245 pmullw xmm0, xmm1 4246 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm2 4247 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm0 4248 add rdi, 32 4249 add rax, 2 4250 jne .LBB0_934 4251 jmp .LBB0_935 4252 .LBB0_78: 4253 lea rsi, [r8 + 2*r10] 4254 lea rax, [rdx + 2*r10] 4255 cmp rax, r8 4256 seta r9b 4257 lea rax, [rcx + 2*r10] 4258 cmp rsi, rdx 4259 seta r11b 4260 cmp rax, r8 4261 seta al 4262 cmp rsi, rcx 4263 seta dil 4264 xor esi, esi 4265 test r9b, r11b 4266 jne .LBB0_87 4267 # %bb.79: 4268 and al, dil 4269 jne .LBB0_87 4270 # %bb.80: 4271 mov esi, r10d 4272 and esi, -16 4273 lea rax, [rsi - 16] 4274 mov r9, rax 4275 shr r9, 4 4276 add r9, 1 4277 test rax, rax 4278 je .LBB0_81 4279 # %bb.82: 4280 mov rax, r9 4281 and rax, -2 4282 neg rax 4283 xor edi, edi 4284 .LBB0_83: # =>This Inner Loop Header: Depth=1 4285 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 4286 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 4287 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 4288 paddw xmm2, xmm0 4289 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 4290 paddw xmm0, xmm1 4291 movdqu xmmword ptr [r8 + 2*rdi], xmm2 4292 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 4293 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 4294 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 4295 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32] 4296 paddw xmm2, xmm0 4297 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 48] 4298 paddw xmm0, xmm1 4299 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm2 
4300 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm0 4301 add rdi, 32 4302 add rax, 2 4303 jne .LBB0_83 4304 jmp .LBB0_84 4305 .LBB0_94: 4306 lea rsi, [r8 + 2*r10] 4307 lea rax, [rdx + 2*r10] 4308 cmp rax, r8 4309 seta r9b 4310 lea rax, [rcx + 2*r10] 4311 cmp rsi, rdx 4312 seta r11b 4313 cmp rax, r8 4314 seta al 4315 cmp rsi, rcx 4316 seta dil 4317 xor esi, esi 4318 test r9b, r11b 4319 jne .LBB0_103 4320 # %bb.95: 4321 and al, dil 4322 jne .LBB0_103 4323 # %bb.96: 4324 mov esi, r10d 4325 and esi, -16 4326 lea rax, [rsi - 16] 4327 mov r9, rax 4328 shr r9, 4 4329 add r9, 1 4330 test rax, rax 4331 je .LBB0_97 4332 # %bb.98: 4333 mov rax, r9 4334 and rax, -2 4335 neg rax 4336 xor edi, edi 4337 .LBB0_99: # =>This Inner Loop Header: Depth=1 4338 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 4339 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 4340 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 4341 paddw xmm2, xmm0 4342 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 4343 paddw xmm0, xmm1 4344 movdqu xmmword ptr [r8 + 2*rdi], xmm2 4345 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 4346 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 4347 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 4348 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32] 4349 paddw xmm2, xmm0 4350 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 48] 4351 paddw xmm0, xmm1 4352 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm2 4353 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm0 4354 add rdi, 32 4355 add rax, 2 4356 jne .LBB0_99 4357 jmp .LBB0_100 4358 .LBB0_424: 4359 lea rsi, [r8 + 2*r10] 4360 lea rax, [rdx + 2*r10] 4361 cmp rax, r8 4362 seta r9b 4363 lea rax, [rcx + 2*r10] 4364 cmp rsi, rdx 4365 seta r11b 4366 cmp rax, r8 4367 seta al 4368 cmp rsi, rcx 4369 seta dil 4370 xor esi, esi 4371 test r9b, r11b 4372 jne .LBB0_433 4373 # %bb.425: 4374 and al, dil 4375 jne .LBB0_433 4376 # %bb.426: 4377 mov esi, r10d 4378 and esi, -16 4379 lea rax, [rsi - 16] 4380 mov r9, rax 4381 shr r9, 4 4382 add r9, 1 4383 test rax, rax 4384 je .LBB0_427 4385 # %bb.428: 4386 mov rax, r9 
4387 and rax, -2 4388 neg rax 4389 xor edi, edi 4390 .LBB0_429: # =>This Inner Loop Header: Depth=1 4391 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 4392 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 4393 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 4394 psubw xmm0, xmm2 4395 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 4396 psubw xmm1, xmm2 4397 movdqu xmmword ptr [r8 + 2*rdi], xmm0 4398 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1 4399 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 4400 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 4401 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32] 4402 psubw xmm0, xmm2 4403 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 4404 psubw xmm1, xmm2 4405 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm0 4406 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1 4407 add rdi, 32 4408 add rax, 2 4409 jne .LBB0_429 4410 jmp .LBB0_430 4411 .LBB0_440: 4412 lea rsi, [r8 + 2*r10] 4413 lea rax, [rdx + 2*r10] 4414 cmp rax, r8 4415 seta r9b 4416 lea rax, [rcx + 2*r10] 4417 cmp rsi, rdx 4418 seta r11b 4419 cmp rax, r8 4420 seta al 4421 cmp rsi, rcx 4422 seta dil 4423 xor esi, esi 4424 test r9b, r11b 4425 jne .LBB0_449 4426 # %bb.441: 4427 and al, dil 4428 jne .LBB0_449 4429 # %bb.442: 4430 mov esi, r10d 4431 and esi, -16 4432 lea rax, [rsi - 16] 4433 mov r9, rax 4434 shr r9, 4 4435 add r9, 1 4436 test rax, rax 4437 je .LBB0_443 4438 # %bb.444: 4439 mov rax, r9 4440 and rax, -2 4441 neg rax 4442 xor edi, edi 4443 .LBB0_445: # =>This Inner Loop Header: Depth=1 4444 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 4445 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 4446 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 4447 psubw xmm0, xmm2 4448 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 4449 psubw xmm1, xmm2 4450 movdqu xmmword ptr [r8 + 2*rdi], xmm0 4451 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1 4452 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 4453 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 4454 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32] 4455 psubw xmm0, xmm2 4456 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 
4457 psubw xmm1, xmm2 4458 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm0 4459 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1 4460 add rdi, 32 4461 add rax, 2 4462 jne .LBB0_445 4463 jmp .LBB0_446 4464 .LBB0_251: 4465 lea rsi, [r8 + 2*r10] 4466 lea rax, [rdx + 2*r10] 4467 cmp rax, r8 4468 seta r9b 4469 lea rax, [rcx + 2*r10] 4470 cmp rsi, rdx 4471 seta r11b 4472 cmp rax, r8 4473 seta al 4474 cmp rsi, rcx 4475 seta dil 4476 xor esi, esi 4477 test r9b, r11b 4478 jne .LBB0_260 4479 # %bb.252: 4480 and al, dil 4481 jne .LBB0_260 4482 # %bb.253: 4483 mov esi, r10d 4484 and esi, -16 4485 lea rax, [rsi - 16] 4486 mov r9, rax 4487 shr r9, 4 4488 add r9, 1 4489 test rax, rax 4490 je .LBB0_254 4491 # %bb.255: 4492 mov rax, r9 4493 and rax, -2 4494 neg rax 4495 xor edi, edi 4496 .LBB0_256: # =>This Inner Loop Header: Depth=1 4497 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 4498 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 4499 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 4500 paddw xmm2, xmm0 4501 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 4502 paddw xmm0, xmm1 4503 movdqu xmmword ptr [r8 + 2*rdi], xmm2 4504 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 4505 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 4506 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 4507 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32] 4508 paddw xmm2, xmm0 4509 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 48] 4510 paddw xmm0, xmm1 4511 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm2 4512 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm0 4513 add rdi, 32 4514 add rax, 2 4515 jne .LBB0_256 4516 jmp .LBB0_257 4517 .LBB0_267: 4518 lea rsi, [r8 + 2*r10] 4519 lea rax, [rdx + 2*r10] 4520 cmp rax, r8 4521 seta r9b 4522 lea rax, [rcx + 2*r10] 4523 cmp rsi, rdx 4524 seta r11b 4525 cmp rax, r8 4526 seta al 4527 cmp rsi, rcx 4528 seta dil 4529 xor esi, esi 4530 test r9b, r11b 4531 jne .LBB0_276 4532 # %bb.268: 4533 and al, dil 4534 jne .LBB0_276 4535 # %bb.269: 4536 mov esi, r10d 4537 and esi, -16 4538 lea rax, [rsi - 16] 4539 mov r9, rax 4540 shr r9, 4 4541 add 
r9, 1 4542 test rax, rax 4543 je .LBB0_270 4544 # %bb.271: 4545 mov rax, r9 4546 and rax, -2 4547 neg rax 4548 xor edi, edi 4549 .LBB0_272: # =>This Inner Loop Header: Depth=1 4550 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 4551 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 4552 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 4553 paddw xmm2, xmm0 4554 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 4555 paddw xmm0, xmm1 4556 movdqu xmmword ptr [r8 + 2*rdi], xmm2 4557 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 4558 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 4559 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 4560 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32] 4561 paddw xmm2, xmm0 4562 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 48] 4563 paddw xmm0, xmm1 4564 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm2 4565 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm0 4566 add rdi, 32 4567 add rax, 2 4568 jne .LBB0_272 4569 jmp .LBB0_273 4570 .LBB0_597: 4571 lea rsi, [r8 + 2*r10] 4572 lea rax, [rdx + 2*r10] 4573 cmp rax, r8 4574 seta r9b 4575 lea rax, [rcx + 2*r10] 4576 cmp rsi, rdx 4577 seta r11b 4578 cmp rax, r8 4579 seta al 4580 cmp rsi, rcx 4581 seta dil 4582 xor esi, esi 4583 test r9b, r11b 4584 jne .LBB0_606 4585 # %bb.598: 4586 and al, dil 4587 jne .LBB0_606 4588 # %bb.599: 4589 mov esi, r10d 4590 and esi, -16 4591 lea rax, [rsi - 16] 4592 mov r9, rax 4593 shr r9, 4 4594 add r9, 1 4595 test rax, rax 4596 je .LBB0_600 4597 # %bb.601: 4598 mov rax, r9 4599 and rax, -2 4600 neg rax 4601 xor edi, edi 4602 .LBB0_602: # =>This Inner Loop Header: Depth=1 4603 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 4604 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 4605 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 4606 psubw xmm0, xmm2 4607 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 4608 psubw xmm1, xmm2 4609 movdqu xmmword ptr [r8 + 2*rdi], xmm0 4610 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1 4611 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 4612 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 4613 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 
32] 4614 psubw xmm0, xmm2 4615 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 4616 psubw xmm1, xmm2 4617 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm0 4618 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1 4619 add rdi, 32 4620 add rax, 2 4621 jne .LBB0_602 4622 jmp .LBB0_603 4623 .LBB0_613: 4624 lea rsi, [r8 + 2*r10] 4625 lea rax, [rdx + 2*r10] 4626 cmp rax, r8 4627 seta r9b 4628 lea rax, [rcx + 2*r10] 4629 cmp rsi, rdx 4630 seta r11b 4631 cmp rax, r8 4632 seta al 4633 cmp rsi, rcx 4634 seta dil 4635 xor esi, esi 4636 test r9b, r11b 4637 jne .LBB0_622 4638 # %bb.614: 4639 and al, dil 4640 jne .LBB0_622 4641 # %bb.615: 4642 mov esi, r10d 4643 and esi, -16 4644 lea rax, [rsi - 16] 4645 mov r9, rax 4646 shr r9, 4 4647 add r9, 1 4648 test rax, rax 4649 je .LBB0_616 4650 # %bb.617: 4651 mov rax, r9 4652 and rax, -2 4653 neg rax 4654 xor edi, edi 4655 .LBB0_618: # =>This Inner Loop Header: Depth=1 4656 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 4657 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 4658 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 4659 psubw xmm0, xmm2 4660 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 4661 psubw xmm1, xmm2 4662 movdqu xmmword ptr [r8 + 2*rdi], xmm0 4663 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1 4664 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 4665 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 4666 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32] 4667 psubw xmm0, xmm2 4668 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 4669 psubw xmm1, xmm2 4670 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm0 4671 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1 4672 add rdi, 32 4673 add rax, 2 4674 jne .LBB0_618 4675 jmp .LBB0_619 4676 .LBB0_829: 4677 and esi, -4 4678 xor edi, edi 4679 .LBB0_830: # =>This Inner Loop Header: Depth=1 4680 mov rax, qword ptr [rcx + 8*rdi] 4681 imul rax, qword ptr [rdx + 8*rdi] 4682 mov qword ptr [r8 + 8*rdi], rax 4683 mov rax, qword ptr [rcx + 8*rdi + 8] 4684 imul rax, qword ptr [rdx + 8*rdi + 8] 4685 mov qword ptr [r8 + 8*rdi + 8], rax 4686 mov rax, qword ptr [rcx + 
8*rdi + 16] 4687 imul rax, qword ptr [rdx + 8*rdi + 16] 4688 mov qword ptr [r8 + 8*rdi + 16], rax 4689 mov rax, qword ptr [rcx + 8*rdi + 24] 4690 imul rax, qword ptr [rdx + 8*rdi + 24] 4691 mov qword ptr [r8 + 8*rdi + 24], rax 4692 add rdi, 4 4693 cmp rsi, rdi 4694 jne .LBB0_830 4695 .LBB0_831: 4696 test r9, r9 4697 je .LBB0_1013 4698 # %bb.832: 4699 lea rsi, [r8 + 8*rdi] 4700 lea rcx, [rcx + 8*rdi] 4701 lea rdx, [rdx + 8*rdi] 4702 xor edi, edi 4703 .LBB0_833: # =>This Inner Loop Header: Depth=1 4704 mov rax, qword ptr [rcx + 8*rdi] 4705 imul rax, qword ptr [rdx + 8*rdi] 4706 mov qword ptr [rsi + 8*rdi], rax 4707 add rdi, 1 4708 cmp r9, rdi 4709 jne .LBB0_833 4710 jmp .LBB0_1013 4711 .LBB0_837: 4712 lea rsi, [r8 + 4*r10] 4713 lea rax, [rdx + 4*r10] 4714 cmp rax, r8 4715 seta r9b 4716 lea rax, [rcx + 4*r10] 4717 cmp rsi, rdx 4718 seta r11b 4719 cmp rax, r8 4720 seta al 4721 cmp rsi, rcx 4722 seta dil 4723 xor esi, esi 4724 test r9b, r11b 4725 jne .LBB0_846 4726 # %bb.838: 4727 and al, dil 4728 jne .LBB0_846 4729 # %bb.839: 4730 mov esi, r10d 4731 and esi, -8 4732 lea rax, [rsi - 8] 4733 mov r9, rax 4734 shr r9, 3 4735 add r9, 1 4736 test rax, rax 4737 je .LBB0_840 4738 # %bb.841: 4739 mov rax, r9 4740 and rax, -2 4741 neg rax 4742 xor edi, edi 4743 .LBB0_842: # =>This Inner Loop Header: Depth=1 4744 movups xmm0, xmmword ptr [rdx + 4*rdi] 4745 movups xmm1, xmmword ptr [rdx + 4*rdi + 16] 4746 movups xmm2, xmmword ptr [rcx + 4*rdi] 4747 mulps xmm2, xmm0 4748 movups xmm0, xmmword ptr [rcx + 4*rdi + 16] 4749 mulps xmm0, xmm1 4750 movups xmmword ptr [r8 + 4*rdi], xmm2 4751 movups xmmword ptr [r8 + 4*rdi + 16], xmm0 4752 movups xmm0, xmmword ptr [rdx + 4*rdi + 32] 4753 movups xmm1, xmmword ptr [rdx + 4*rdi + 48] 4754 movups xmm2, xmmword ptr [rcx + 4*rdi + 32] 4755 mulps xmm2, xmm0 4756 movups xmm0, xmmword ptr [rcx + 4*rdi + 48] 4757 mulps xmm0, xmm1 4758 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 4759 movups xmmword ptr [r8 + 4*rdi + 48], xmm0 4760 add rdi, 16 4761 add 
rax, 2 4762 jne .LBB0_842 4763 jmp .LBB0_843 4764 .LBB0_979: 4765 and esi, -4 4766 xor edi, edi 4767 .LBB0_980: # =>This Inner Loop Header: Depth=1 4768 mov rax, qword ptr [rcx + 8*rdi] 4769 imul rax, qword ptr [rdx + 8*rdi] 4770 mov qword ptr [r8 + 8*rdi], rax 4771 mov rax, qword ptr [rcx + 8*rdi + 8] 4772 imul rax, qword ptr [rdx + 8*rdi + 8] 4773 mov qword ptr [r8 + 8*rdi + 8], rax 4774 mov rax, qword ptr [rcx + 8*rdi + 16] 4775 imul rax, qword ptr [rdx + 8*rdi + 16] 4776 mov qword ptr [r8 + 8*rdi + 16], rax 4777 mov rax, qword ptr [rcx + 8*rdi + 24] 4778 imul rax, qword ptr [rdx + 8*rdi + 24] 4779 mov qword ptr [r8 + 8*rdi + 24], rax 4780 add rdi, 4 4781 cmp rsi, rdi 4782 jne .LBB0_980 4783 .LBB0_981: 4784 test r9, r9 4785 je .LBB0_1013 4786 # %bb.982: 4787 lea rsi, [r8 + 8*rdi] 4788 lea rcx, [rcx + 8*rdi] 4789 lea rdx, [rdx + 8*rdi] 4790 xor edi, edi 4791 .LBB0_983: # =>This Inner Loop Header: Depth=1 4792 mov rax, qword ptr [rcx + 8*rdi] 4793 imul rax, qword ptr [rdx + 8*rdi] 4794 mov qword ptr [rsi + 8*rdi], rax 4795 add rdi, 1 4796 cmp r9, rdi 4797 jne .LBB0_983 4798 .LBB0_1013: 4799 mov rsp, rbp 4800 pop rbp 4801 ret 4802 .LBB0_987: 4803 lea rsi, [r8 + 4*r10] 4804 lea rax, [rdx + 4*r10] 4805 cmp rax, r8 4806 seta r9b 4807 lea rax, [rcx + 4*r10] 4808 cmp rsi, rdx 4809 seta r11b 4810 cmp rax, r8 4811 seta al 4812 cmp rsi, rcx 4813 seta dil 4814 xor esi, esi 4815 test r9b, r11b 4816 jne .LBB0_996 4817 # %bb.988: 4818 and al, dil 4819 jne .LBB0_996 4820 # %bb.989: 4821 mov esi, r10d 4822 and esi, -8 4823 lea rax, [rsi - 8] 4824 mov r9, rax 4825 shr r9, 3 4826 add r9, 1 4827 test rax, rax 4828 je .LBB0_990 4829 # %bb.991: 4830 mov rax, r9 4831 and rax, -2 4832 neg rax 4833 xor edi, edi 4834 .LBB0_992: # =>This Inner Loop Header: Depth=1 4835 movups xmm0, xmmword ptr [rdx + 4*rdi] 4836 movups xmm1, xmmword ptr [rdx + 4*rdi + 16] 4837 movups xmm2, xmmword ptr [rcx + 4*rdi] 4838 mulps xmm2, xmm0 4839 movups xmm0, xmmword ptr [rcx + 4*rdi + 16] 4840 mulps xmm0, 
xmm1 4841 movups xmmword ptr [r8 + 4*rdi], xmm2 4842 movups xmmword ptr [r8 + 4*rdi + 16], xmm0 4843 movups xmm0, xmmword ptr [rdx + 4*rdi + 32] 4844 movups xmm1, xmmword ptr [rdx + 4*rdi + 48] 4845 movups xmm2, xmmword ptr [rcx + 4*rdi + 32] 4846 mulps xmm2, xmm0 4847 movups xmm0, xmmword ptr [rcx + 4*rdi + 48] 4848 mulps xmm0, xmm1 4849 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 4850 movups xmmword ptr [r8 + 4*rdi + 48], xmm0 4851 add rdi, 16 4852 add rax, 2 4853 jne .LBB0_992 4854 jmp .LBB0_993 4855 .LBB0_152: 4856 lea rsi, [r8 + 8*r10] 4857 lea rax, [rdx + 8*r10] 4858 cmp rax, r8 4859 seta r9b 4860 lea rax, [rcx + 8*r10] 4861 cmp rsi, rdx 4862 seta r11b 4863 cmp rax, r8 4864 seta al 4865 cmp rsi, rcx 4866 seta dil 4867 xor esi, esi 4868 test r9b, r11b 4869 jne .LBB0_161 4870 # %bb.153: 4871 and al, dil 4872 jne .LBB0_161 4873 # %bb.154: 4874 mov esi, r10d 4875 and esi, -4 4876 lea rax, [rsi - 4] 4877 mov r9, rax 4878 shr r9, 2 4879 add r9, 1 4880 test rax, rax 4881 je .LBB0_155 4882 # %bb.156: 4883 mov rax, r9 4884 and rax, -2 4885 neg rax 4886 xor edi, edi 4887 .LBB0_157: # =>This Inner Loop Header: Depth=1 4888 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 4889 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 4890 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 4891 paddq xmm2, xmm0 4892 movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16] 4893 paddq xmm0, xmm1 4894 movdqu xmmword ptr [r8 + 8*rdi], xmm2 4895 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0 4896 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] 4897 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] 4898 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32] 4899 paddq xmm2, xmm0 4900 movdqu xmm0, xmmword ptr [rcx + 8*rdi + 48] 4901 paddq xmm0, xmm1 4902 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm2 4903 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm0 4904 add rdi, 8 4905 add rax, 2 4906 jne .LBB0_157 4907 jmp .LBB0_158 4908 .LBB0_168: 4909 lea rsi, [r8 + 4*r10] 4910 lea rax, [rdx + 4*r10] 4911 cmp rax, r8 4912 seta r9b 4913 lea rax, [rcx + 4*r10] 4914 cmp 
rsi, rdx 4915 seta r11b 4916 cmp rax, r8 4917 seta al 4918 cmp rsi, rcx 4919 seta dil 4920 xor esi, esi 4921 test r9b, r11b 4922 jne .LBB0_177 4923 # %bb.169: 4924 and al, dil 4925 jne .LBB0_177 4926 # %bb.170: 4927 mov esi, r10d 4928 and esi, -8 4929 lea rax, [rsi - 8] 4930 mov r9, rax 4931 shr r9, 3 4932 add r9, 1 4933 test rax, rax 4934 je .LBB0_171 4935 # %bb.172: 4936 mov rax, r9 4937 and rax, -2 4938 neg rax 4939 xor edi, edi 4940 .LBB0_173: # =>This Inner Loop Header: Depth=1 4941 movups xmm0, xmmword ptr [rdx + 4*rdi] 4942 movups xmm1, xmmword ptr [rdx + 4*rdi + 16] 4943 movups xmm2, xmmword ptr [rcx + 4*rdi] 4944 addps xmm2, xmm0 4945 movups xmm0, xmmword ptr [rcx + 4*rdi + 16] 4946 addps xmm0, xmm1 4947 movups xmmword ptr [r8 + 4*rdi], xmm2 4948 movups xmmword ptr [r8 + 4*rdi + 16], xmm0 4949 movups xmm0, xmmword ptr [rdx + 4*rdi + 32] 4950 movups xmm1, xmmword ptr [rdx + 4*rdi + 48] 4951 movups xmm2, xmmword ptr [rcx + 4*rdi + 32] 4952 addps xmm2, xmm0 4953 movups xmm0, xmmword ptr [rcx + 4*rdi + 48] 4954 addps xmm0, xmm1 4955 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 4956 movups xmmword ptr [r8 + 4*rdi + 48], xmm0 4957 add rdi, 16 4958 add rax, 2 4959 jne .LBB0_173 4960 jmp .LBB0_174 4961 .LBB0_498: 4962 lea rsi, [r8 + 8*r10] 4963 lea rax, [rdx + 8*r10] 4964 cmp rax, r8 4965 seta r9b 4966 lea rax, [rcx + 8*r10] 4967 cmp rsi, rdx 4968 seta r11b 4969 cmp rax, r8 4970 seta al 4971 cmp rsi, rcx 4972 seta dil 4973 xor esi, esi 4974 test r9b, r11b 4975 jne .LBB0_507 4976 # %bb.499: 4977 and al, dil 4978 jne .LBB0_507 4979 # %bb.500: 4980 mov esi, r10d 4981 and esi, -4 4982 lea rax, [rsi - 4] 4983 mov r9, rax 4984 shr r9, 2 4985 add r9, 1 4986 test rax, rax 4987 je .LBB0_501 4988 # %bb.502: 4989 mov rax, r9 4990 and rax, -2 4991 neg rax 4992 xor edi, edi 4993 .LBB0_503: # =>This Inner Loop Header: Depth=1 4994 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 4995 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 4996 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 4997 psubq xmm0, 
xmm2 4998 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 4999 psubq xmm1, xmm2 5000 movdqu xmmword ptr [r8 + 8*rdi], xmm0 5001 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1 5002 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] 5003 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] 5004 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32] 5005 psubq xmm0, xmm2 5006 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48] 5007 psubq xmm1, xmm2 5008 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm0 5009 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1 5010 add rdi, 8 5011 add rax, 2 5012 jne .LBB0_503 5013 jmp .LBB0_504 5014 .LBB0_514: 5015 lea rsi, [r8 + 4*r10] 5016 lea rax, [rdx + 4*r10] 5017 cmp rax, r8 5018 seta r9b 5019 lea rax, [rcx + 4*r10] 5020 cmp rsi, rdx 5021 seta r11b 5022 cmp rax, r8 5023 seta al 5024 cmp rsi, rcx 5025 seta dil 5026 xor esi, esi 5027 test r9b, r11b 5028 jne .LBB0_523 5029 # %bb.515: 5030 and al, dil 5031 jne .LBB0_523 5032 # %bb.516: 5033 mov esi, r10d 5034 and esi, -8 5035 lea rax, [rsi - 8] 5036 mov r9, rax 5037 shr r9, 3 5038 add r9, 1 5039 test rax, rax 5040 je .LBB0_517 5041 # %bb.518: 5042 mov rax, r9 5043 and rax, -2 5044 neg rax 5045 xor edi, edi 5046 .LBB0_519: # =>This Inner Loop Header: Depth=1 5047 movups xmm0, xmmword ptr [rdx + 4*rdi] 5048 movups xmm1, xmmword ptr [rdx + 4*rdi + 16] 5049 movups xmm2, xmmword ptr [rcx + 4*rdi] 5050 subps xmm0, xmm2 5051 movups xmm2, xmmword ptr [rcx + 4*rdi + 16] 5052 subps xmm1, xmm2 5053 movups xmmword ptr [r8 + 4*rdi], xmm0 5054 movups xmmword ptr [r8 + 4*rdi + 16], xmm1 5055 movups xmm0, xmmword ptr [rdx + 4*rdi + 32] 5056 movups xmm1, xmmword ptr [rdx + 4*rdi + 48] 5057 movups xmm2, xmmword ptr [rcx + 4*rdi + 32] 5058 subps xmm0, xmm2 5059 movups xmm2, xmmword ptr [rcx + 4*rdi + 48] 5060 subps xmm1, xmm2 5061 movups xmmword ptr [r8 + 4*rdi + 32], xmm0 5062 movups xmmword ptr [r8 + 4*rdi + 48], xmm1 5063 add rdi, 16 5064 add rax, 2 5065 jne .LBB0_519 5066 jmp .LBB0_520 5067 .LBB0_325: 5068 lea rsi, [r8 + 8*r10] 5069 lea rax, [rdx + 8*r10] 
5070 cmp rax, r8 5071 seta r9b 5072 lea rax, [rcx + 8*r10] 5073 cmp rsi, rdx 5074 seta r11b 5075 cmp rax, r8 5076 seta al 5077 cmp rsi, rcx 5078 seta dil 5079 xor esi, esi 5080 test r9b, r11b 5081 jne .LBB0_334 5082 # %bb.326: 5083 and al, dil 5084 jne .LBB0_334 5085 # %bb.327: 5086 mov esi, r10d 5087 and esi, -4 5088 lea rax, [rsi - 4] 5089 mov r9, rax 5090 shr r9, 2 5091 add r9, 1 5092 test rax, rax 5093 je .LBB0_328 5094 # %bb.329: 5095 mov rax, r9 5096 and rax, -2 5097 neg rax 5098 xor edi, edi 5099 .LBB0_330: # =>This Inner Loop Header: Depth=1 5100 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 5101 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 5102 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 5103 paddq xmm2, xmm0 5104 movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16] 5105 paddq xmm0, xmm1 5106 movdqu xmmword ptr [r8 + 8*rdi], xmm2 5107 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0 5108 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] 5109 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] 5110 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32] 5111 paddq xmm2, xmm0 5112 movdqu xmm0, xmmword ptr [rcx + 8*rdi + 48] 5113 paddq xmm0, xmm1 5114 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm2 5115 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm0 5116 add rdi, 8 5117 add rax, 2 5118 jne .LBB0_330 5119 jmp .LBB0_331 5120 .LBB0_341: 5121 lea rsi, [r8 + 4*r10] 5122 lea rax, [rdx + 4*r10] 5123 cmp rax, r8 5124 seta r9b 5125 lea rax, [rcx + 4*r10] 5126 cmp rsi, rdx 5127 seta r11b 5128 cmp rax, r8 5129 seta al 5130 cmp rsi, rcx 5131 seta dil 5132 xor esi, esi 5133 test r9b, r11b 5134 jne .LBB0_350 5135 # %bb.342: 5136 and al, dil 5137 jne .LBB0_350 5138 # %bb.343: 5139 mov esi, r10d 5140 and esi, -8 5141 lea rax, [rsi - 8] 5142 mov r9, rax 5143 shr r9, 3 5144 add r9, 1 5145 test rax, rax 5146 je .LBB0_344 5147 # %bb.345: 5148 mov rax, r9 5149 and rax, -2 5150 neg rax 5151 xor edi, edi 5152 .LBB0_346: # =>This Inner Loop Header: Depth=1 5153 movups xmm0, xmmword ptr [rdx + 4*rdi] 5154 movups xmm1, xmmword ptr [rdx + 4*rdi + 
16] 5155 movups xmm2, xmmword ptr [rcx + 4*rdi] 5156 addps xmm2, xmm0 5157 movups xmm0, xmmword ptr [rcx + 4*rdi + 16] 5158 addps xmm0, xmm1 5159 movups xmmword ptr [r8 + 4*rdi], xmm2 5160 movups xmmword ptr [r8 + 4*rdi + 16], xmm0 5161 movups xmm0, xmmword ptr [rdx + 4*rdi + 32] 5162 movups xmm1, xmmword ptr [rdx + 4*rdi + 48] 5163 movups xmm2, xmmword ptr [rcx + 4*rdi + 32] 5164 addps xmm2, xmm0 5165 movups xmm0, xmmword ptr [rcx + 4*rdi + 48] 5166 addps xmm0, xmm1 5167 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 5168 movups xmmword ptr [r8 + 4*rdi + 48], xmm0 5169 add rdi, 16 5170 add rax, 2 5171 jne .LBB0_346 5172 jmp .LBB0_347 5173 .LBB0_671: 5174 lea rsi, [r8 + 8*r10] 5175 lea rax, [rdx + 8*r10] 5176 cmp rax, r8 5177 seta r9b 5178 lea rax, [rcx + 8*r10] 5179 cmp rsi, rdx 5180 seta r11b 5181 cmp rax, r8 5182 seta al 5183 cmp rsi, rcx 5184 seta dil 5185 xor esi, esi 5186 test r9b, r11b 5187 jne .LBB0_680 5188 # %bb.672: 5189 and al, dil 5190 jne .LBB0_680 5191 # %bb.673: 5192 mov esi, r10d 5193 and esi, -4 5194 lea rax, [rsi - 4] 5195 mov r9, rax 5196 shr r9, 2 5197 add r9, 1 5198 test rax, rax 5199 je .LBB0_674 5200 # %bb.675: 5201 mov rax, r9 5202 and rax, -2 5203 neg rax 5204 xor edi, edi 5205 .LBB0_676: # =>This Inner Loop Header: Depth=1 5206 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 5207 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 5208 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 5209 psubq xmm0, xmm2 5210 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 5211 psubq xmm1, xmm2 5212 movdqu xmmword ptr [r8 + 8*rdi], xmm0 5213 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1 5214 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] 5215 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] 5216 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32] 5217 psubq xmm0, xmm2 5218 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48] 5219 psubq xmm1, xmm2 5220 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm0 5221 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1 5222 add rdi, 8 5223 add rax, 2 5224 jne .LBB0_676 5225 jmp .LBB0_677 5226 
.LBB0_687: 5227 lea rsi, [r8 + 4*r10] 5228 lea rax, [rdx + 4*r10] 5229 cmp rax, r8 5230 seta r9b 5231 lea rax, [rcx + 4*r10] 5232 cmp rsi, rdx 5233 seta r11b 5234 cmp rax, r8 5235 seta al 5236 cmp rsi, rcx 5237 seta dil 5238 xor esi, esi 5239 test r9b, r11b 5240 jne .LBB0_696 5241 # %bb.688: 5242 and al, dil 5243 jne .LBB0_696 5244 # %bb.689: 5245 mov esi, r10d 5246 and esi, -8 5247 lea rax, [rsi - 8] 5248 mov r9, rax 5249 shr r9, 3 5250 add r9, 1 5251 test rax, rax 5252 je .LBB0_690 5253 # %bb.691: 5254 mov rax, r9 5255 and rax, -2 5256 neg rax 5257 xor edi, edi 5258 .LBB0_692: # =>This Inner Loop Header: Depth=1 5259 movups xmm0, xmmword ptr [rdx + 4*rdi] 5260 movups xmm1, xmmword ptr [rdx + 4*rdi + 16] 5261 movups xmm2, xmmword ptr [rcx + 4*rdi] 5262 subps xmm0, xmm2 5263 movups xmm2, xmmword ptr [rcx + 4*rdi + 16] 5264 subps xmm1, xmm2 5265 movups xmmword ptr [r8 + 4*rdi], xmm0 5266 movups xmmword ptr [r8 + 4*rdi + 16], xmm1 5267 movups xmm0, xmmword ptr [rdx + 4*rdi + 32] 5268 movups xmm1, xmmword ptr [rdx + 4*rdi + 48] 5269 movups xmm2, xmmword ptr [rcx + 4*rdi + 32] 5270 subps xmm0, xmm2 5271 movups xmm2, xmmword ptr [rcx + 4*rdi + 48] 5272 subps xmm1, xmm2 5273 movups xmmword ptr [r8 + 4*rdi + 32], xmm0 5274 movups xmmword ptr [r8 + 4*rdi + 48], xmm1 5275 add rdi, 16 5276 add rax, 2 5277 jne .LBB0_692 5278 jmp .LBB0_693 5279 .LBB0_734: 5280 lea rsi, [r8 + r10] 5281 lea rax, [rdx + r10] 5282 cmp rax, r8 5283 seta r9b 5284 lea rax, [rcx + r10] 5285 cmp rsi, rdx 5286 seta r11b 5287 cmp rax, r8 5288 seta al 5289 cmp rsi, rcx 5290 seta sil 5291 xor edi, edi 5292 test r9b, r11b 5293 jne .LBB0_743 5294 # %bb.735: 5295 and al, sil 5296 jne .LBB0_743 5297 # %bb.736: 5298 mov edi, r10d 5299 and edi, -32 5300 lea rax, [rdi - 32] 5301 mov r9, rax 5302 shr r9, 5 5303 add r9, 1 5304 test rax, rax 5305 je .LBB0_737 5306 # %bb.738: 5307 mov rsi, r9 5308 and rsi, -2 5309 neg rsi 5310 xor eax, eax 5311 movdqa xmm0, xmmword ptr [rip + .LCPI0_0] # xmm0 = 
[255,255,255,255,255,255,255,255] 5312 .LBB0_739: # =>This Inner Loop Header: Depth=1 5313 movdqu xmm1, xmmword ptr [rdx + rax] 5314 movdqu xmm2, xmmword ptr [rdx + rax + 16] 5315 movdqu xmm3, xmmword ptr [rcx + rax] 5316 movdqu xmm4, xmmword ptr [rcx + rax + 16] 5317 pmovzxbw xmm5, xmm1 # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 5318 punpckhbw xmm1, xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5319 pmovzxbw xmm6, xmm3 # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 5320 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5321 pmullw xmm3, xmm1 5322 pand xmm3, xmm0 5323 pmullw xmm6, xmm5 5324 pand xmm6, xmm0 5325 packuswb xmm6, xmm3 5326 pmovzxbw xmm1, xmm2 # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 5327 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5328 pmovzxbw xmm3, xmm4 # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 5329 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5330 pmullw xmm4, xmm2 5331 pand xmm4, xmm0 5332 pmullw xmm3, xmm1 5333 pand xmm3, xmm0 5334 packuswb xmm3, xmm4 5335 movdqu xmmword ptr [r8 + rax], xmm6 5336 movdqu xmmword ptr [r8 + rax + 16], xmm3 5337 movdqu xmm1, xmmword ptr [rdx + rax + 32] 5338 movdqu xmm2, xmmword ptr [rdx + rax + 48] 5339 movdqu xmm3, xmmword ptr [rcx + rax + 32] 5340 movdqu xmm4, xmmword ptr [rcx + rax + 48] 5341 pmovzxbw xmm5, xmm1 # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 5342 punpckhbw xmm1, xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5343 pmovzxbw xmm6, xmm3 # xmm6 = 
xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 5344 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5345 pmullw xmm3, xmm1 5346 pand xmm3, xmm0 5347 pmullw xmm6, xmm5 5348 pand xmm6, xmm0 5349 packuswb xmm6, xmm3 5350 pmovzxbw xmm1, xmm2 # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 5351 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5352 pmovzxbw xmm3, xmm4 # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 5353 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5354 pmullw xmm4, xmm2 5355 pand xmm4, xmm0 5356 pmullw xmm3, xmm1 5357 pand xmm3, xmm0 5358 packuswb xmm3, xmm4 5359 movdqu xmmword ptr [r8 + rax + 32], xmm6 5360 movdqu xmmword ptr [r8 + rax + 48], xmm3 5361 add rax, 64 5362 add rsi, 2 5363 jne .LBB0_739 5364 jmp .LBB0_740 5365 .LBB0_884: 5366 lea rsi, [r8 + r10] 5367 lea rax, [rdx + r10] 5368 cmp rax, r8 5369 seta r9b 5370 lea rax, [rcx + r10] 5371 cmp rsi, rdx 5372 seta r11b 5373 cmp rax, r8 5374 seta al 5375 cmp rsi, rcx 5376 seta sil 5377 xor edi, edi 5378 test r9b, r11b 5379 jne .LBB0_893 5380 # %bb.885: 5381 and al, sil 5382 jne .LBB0_893 5383 # %bb.886: 5384 mov edi, r10d 5385 and edi, -32 5386 lea rax, [rdi - 32] 5387 mov r9, rax 5388 shr r9, 5 5389 add r9, 1 5390 test rax, rax 5391 je .LBB0_887 5392 # %bb.888: 5393 mov rsi, r9 5394 and rsi, -2 5395 neg rsi 5396 xor eax, eax 5397 movdqa xmm0, xmmword ptr [rip + .LCPI0_0] # xmm0 = [255,255,255,255,255,255,255,255] 5398 .LBB0_889: # =>This Inner Loop Header: Depth=1 5399 movdqu xmm1, xmmword ptr [rdx + rax] 5400 movdqu xmm2, xmmword ptr [rdx + rax + 16] 5401 movdqu xmm3, xmmword ptr [rcx + rax] 5402 movdqu xmm4, xmmword ptr [rcx + rax + 16] 5403 pmovzxbw xmm5, xmm1 # xmm5 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 5404 punpckhbw xmm1, xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5405 pmovzxbw xmm6, xmm3 # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 5406 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5407 pmullw xmm3, xmm1 5408 pand xmm3, xmm0 5409 pmullw xmm6, xmm5 5410 pand xmm6, xmm0 5411 packuswb xmm6, xmm3 5412 pmovzxbw xmm1, xmm2 # xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 5413 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5414 pmovzxbw xmm3, xmm4 # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 5415 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5416 pmullw xmm4, xmm2 5417 pand xmm4, xmm0 5418 pmullw xmm3, xmm1 5419 pand xmm3, xmm0 5420 packuswb xmm3, xmm4 5421 movdqu xmmword ptr [r8 + rax], xmm6 5422 movdqu xmmword ptr [r8 + rax + 16], xmm3 5423 movdqu xmm1, xmmword ptr [rdx + rax + 32] 5424 movdqu xmm2, xmmword ptr [rdx + rax + 48] 5425 movdqu xmm3, xmmword ptr [rcx + rax + 32] 5426 movdqu xmm4, xmmword ptr [rcx + rax + 48] 5427 pmovzxbw xmm5, xmm1 # xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 5428 punpckhbw xmm1, xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5429 pmovzxbw xmm6, xmm3 # xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 5430 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5431 pmullw xmm3, xmm1 5432 pand xmm3, xmm0 5433 pmullw xmm6, xmm5 5434 pand xmm6, xmm0 5435 packuswb xmm6, xmm3 5436 pmovzxbw xmm1, xmm2 # xmm1 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 5437 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5438 pmovzxbw xmm3, xmm4 # xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 5439 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5440 pmullw xmm4, xmm2 5441 pand xmm4, xmm0 5442 pmullw xmm3, xmm1 5443 pand xmm3, xmm0 5444 packuswb xmm3, xmm4 5445 movdqu xmmword ptr [r8 + rax + 32], xmm6 5446 movdqu xmmword ptr [r8 + rax + 48], xmm3 5447 add rax, 64 5448 add rsi, 2 5449 jne .LBB0_889 5450 jmp .LBB0_890 5451 .LBB0_49: 5452 lea rsi, [r8 + r10] 5453 lea rax, [rdx + r10] 5454 cmp rax, r8 5455 seta r9b 5456 lea rax, [rcx + r10] 5457 cmp rsi, rdx 5458 seta r11b 5459 cmp rax, r8 5460 seta al 5461 cmp rsi, rcx 5462 seta dil 5463 xor esi, esi 5464 test r9b, r11b 5465 jne .LBB0_58 5466 # %bb.50: 5467 and al, dil 5468 jne .LBB0_58 5469 # %bb.51: 5470 mov esi, r10d 5471 and esi, -32 5472 lea rax, [rsi - 32] 5473 mov r9, rax 5474 shr r9, 5 5475 add r9, 1 5476 test rax, rax 5477 je .LBB0_52 5478 # %bb.53: 5479 mov rax, r9 5480 and rax, -2 5481 neg rax 5482 xor edi, edi 5483 .LBB0_54: # =>This Inner Loop Header: Depth=1 5484 movdqu xmm0, xmmword ptr [rdx + rdi] 5485 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 5486 movdqu xmm2, xmmword ptr [rcx + rdi] 5487 paddb xmm2, xmm0 5488 movdqu xmm0, xmmword ptr [rcx + rdi + 16] 5489 paddb xmm0, xmm1 5490 movdqu xmmword ptr [r8 + rdi], xmm2 5491 movdqu xmmword ptr [r8 + rdi + 16], xmm0 5492 movdqu xmm0, xmmword ptr [rdx + rdi + 32] 5493 movdqu xmm1, xmmword ptr [rdx + rdi + 48] 5494 movdqu xmm2, xmmword ptr [rcx + rdi + 32] 5495 paddb xmm2, xmm0 5496 movdqu xmm0, xmmword ptr [rcx + rdi + 48] 5497 paddb xmm0, xmm1 5498 movdqu xmmword ptr [r8 + rdi + 32], xmm2 5499 movdqu xmmword ptr [r8 + rdi + 48], xmm0 5500 add rdi, 64 5501 add rax, 2 5502 jne .LBB0_54 5503 jmp 
.LBB0_55 5504 .LBB0_395: 5505 lea rsi, [r8 + r10] 5506 lea rax, [rdx + r10] 5507 cmp rax, r8 5508 seta r9b 5509 lea rax, [rcx + r10] 5510 cmp rsi, rdx 5511 seta r11b 5512 cmp rax, r8 5513 seta al 5514 cmp rsi, rcx 5515 seta dil 5516 xor esi, esi 5517 test r9b, r11b 5518 jne .LBB0_404 5519 # %bb.396: 5520 and al, dil 5521 jne .LBB0_404 5522 # %bb.397: 5523 mov esi, r10d 5524 and esi, -32 5525 lea rax, [rsi - 32] 5526 mov r9, rax 5527 shr r9, 5 5528 add r9, 1 5529 test rax, rax 5530 je .LBB0_398 5531 # %bb.399: 5532 mov rax, r9 5533 and rax, -2 5534 neg rax 5535 xor edi, edi 5536 .LBB0_400: # =>This Inner Loop Header: Depth=1 5537 movdqu xmm0, xmmword ptr [rdx + rdi] 5538 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 5539 movdqu xmm2, xmmword ptr [rcx + rdi] 5540 psubb xmm0, xmm2 5541 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 5542 psubb xmm1, xmm2 5543 movdqu xmmword ptr [r8 + rdi], xmm0 5544 movdqu xmmword ptr [r8 + rdi + 16], xmm1 5545 movdqu xmm0, xmmword ptr [rdx + rdi + 32] 5546 movdqu xmm1, xmmword ptr [rdx + rdi + 48] 5547 movdqu xmm2, xmmword ptr [rcx + rdi + 32] 5548 psubb xmm0, xmm2 5549 movdqu xmm2, xmmword ptr [rcx + rdi + 48] 5550 psubb xmm1, xmm2 5551 movdqu xmmword ptr [r8 + rdi + 32], xmm0 5552 movdqu xmmword ptr [r8 + rdi + 48], xmm1 5553 add rdi, 64 5554 add rax, 2 5555 jne .LBB0_400 5556 jmp .LBB0_401 5557 .LBB0_222: 5558 lea rsi, [r8 + r10] 5559 lea rax, [rdx + r10] 5560 cmp rax, r8 5561 seta r9b 5562 lea rax, [rcx + r10] 5563 cmp rsi, rdx 5564 seta r11b 5565 cmp rax, r8 5566 seta al 5567 cmp rsi, rcx 5568 seta dil 5569 xor esi, esi 5570 test r9b, r11b 5571 jne .LBB0_231 5572 # %bb.223: 5573 and al, dil 5574 jne .LBB0_231 5575 # %bb.224: 5576 mov esi, r10d 5577 and esi, -32 5578 lea rax, [rsi - 32] 5579 mov r9, rax 5580 shr r9, 5 5581 add r9, 1 5582 test rax, rax 5583 je .LBB0_225 5584 # %bb.226: 5585 mov rax, r9 5586 and rax, -2 5587 neg rax 5588 xor edi, edi 5589 .LBB0_227: # =>This Inner Loop Header: Depth=1 5590 movdqu xmm0, xmmword ptr [rdx + 
rdi] 5591 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 5592 movdqu xmm2, xmmword ptr [rcx + rdi] 5593 paddb xmm2, xmm0 5594 movdqu xmm0, xmmword ptr [rcx + rdi + 16] 5595 paddb xmm0, xmm1 5596 movdqu xmmword ptr [r8 + rdi], xmm2 5597 movdqu xmmword ptr [r8 + rdi + 16], xmm0 5598 movdqu xmm0, xmmword ptr [rdx + rdi + 32] 5599 movdqu xmm1, xmmword ptr [rdx + rdi + 48] 5600 movdqu xmm2, xmmword ptr [rcx + rdi + 32] 5601 paddb xmm2, xmm0 5602 movdqu xmm0, xmmword ptr [rcx + rdi + 48] 5603 paddb xmm0, xmm1 5604 movdqu xmmword ptr [r8 + rdi + 32], xmm2 5605 movdqu xmmword ptr [r8 + rdi + 48], xmm0 5606 add rdi, 64 5607 add rax, 2 5608 jne .LBB0_227 5609 jmp .LBB0_228 5610 .LBB0_568: 5611 lea rsi, [r8 + r10] 5612 lea rax, [rdx + r10] 5613 cmp rax, r8 5614 seta r9b 5615 lea rax, [rcx + r10] 5616 cmp rsi, rdx 5617 seta r11b 5618 cmp rax, r8 5619 seta al 5620 cmp rsi, rcx 5621 seta dil 5622 xor esi, esi 5623 test r9b, r11b 5624 jne .LBB0_577 5625 # %bb.569: 5626 and al, dil 5627 jne .LBB0_577 5628 # %bb.570: 5629 mov esi, r10d 5630 and esi, -32 5631 lea rax, [rsi - 32] 5632 mov r9, rax 5633 shr r9, 5 5634 add r9, 1 5635 test rax, rax 5636 je .LBB0_571 5637 # %bb.572: 5638 mov rax, r9 5639 and rax, -2 5640 neg rax 5641 xor edi, edi 5642 .LBB0_573: # =>This Inner Loop Header: Depth=1 5643 movdqu xmm0, xmmword ptr [rdx + rdi] 5644 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 5645 movdqu xmm2, xmmword ptr [rcx + rdi] 5646 psubb xmm0, xmm2 5647 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 5648 psubb xmm1, xmm2 5649 movdqu xmmword ptr [r8 + rdi], xmm0 5650 movdqu xmmword ptr [r8 + rdi + 16], xmm1 5651 movdqu xmm0, xmmword ptr [rdx + rdi + 32] 5652 movdqu xmm1, xmmword ptr [rdx + rdi + 48] 5653 movdqu xmm2, xmmword ptr [rcx + rdi + 32] 5654 psubb xmm0, xmm2 5655 movdqu xmm2, xmmword ptr [rcx + rdi + 48] 5656 psubb xmm1, xmm2 5657 movdqu xmmword ptr [r8 + rdi + 32], xmm0 5658 movdqu xmmword ptr [r8 + rdi + 48], xmm1 5659 add rdi, 64 5660 add rax, 2 5661 jne .LBB0_573 5662 jmp .LBB0_574 5663 
.LBB0_808: 5664 lea rsi, [r8 + 4*r10] 5665 lea rax, [rdx + 4*r10] 5666 cmp rax, r8 5667 seta r9b 5668 lea rax, [rcx + 4*r10] 5669 cmp rsi, rdx 5670 seta r11b 5671 cmp rax, r8 5672 seta al 5673 cmp rsi, rcx 5674 seta dil 5675 xor esi, esi 5676 test r9b, r11b 5677 jne .LBB0_817 5678 # %bb.809: 5679 and al, dil 5680 jne .LBB0_817 5681 # %bb.810: 5682 mov esi, r10d 5683 and esi, -8 5684 lea rax, [rsi - 8] 5685 mov r9, rax 5686 shr r9, 3 5687 add r9, 1 5688 test rax, rax 5689 je .LBB0_811 5690 # %bb.812: 5691 mov rax, r9 5692 and rax, -2 5693 neg rax 5694 xor edi, edi 5695 .LBB0_813: # =>This Inner Loop Header: Depth=1 5696 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 5697 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 5698 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 5699 pmulld xmm2, xmm0 5700 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 5701 pmulld xmm0, xmm1 5702 movdqu xmmword ptr [r8 + 4*rdi], xmm2 5703 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 5704 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 5705 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 5706 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32] 5707 pmulld xmm2, xmm0 5708 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 48] 5709 pmulld xmm0, xmm1 5710 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm2 5711 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm0 5712 add rdi, 16 5713 add rax, 2 5714 jne .LBB0_813 5715 jmp .LBB0_814 5716 .LBB0_958: 5717 lea rsi, [r8 + 4*r10] 5718 lea rax, [rdx + 4*r10] 5719 cmp rax, r8 5720 seta r9b 5721 lea rax, [rcx + 4*r10] 5722 cmp rsi, rdx 5723 seta r11b 5724 cmp rax, r8 5725 seta al 5726 cmp rsi, rcx 5727 seta dil 5728 xor esi, esi 5729 test r9b, r11b 5730 jne .LBB0_967 5731 # %bb.959: 5732 and al, dil 5733 jne .LBB0_967 5734 # %bb.960: 5735 mov esi, r10d 5736 and esi, -8 5737 lea rax, [rsi - 8] 5738 mov r9, rax 5739 shr r9, 3 5740 add r9, 1 5741 test rax, rax 5742 je .LBB0_961 5743 # %bb.962: 5744 mov rax, r9 5745 and rax, -2 5746 neg rax 5747 xor edi, edi 5748 .LBB0_963: # =>This Inner Loop Header: Depth=1 5749 movdqu xmm0, 
xmmword ptr [rdx + 4*rdi] 5750 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 5751 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 5752 pmulld xmm2, xmm0 5753 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 5754 pmulld xmm0, xmm1 5755 movdqu xmmword ptr [r8 + 4*rdi], xmm2 5756 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 5757 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 5758 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 5759 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32] 5760 pmulld xmm2, xmm0 5761 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 48] 5762 pmulld xmm0, xmm1 5763 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm2 5764 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm0 5765 add rdi, 16 5766 add rax, 2 5767 jne .LBB0_963 5768 jmp .LBB0_964 5769 .LBB0_123: 5770 lea rsi, [r8 + 4*r10] 5771 lea rax, [rdx + 4*r10] 5772 cmp rax, r8 5773 seta r9b 5774 lea rax, [rcx + 4*r10] 5775 cmp rsi, rdx 5776 seta r11b 5777 cmp rax, r8 5778 seta al 5779 cmp rsi, rcx 5780 seta dil 5781 xor esi, esi 5782 test r9b, r11b 5783 jne .LBB0_132 5784 # %bb.124: 5785 and al, dil 5786 jne .LBB0_132 5787 # %bb.125: 5788 mov esi, r10d 5789 and esi, -8 5790 lea rax, [rsi - 8] 5791 mov r9, rax 5792 shr r9, 3 5793 add r9, 1 5794 test rax, rax 5795 je .LBB0_126 5796 # %bb.127: 5797 mov rax, r9 5798 and rax, -2 5799 neg rax 5800 xor edi, edi 5801 .LBB0_128: # =>This Inner Loop Header: Depth=1 5802 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 5803 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 5804 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 5805 paddd xmm2, xmm0 5806 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 5807 paddd xmm0, xmm1 5808 movdqu xmmword ptr [r8 + 4*rdi], xmm2 5809 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 5810 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 5811 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 5812 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32] 5813 paddd xmm2, xmm0 5814 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 48] 5815 paddd xmm0, xmm1 5816 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm2 5817 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm0 
5818 add rdi, 16 5819 add rax, 2 5820 jne .LBB0_128 5821 jmp .LBB0_129 5822 .LBB0_469: 5823 lea rsi, [r8 + 4*r10] 5824 lea rax, [rdx + 4*r10] 5825 cmp rax, r8 5826 seta r9b 5827 lea rax, [rcx + 4*r10] 5828 cmp rsi, rdx 5829 seta r11b 5830 cmp rax, r8 5831 seta al 5832 cmp rsi, rcx 5833 seta dil 5834 xor esi, esi 5835 test r9b, r11b 5836 jne .LBB0_478 5837 # %bb.470: 5838 and al, dil 5839 jne .LBB0_478 5840 # %bb.471: 5841 mov esi, r10d 5842 and esi, -8 5843 lea rax, [rsi - 8] 5844 mov r9, rax 5845 shr r9, 3 5846 add r9, 1 5847 test rax, rax 5848 je .LBB0_472 5849 # %bb.473: 5850 mov rax, r9 5851 and rax, -2 5852 neg rax 5853 xor edi, edi 5854 .LBB0_474: # =>This Inner Loop Header: Depth=1 5855 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 5856 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 5857 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 5858 psubd xmm0, xmm2 5859 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 5860 psubd xmm1, xmm2 5861 movdqu xmmword ptr [r8 + 4*rdi], xmm0 5862 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1 5863 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 5864 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 5865 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32] 5866 psubd xmm0, xmm2 5867 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 5868 psubd xmm1, xmm2 5869 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm0 5870 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1 5871 add rdi, 16 5872 add rax, 2 5873 jne .LBB0_474 5874 jmp .LBB0_475 5875 .LBB0_296: 5876 lea rsi, [r8 + 4*r10] 5877 lea rax, [rdx + 4*r10] 5878 cmp rax, r8 5879 seta r9b 5880 lea rax, [rcx + 4*r10] 5881 cmp rsi, rdx 5882 seta r11b 5883 cmp rax, r8 5884 seta al 5885 cmp rsi, rcx 5886 seta dil 5887 xor esi, esi 5888 test r9b, r11b 5889 jne .LBB0_305 5890 # %bb.297: 5891 and al, dil 5892 jne .LBB0_305 5893 # %bb.298: 5894 mov esi, r10d 5895 and esi, -8 5896 lea rax, [rsi - 8] 5897 mov r9, rax 5898 shr r9, 3 5899 add r9, 1 5900 test rax, rax 5901 je .LBB0_299 5902 # %bb.300: 5903 mov rax, r9 5904 and rax, -2 5905 neg rax 5906 xor edi, 
edi 5907 .LBB0_301: # =>This Inner Loop Header: Depth=1 5908 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 5909 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 5910 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 5911 paddd xmm2, xmm0 5912 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 5913 paddd xmm0, xmm1 5914 movdqu xmmword ptr [r8 + 4*rdi], xmm2 5915 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 5916 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 5917 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 5918 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32] 5919 paddd xmm2, xmm0 5920 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 48] 5921 paddd xmm0, xmm1 5922 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm2 5923 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm0 5924 add rdi, 16 5925 add rax, 2 5926 jne .LBB0_301 5927 jmp .LBB0_302 5928 .LBB0_642: 5929 lea rsi, [r8 + 4*r10] 5930 lea rax, [rdx + 4*r10] 5931 cmp rax, r8 5932 seta r9b 5933 lea rax, [rcx + 4*r10] 5934 cmp rsi, rdx 5935 seta r11b 5936 cmp rax, r8 5937 seta al 5938 cmp rsi, rcx 5939 seta dil 5940 xor esi, esi 5941 test r9b, r11b 5942 jne .LBB0_651 5943 # %bb.643: 5944 and al, dil 5945 jne .LBB0_651 5946 # %bb.644: 5947 mov esi, r10d 5948 and esi, -8 5949 lea rax, [rsi - 8] 5950 mov r9, rax 5951 shr r9, 3 5952 add r9, 1 5953 test rax, rax 5954 je .LBB0_645 5955 # %bb.646: 5956 mov rax, r9 5957 and rax, -2 5958 neg rax 5959 xor edi, edi 5960 .LBB0_647: # =>This Inner Loop Header: Depth=1 5961 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 5962 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 5963 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 5964 psubd xmm0, xmm2 5965 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 5966 psubd xmm1, xmm2 5967 movdqu xmmword ptr [r8 + 4*rdi], xmm0 5968 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1 5969 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 5970 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 5971 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32] 5972 psubd xmm0, xmm2 5973 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 5974 psubd xmm1, xmm2 5975 movdqu xmmword ptr [r8 
+ 4*rdi + 32], xmm0 5976 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1 5977 add rdi, 16 5978 add rax, 2 5979 jne .LBB0_647 5980 jmp .LBB0_648 5981 .LBB0_795: 5982 xor edi, edi 5983 .LBB0_798: 5984 test r9b, 1 5985 je .LBB0_800 5986 # %bb.799: 5987 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 5988 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 5989 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 5990 pmulld xmm2, xmm0 5991 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 5992 pmulld xmm0, xmm1 5993 movdqu xmmword ptr [r8 + 4*rdi], xmm2 5994 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 5995 .LBB0_800: 5996 cmp rsi, r10 5997 jne .LBB0_801 5998 jmp .LBB0_1013 5999 .LBB0_945: 6000 xor edi, edi 6001 .LBB0_948: 6002 test r9b, 1 6003 je .LBB0_950 6004 # %bb.949: 6005 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 6006 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 6007 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 6008 pmulld xmm2, xmm0 6009 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 6010 pmulld xmm0, xmm1 6011 movdqu xmmword ptr [r8 + 4*rdi], xmm2 6012 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 6013 .LBB0_950: 6014 cmp rsi, r10 6015 jne .LBB0_951 6016 jmp .LBB0_1013 6017 .LBB0_110: 6018 xor edi, edi 6019 .LBB0_113: 6020 test r9b, 1 6021 je .LBB0_115 6022 # %bb.114: 6023 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 6024 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 6025 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 6026 paddd xmm2, xmm0 6027 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 6028 paddd xmm0, xmm1 6029 movdqu xmmword ptr [r8 + 4*rdi], xmm2 6030 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 6031 .LBB0_115: 6032 cmp rsi, r10 6033 je .LBB0_1013 6034 jmp .LBB0_116 6035 .LBB0_456: 6036 xor edi, edi 6037 .LBB0_459: 6038 test r9b, 1 6039 je .LBB0_461 6040 # %bb.460: 6041 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 6042 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 6043 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 6044 psubd xmm0, xmm2 6045 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 6046 psubd xmm1, xmm2 6047 movdqu xmmword ptr [r8 + 4*rdi], 
xmm0 6048 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1 6049 .LBB0_461: 6050 cmp rsi, r10 6051 jne .LBB0_462 6052 jmp .LBB0_1013 6053 .LBB0_283: 6054 xor edi, edi 6055 .LBB0_286: 6056 test r9b, 1 6057 je .LBB0_288 6058 # %bb.287: 6059 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 6060 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 6061 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 6062 paddd xmm2, xmm0 6063 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 6064 paddd xmm0, xmm1 6065 movdqu xmmword ptr [r8 + 4*rdi], xmm2 6066 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 6067 .LBB0_288: 6068 cmp rsi, r10 6069 je .LBB0_1013 6070 jmp .LBB0_289 6071 .LBB0_629: 6072 xor edi, edi 6073 .LBB0_632: 6074 test r9b, 1 6075 je .LBB0_634 6076 # %bb.633: 6077 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 6078 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 6079 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 6080 psubd xmm0, xmm2 6081 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 6082 psubd xmm1, xmm2 6083 movdqu xmmword ptr [r8 + 4*rdi], xmm0 6084 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1 6085 .LBB0_634: 6086 cmp rsi, r10 6087 jne .LBB0_635 6088 jmp .LBB0_1013 6089 .LBB0_853: 6090 xor edi, edi 6091 .LBB0_856: 6092 test r9b, 1 6093 je .LBB0_858 6094 # %bb.857: 6095 movupd xmm0, xmmword ptr [rdx + 8*rdi] 6096 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 6097 movupd xmm2, xmmword ptr [rcx + 8*rdi] 6098 mulpd xmm2, xmm0 6099 movupd xmm0, xmmword ptr [rcx + 8*rdi + 16] 6100 mulpd xmm0, xmm1 6101 movupd xmmword ptr [r8 + 8*rdi], xmm2 6102 movupd xmmword ptr [r8 + 8*rdi + 16], xmm0 6103 .LBB0_858: 6104 cmp rsi, r10 6105 jne .LBB0_859 6106 jmp .LBB0_1013 6107 .LBB0_1003: 6108 xor edi, edi 6109 .LBB0_1006: 6110 test r9b, 1 6111 je .LBB0_1008 6112 # %bb.1007: 6113 movupd xmm0, xmmword ptr [rdx + 8*rdi] 6114 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 6115 movupd xmm2, xmmword ptr [rcx + 8*rdi] 6116 mulpd xmm2, xmm0 6117 movupd xmm0, xmmword ptr [rcx + 8*rdi + 16] 6118 mulpd xmm0, xmm1 6119 movupd xmmword ptr [r8 + 8*rdi], xmm2 6120 movupd 
xmmword ptr [r8 + 8*rdi + 16], xmm0 6121 .LBB0_1008: 6122 cmp rsi, r10 6123 jne .LBB0_1009 6124 jmp .LBB0_1013 6125 .LBB0_184: 6126 xor edi, edi 6127 .LBB0_187: 6128 test r9b, 1 6129 je .LBB0_189 6130 # %bb.188: 6131 movupd xmm0, xmmword ptr [rdx + 8*rdi] 6132 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 6133 movupd xmm2, xmmword ptr [rcx + 8*rdi] 6134 addpd xmm2, xmm0 6135 movupd xmm0, xmmword ptr [rcx + 8*rdi + 16] 6136 addpd xmm0, xmm1 6137 movupd xmmword ptr [r8 + 8*rdi], xmm2 6138 movupd xmmword ptr [r8 + 8*rdi + 16], xmm0 6139 .LBB0_189: 6140 cmp rsi, r10 6141 je .LBB0_1013 6142 jmp .LBB0_190 6143 .LBB0_530: 6144 xor edi, edi 6145 .LBB0_533: 6146 test r9b, 1 6147 je .LBB0_535 6148 # %bb.534: 6149 movupd xmm0, xmmword ptr [rdx + 8*rdi] 6150 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 6151 movupd xmm2, xmmword ptr [rcx + 8*rdi] 6152 subpd xmm0, xmm2 6153 movupd xmm2, xmmword ptr [rcx + 8*rdi + 16] 6154 subpd xmm1, xmm2 6155 movupd xmmword ptr [r8 + 8*rdi], xmm0 6156 movupd xmmword ptr [r8 + 8*rdi + 16], xmm1 6157 .LBB0_535: 6158 cmp rsi, r10 6159 jne .LBB0_536 6160 jmp .LBB0_1013 6161 .LBB0_357: 6162 xor edi, edi 6163 .LBB0_360: 6164 test r9b, 1 6165 je .LBB0_362 6166 # %bb.361: 6167 movupd xmm0, xmmword ptr [rdx + 8*rdi] 6168 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 6169 movupd xmm2, xmmword ptr [rcx + 8*rdi] 6170 addpd xmm2, xmm0 6171 movupd xmm0, xmmword ptr [rcx + 8*rdi + 16] 6172 addpd xmm0, xmm1 6173 movupd xmmword ptr [r8 + 8*rdi], xmm2 6174 movupd xmmword ptr [r8 + 8*rdi + 16], xmm0 6175 .LBB0_362: 6176 cmp rsi, r10 6177 jne .LBB0_363 6178 jmp .LBB0_1013 6179 .LBB0_703: 6180 xor edi, edi 6181 .LBB0_706: 6182 test r9b, 1 6183 je .LBB0_708 6184 # %bb.707: 6185 movupd xmm0, xmmword ptr [rdx + 8*rdi] 6186 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 6187 movupd xmm2, xmmword ptr [rcx + 8*rdi] 6188 subpd xmm0, xmm2 6189 movupd xmm2, xmmword ptr [rcx + 8*rdi + 16] 6190 subpd xmm1, xmm2 6191 movupd xmmword ptr [r8 + 8*rdi], xmm0 6192 movupd xmmword ptr [r8 + 
8*rdi + 16], xmm1 6193 .LBB0_708: 6194 cmp rsi, r10 6195 jne .LBB0_709 6196 jmp .LBB0_1013 6197 .LBB0_750: 6198 xor eax, eax 6199 .LBB0_753: 6200 test r9b, 1 6201 je .LBB0_755 6202 # %bb.754: 6203 movdqu xmm1, xmmword ptr [rdx + rax] 6204 movdqu xmm2, xmmword ptr [rdx + rax + 16] 6205 movdqu xmm3, xmmword ptr [rcx + rax] 6206 movdqu xmm0, xmmword ptr [rcx + rax + 16] 6207 pmovzxbw xmm4, xmm1 # xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 6208 punpckhbw xmm1, xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6209 pmovzxbw xmm5, xmm3 # xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 6210 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6211 pmullw xmm3, xmm1 6212 movdqa xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,255,255,255,255,255,255,255] 6213 pand xmm3, xmm1 6214 pmullw xmm5, xmm4 6215 pand xmm5, xmm1 6216 packuswb xmm5, xmm3 6217 pmovzxbw xmm3, xmm2 # xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 6218 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6219 pmovzxbw xmm4, xmm0 # xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 6220 punpckhbw xmm0, xmm0 # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6221 pmullw xmm0, xmm2 6222 pand xmm0, xmm1 6223 pmullw xmm4, xmm3 6224 pand xmm4, xmm1 6225 packuswb xmm4, xmm0 6226 movdqu xmmword ptr [r8 + rax], xmm5 6227 movdqu xmmword ptr [r8 + rax + 16], xmm4 6228 .LBB0_755: 6229 cmp rdi, r10 6230 jne .LBB0_756 6231 jmp .LBB0_1013 6232 .LBB0_900: 6233 xor eax, eax 6234 .LBB0_903: 6235 test r9b, 1 6236 je .LBB0_905 6237 # %bb.904: 6238 movdqu xmm1, xmmword ptr [rdx + rax] 6239 movdqu xmm2, xmmword ptr [rdx + rax + 16] 6240 movdqu xmm3, xmmword ptr [rcx + rax] 6241 movdqu xmm0, xmmword ptr 
[rcx + rax + 16] 6242 pmovzxbw xmm4, xmm1 # xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 6243 punpckhbw xmm1, xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6244 pmovzxbw xmm5, xmm3 # xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 6245 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6246 pmullw xmm3, xmm1 6247 movdqa xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,255,255,255,255,255,255,255] 6248 pand xmm3, xmm1 6249 pmullw xmm5, xmm4 6250 pand xmm5, xmm1 6251 packuswb xmm5, xmm3 6252 pmovzxbw xmm3, xmm2 # xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 6253 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6254 pmovzxbw xmm4, xmm0 # xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 6255 punpckhbw xmm0, xmm0 # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6256 pmullw xmm0, xmm2 6257 pand xmm0, xmm1 6258 pmullw xmm4, xmm3 6259 pand xmm4, xmm1 6260 packuswb xmm4, xmm0 6261 movdqu xmmword ptr [r8 + rax], xmm5 6262 movdqu xmmword ptr [r8 + rax + 16], xmm4 6263 .LBB0_905: 6264 cmp rdi, r10 6265 jne .LBB0_906 6266 jmp .LBB0_1013 6267 .LBB0_65: 6268 xor edi, edi 6269 .LBB0_68: 6270 test r9b, 1 6271 je .LBB0_70 6272 # %bb.69: 6273 movdqu xmm0, xmmword ptr [rdx + rdi] 6274 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 6275 movdqu xmm2, xmmword ptr [rcx + rdi] 6276 paddb xmm2, xmm0 6277 movdqu xmm0, xmmword ptr [rcx + rdi + 16] 6278 paddb xmm0, xmm1 6279 movdqu xmmword ptr [r8 + rdi], xmm2 6280 movdqu xmmword ptr [r8 + rdi + 16], xmm0 6281 .LBB0_70: 6282 cmp rsi, r10 6283 je .LBB0_1013 6284 jmp .LBB0_71 6285 .LBB0_411: 6286 xor edi, edi 6287 .LBB0_414: 6288 test r9b, 1 6289 je .LBB0_416 6290 # %bb.415: 6291 movdqu xmm0, xmmword ptr [rdx + 
rdi] 6292 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 6293 movdqu xmm2, xmmword ptr [rcx + rdi] 6294 psubb xmm0, xmm2 6295 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 6296 psubb xmm1, xmm2 6297 movdqu xmmword ptr [r8 + rdi], xmm0 6298 movdqu xmmword ptr [r8 + rdi + 16], xmm1 6299 .LBB0_416: 6300 cmp rsi, r10 6301 jne .LBB0_417 6302 jmp .LBB0_1013 6303 .LBB0_238: 6304 xor edi, edi 6305 .LBB0_241: 6306 test r9b, 1 6307 je .LBB0_243 6308 # %bb.242: 6309 movdqu xmm0, xmmword ptr [rdx + rdi] 6310 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 6311 movdqu xmm2, xmmword ptr [rcx + rdi] 6312 paddb xmm2, xmm0 6313 movdqu xmm0, xmmword ptr [rcx + rdi + 16] 6314 paddb xmm0, xmm1 6315 movdqu xmmword ptr [r8 + rdi], xmm2 6316 movdqu xmmword ptr [r8 + rdi + 16], xmm0 6317 .LBB0_243: 6318 cmp rsi, r10 6319 je .LBB0_1013 6320 jmp .LBB0_244 6321 .LBB0_584: 6322 xor edi, edi 6323 .LBB0_587: 6324 test r9b, 1 6325 je .LBB0_589 6326 # %bb.588: 6327 movdqu xmm0, xmmword ptr [rdx + rdi] 6328 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 6329 movdqu xmm2, xmmword ptr [rcx + rdi] 6330 psubb xmm0, xmm2 6331 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 6332 psubb xmm1, xmm2 6333 movdqu xmmword ptr [r8 + rdi], xmm0 6334 movdqu xmmword ptr [r8 + rdi + 16], xmm1 6335 .LBB0_589: 6336 cmp rsi, r10 6337 jne .LBB0_590 6338 jmp .LBB0_1013 6339 .LBB0_139: 6340 xor edi, edi 6341 .LBB0_142: 6342 test r9b, 1 6343 je .LBB0_144 6344 # %bb.143: 6345 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 6346 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 6347 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 6348 paddq xmm2, xmm0 6349 movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16] 6350 paddq xmm0, xmm1 6351 movdqu xmmword ptr [r8 + 8*rdi], xmm2 6352 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0 6353 .LBB0_144: 6354 cmp rsi, r10 6355 je .LBB0_1013 6356 jmp .LBB0_145 6357 .LBB0_485: 6358 xor edi, edi 6359 .LBB0_488: 6360 test r9b, 1 6361 je .LBB0_490 6362 # %bb.489: 6363 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 6364 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 
6365 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 6366 psubq xmm0, xmm2 6367 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 6368 psubq xmm1, xmm2 6369 movdqu xmmword ptr [r8 + 8*rdi], xmm0 6370 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1 6371 .LBB0_490: 6372 cmp rsi, r10 6373 jne .LBB0_491 6374 jmp .LBB0_1013 6375 .LBB0_312: 6376 xor edi, edi 6377 .LBB0_315: 6378 test r9b, 1 6379 je .LBB0_317 6380 # %bb.316: 6381 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 6382 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 6383 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 6384 paddq xmm2, xmm0 6385 movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16] 6386 paddq xmm0, xmm1 6387 movdqu xmmword ptr [r8 + 8*rdi], xmm2 6388 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0 6389 .LBB0_317: 6390 cmp rsi, r10 6391 jne .LBB0_318 6392 jmp .LBB0_1013 6393 .LBB0_658: 6394 xor edi, edi 6395 .LBB0_661: 6396 test r9b, 1 6397 je .LBB0_663 6398 # %bb.662: 6399 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 6400 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 6401 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 6402 psubq xmm0, xmm2 6403 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 6404 psubq xmm1, xmm2 6405 movdqu xmmword ptr [r8 + 8*rdi], xmm0 6406 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1 6407 .LBB0_663: 6408 cmp rsi, r10 6409 jne .LBB0_664 6410 jmp .LBB0_1013 6411 .LBB0_766: 6412 xor edi, edi 6413 .LBB0_769: 6414 test r9b, 1 6415 je .LBB0_771 6416 # %bb.770: 6417 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 6418 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 6419 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 6420 pmullw xmm2, xmm0 6421 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 6422 pmullw xmm0, xmm1 6423 movdqu xmmword ptr [r8 + 2*rdi], xmm2 6424 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 6425 .LBB0_771: 6426 cmp rsi, r10 6427 jne .LBB0_772 6428 jmp .LBB0_1013 6429 .LBB0_782: 6430 xor edi, edi 6431 .LBB0_785: 6432 test r9b, 1 6433 je .LBB0_787 6434 # %bb.786: 6435 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 6436 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 6437 movdqu xmm2, 
xmmword ptr [rcx + 2*rdi] 6438 pmullw xmm2, xmm0 6439 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 6440 pmullw xmm0, xmm1 6441 movdqu xmmword ptr [r8 + 2*rdi], xmm2 6442 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 6443 .LBB0_787: 6444 cmp rsi, r10 6445 jne .LBB0_788 6446 jmp .LBB0_1013 6447 .LBB0_916: 6448 xor edi, edi 6449 .LBB0_919: 6450 test r9b, 1 6451 je .LBB0_921 6452 # %bb.920: 6453 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 6454 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 6455 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 6456 pmullw xmm2, xmm0 6457 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 6458 pmullw xmm0, xmm1 6459 movdqu xmmword ptr [r8 + 2*rdi], xmm2 6460 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 6461 .LBB0_921: 6462 cmp rsi, r10 6463 jne .LBB0_922 6464 jmp .LBB0_1013 6465 .LBB0_932: 6466 xor edi, edi 6467 .LBB0_935: 6468 test r9b, 1 6469 je .LBB0_937 6470 # %bb.936: 6471 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 6472 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 6473 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 6474 pmullw xmm2, xmm0 6475 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 6476 pmullw xmm0, xmm1 6477 movdqu xmmword ptr [r8 + 2*rdi], xmm2 6478 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 6479 .LBB0_937: 6480 cmp rsi, r10 6481 jne .LBB0_938 6482 jmp .LBB0_1013 6483 .LBB0_81: 6484 xor edi, edi 6485 .LBB0_84: 6486 test r9b, 1 6487 je .LBB0_86 6488 # %bb.85: 6489 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 6490 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 6491 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 6492 paddw xmm2, xmm0 6493 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 6494 paddw xmm0, xmm1 6495 movdqu xmmword ptr [r8 + 2*rdi], xmm2 6496 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 6497 .LBB0_86: 6498 cmp rsi, r10 6499 je .LBB0_1013 6500 jmp .LBB0_87 6501 .LBB0_97: 6502 xor edi, edi 6503 .LBB0_100: 6504 test r9b, 1 6505 je .LBB0_102 6506 # %bb.101: 6507 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 6508 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 6509 movdqu xmm2, xmmword ptr [rcx + 
2*rdi] 6510 paddw xmm2, xmm0 6511 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 6512 paddw xmm0, xmm1 6513 movdqu xmmword ptr [r8 + 2*rdi], xmm2 6514 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 6515 .LBB0_102: 6516 cmp rsi, r10 6517 je .LBB0_1013 6518 jmp .LBB0_103 6519 .LBB0_427: 6520 xor edi, edi 6521 .LBB0_430: 6522 test r9b, 1 6523 je .LBB0_432 6524 # %bb.431: 6525 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 6526 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 6527 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 6528 psubw xmm0, xmm2 6529 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 6530 psubw xmm1, xmm2 6531 movdqu xmmword ptr [r8 + 2*rdi], xmm0 6532 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1 6533 .LBB0_432: 6534 cmp rsi, r10 6535 jne .LBB0_433 6536 jmp .LBB0_1013 6537 .LBB0_443: 6538 xor edi, edi 6539 .LBB0_446: 6540 test r9b, 1 6541 je .LBB0_448 6542 # %bb.447: 6543 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 6544 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 6545 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 6546 psubw xmm0, xmm2 6547 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 6548 psubw xmm1, xmm2 6549 movdqu xmmword ptr [r8 + 2*rdi], xmm0 6550 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1 6551 .LBB0_448: 6552 cmp rsi, r10 6553 jne .LBB0_449 6554 jmp .LBB0_1013 6555 .LBB0_254: 6556 xor edi, edi 6557 .LBB0_257: 6558 test r9b, 1 6559 je .LBB0_259 6560 # %bb.258: 6561 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 6562 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 6563 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 6564 paddw xmm2, xmm0 6565 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 6566 paddw xmm0, xmm1 6567 movdqu xmmword ptr [r8 + 2*rdi], xmm2 6568 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 6569 .LBB0_259: 6570 cmp rsi, r10 6571 je .LBB0_1013 6572 jmp .LBB0_260 6573 .LBB0_270: 6574 xor edi, edi 6575 .LBB0_273: 6576 test r9b, 1 6577 je .LBB0_275 6578 # %bb.274: 6579 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 6580 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 6581 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 6582 paddw 
xmm2, xmm0 6583 movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16] 6584 paddw xmm0, xmm1 6585 movdqu xmmword ptr [r8 + 2*rdi], xmm2 6586 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 6587 .LBB0_275: 6588 cmp rsi, r10 6589 je .LBB0_1013 6590 jmp .LBB0_276 6591 .LBB0_600: 6592 xor edi, edi 6593 .LBB0_603: 6594 test r9b, 1 6595 je .LBB0_605 6596 # %bb.604: 6597 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 6598 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 6599 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 6600 psubw xmm0, xmm2 6601 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 6602 psubw xmm1, xmm2 6603 movdqu xmmword ptr [r8 + 2*rdi], xmm0 6604 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1 6605 .LBB0_605: 6606 cmp rsi, r10 6607 jne .LBB0_606 6608 jmp .LBB0_1013 6609 .LBB0_616: 6610 xor edi, edi 6611 .LBB0_619: 6612 test r9b, 1 6613 je .LBB0_621 6614 # %bb.620: 6615 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 6616 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 6617 movdqu xmm2, xmmword ptr [rcx + 2*rdi] 6618 psubw xmm0, xmm2 6619 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 6620 psubw xmm1, xmm2 6621 movdqu xmmword ptr [r8 + 2*rdi], xmm0 6622 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1 6623 .LBB0_621: 6624 cmp rsi, r10 6625 jne .LBB0_622 6626 jmp .LBB0_1013 6627 .LBB0_840: 6628 xor edi, edi 6629 .LBB0_843: 6630 test r9b, 1 6631 je .LBB0_845 6632 # %bb.844: 6633 movups xmm0, xmmword ptr [rdx + 4*rdi] 6634 movups xmm1, xmmword ptr [rdx + 4*rdi + 16] 6635 movups xmm2, xmmword ptr [rcx + 4*rdi] 6636 mulps xmm2, xmm0 6637 movups xmm0, xmmword ptr [rcx + 4*rdi + 16] 6638 mulps xmm0, xmm1 6639 movups xmmword ptr [r8 + 4*rdi], xmm2 6640 movups xmmword ptr [r8 + 4*rdi + 16], xmm0 6641 .LBB0_845: 6642 cmp rsi, r10 6643 jne .LBB0_846 6644 jmp .LBB0_1013 6645 .LBB0_990: 6646 xor edi, edi 6647 .LBB0_993: 6648 test r9b, 1 6649 je .LBB0_995 6650 # %bb.994: 6651 movups xmm0, xmmword ptr [rdx + 4*rdi] 6652 movups xmm1, xmmword ptr [rdx + 4*rdi + 16] 6653 movups xmm2, xmmword ptr [rcx + 4*rdi] 6654 mulps xmm2, xmm0 6655 
movups xmm0, xmmword ptr [rcx + 4*rdi + 16] 6656 mulps xmm0, xmm1 6657 movups xmmword ptr [r8 + 4*rdi], xmm2 6658 movups xmmword ptr [r8 + 4*rdi + 16], xmm0 6659 .LBB0_995: 6660 cmp rsi, r10 6661 jne .LBB0_996 6662 jmp .LBB0_1013 6663 .LBB0_155: 6664 xor edi, edi 6665 .LBB0_158: 6666 test r9b, 1 6667 je .LBB0_160 6668 # %bb.159: 6669 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 6670 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 6671 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 6672 paddq xmm2, xmm0 6673 movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16] 6674 paddq xmm0, xmm1 6675 movdqu xmmword ptr [r8 + 8*rdi], xmm2 6676 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0 6677 .LBB0_160: 6678 cmp rsi, r10 6679 je .LBB0_1013 6680 jmp .LBB0_161 6681 .LBB0_171: 6682 xor edi, edi 6683 .LBB0_174: 6684 test r9b, 1 6685 je .LBB0_176 6686 # %bb.175: 6687 movups xmm0, xmmword ptr [rdx + 4*rdi] 6688 movups xmm1, xmmword ptr [rdx + 4*rdi + 16] 6689 movups xmm2, xmmword ptr [rcx + 4*rdi] 6690 addps xmm2, xmm0 6691 movups xmm0, xmmword ptr [rcx + 4*rdi + 16] 6692 addps xmm0, xmm1 6693 movups xmmword ptr [r8 + 4*rdi], xmm2 6694 movups xmmword ptr [r8 + 4*rdi + 16], xmm0 6695 .LBB0_176: 6696 cmp rsi, r10 6697 je .LBB0_1013 6698 jmp .LBB0_177 6699 .LBB0_501: 6700 xor edi, edi 6701 .LBB0_504: 6702 test r9b, 1 6703 je .LBB0_506 6704 # %bb.505: 6705 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 6706 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 6707 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 6708 psubq xmm0, xmm2 6709 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 6710 psubq xmm1, xmm2 6711 movdqu xmmword ptr [r8 + 8*rdi], xmm0 6712 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1 6713 .LBB0_506: 6714 cmp rsi, r10 6715 jne .LBB0_507 6716 jmp .LBB0_1013 6717 .LBB0_517: 6718 xor edi, edi 6719 .LBB0_520: 6720 test r9b, 1 6721 je .LBB0_522 6722 # %bb.521: 6723 movups xmm0, xmmword ptr [rdx + 4*rdi] 6724 movups xmm1, xmmword ptr [rdx + 4*rdi + 16] 6725 movups xmm2, xmmword ptr [rcx + 4*rdi] 6726 subps xmm0, xmm2 6727 movups xmm2, xmmword 
ptr [rcx + 4*rdi + 16] 6728 subps xmm1, xmm2 6729 movups xmmword ptr [r8 + 4*rdi], xmm0 6730 movups xmmword ptr [r8 + 4*rdi + 16], xmm1 6731 .LBB0_522: 6732 cmp rsi, r10 6733 jne .LBB0_523 6734 jmp .LBB0_1013 6735 .LBB0_328: 6736 xor edi, edi 6737 .LBB0_331: 6738 test r9b, 1 6739 je .LBB0_333 6740 # %bb.332: 6741 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 6742 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 6743 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 6744 paddq xmm2, xmm0 6745 movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16] 6746 paddq xmm0, xmm1 6747 movdqu xmmword ptr [r8 + 8*rdi], xmm2 6748 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0 6749 .LBB0_333: 6750 cmp rsi, r10 6751 jne .LBB0_334 6752 jmp .LBB0_1013 6753 .LBB0_344: 6754 xor edi, edi 6755 .LBB0_347: 6756 test r9b, 1 6757 je .LBB0_349 6758 # %bb.348: 6759 movups xmm0, xmmword ptr [rdx + 4*rdi] 6760 movups xmm1, xmmword ptr [rdx + 4*rdi + 16] 6761 movups xmm2, xmmword ptr [rcx + 4*rdi] 6762 addps xmm2, xmm0 6763 movups xmm0, xmmword ptr [rcx + 4*rdi + 16] 6764 addps xmm0, xmm1 6765 movups xmmword ptr [r8 + 4*rdi], xmm2 6766 movups xmmword ptr [r8 + 4*rdi + 16], xmm0 6767 .LBB0_349: 6768 cmp rsi, r10 6769 jne .LBB0_350 6770 jmp .LBB0_1013 6771 .LBB0_674: 6772 xor edi, edi 6773 .LBB0_677: 6774 test r9b, 1 6775 je .LBB0_679 6776 # %bb.678: 6777 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 6778 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 6779 movdqu xmm2, xmmword ptr [rcx + 8*rdi] 6780 psubq xmm0, xmm2 6781 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 6782 psubq xmm1, xmm2 6783 movdqu xmmword ptr [r8 + 8*rdi], xmm0 6784 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1 6785 .LBB0_679: 6786 cmp rsi, r10 6787 jne .LBB0_680 6788 jmp .LBB0_1013 6789 .LBB0_690: 6790 xor edi, edi 6791 .LBB0_693: 6792 test r9b, 1 6793 je .LBB0_695 6794 # %bb.694: 6795 movups xmm0, xmmword ptr [rdx + 4*rdi] 6796 movups xmm1, xmmword ptr [rdx + 4*rdi + 16] 6797 movups xmm2, xmmword ptr [rcx + 4*rdi] 6798 subps xmm0, xmm2 6799 movups xmm2, xmmword ptr [rcx + 4*rdi + 
16] 6800 subps xmm1, xmm2 6801 movups xmmword ptr [r8 + 4*rdi], xmm0 6802 movups xmmword ptr [r8 + 4*rdi + 16], xmm1 6803 .LBB0_695: 6804 cmp rsi, r10 6805 jne .LBB0_696 6806 jmp .LBB0_1013 6807 .LBB0_737: 6808 xor eax, eax 6809 .LBB0_740: 6810 test r9b, 1 6811 je .LBB0_742 6812 # %bb.741: 6813 movdqu xmm1, xmmword ptr [rdx + rax] 6814 movdqu xmm2, xmmword ptr [rdx + rax + 16] 6815 movdqu xmm3, xmmword ptr [rcx + rax] 6816 movdqu xmm0, xmmword ptr [rcx + rax + 16] 6817 pmovzxbw xmm4, xmm1 # xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 6818 punpckhbw xmm1, xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6819 pmovzxbw xmm5, xmm3 # xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 6820 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6821 pmullw xmm3, xmm1 6822 movdqa xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,255,255,255,255,255,255,255] 6823 pand xmm3, xmm1 6824 pmullw xmm5, xmm4 6825 pand xmm5, xmm1 6826 packuswb xmm5, xmm3 6827 pmovzxbw xmm3, xmm2 # xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 6828 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6829 pmovzxbw xmm4, xmm0 # xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 6830 punpckhbw xmm0, xmm0 # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6831 pmullw xmm0, xmm2 6832 pand xmm0, xmm1 6833 pmullw xmm4, xmm3 6834 pand xmm4, xmm1 6835 packuswb xmm4, xmm0 6836 movdqu xmmword ptr [r8 + rax], xmm5 6837 movdqu xmmword ptr [r8 + rax + 16], xmm4 6838 .LBB0_742: 6839 cmp rdi, r10 6840 jne .LBB0_743 6841 jmp .LBB0_1013 6842 .LBB0_887: 6843 xor eax, eax 6844 .LBB0_890: 6845 test r9b, 1 6846 je .LBB0_892 6847 # %bb.891: 6848 movdqu xmm1, xmmword ptr [rdx + rax] 6849 movdqu xmm2, 
xmmword ptr [rdx + rax + 16] 6850 movdqu xmm3, xmmword ptr [rcx + rax] 6851 movdqu xmm0, xmmword ptr [rcx + rax + 16] 6852 pmovzxbw xmm4, xmm1 # xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 6853 punpckhbw xmm1, xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6854 pmovzxbw xmm5, xmm3 # xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 6855 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6856 pmullw xmm3, xmm1 6857 movdqa xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,255,255,255,255,255,255,255] 6858 pand xmm3, xmm1 6859 pmullw xmm5, xmm4 6860 pand xmm5, xmm1 6861 packuswb xmm5, xmm3 6862 pmovzxbw xmm3, xmm2 # xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 6863 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6864 pmovzxbw xmm4, xmm0 # xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 6865 punpckhbw xmm0, xmm0 # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 6866 pmullw xmm0, xmm2 6867 pand xmm0, xmm1 6868 pmullw xmm4, xmm3 6869 pand xmm4, xmm1 6870 packuswb xmm4, xmm0 6871 movdqu xmmword ptr [r8 + rax], xmm5 6872 movdqu xmmword ptr [r8 + rax + 16], xmm4 6873 .LBB0_892: 6874 cmp rdi, r10 6875 jne .LBB0_893 6876 jmp .LBB0_1013 6877 .LBB0_52: 6878 xor edi, edi 6879 .LBB0_55: 6880 test r9b, 1 6881 je .LBB0_57 6882 # %bb.56: 6883 movdqu xmm0, xmmword ptr [rdx + rdi] 6884 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 6885 movdqu xmm2, xmmword ptr [rcx + rdi] 6886 paddb xmm2, xmm0 6887 movdqu xmm0, xmmword ptr [rcx + rdi + 16] 6888 paddb xmm0, xmm1 6889 movdqu xmmword ptr [r8 + rdi], xmm2 6890 movdqu xmmword ptr [r8 + rdi + 16], xmm0 6891 .LBB0_57: 6892 cmp rsi, r10 6893 je .LBB0_1013 6894 jmp .LBB0_58 6895 .LBB0_398: 6896 xor edi, edi 
6897 .LBB0_401: 6898 test r9b, 1 6899 je .LBB0_403 6900 # %bb.402: 6901 movdqu xmm0, xmmword ptr [rdx + rdi] 6902 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 6903 movdqu xmm2, xmmword ptr [rcx + rdi] 6904 psubb xmm0, xmm2 6905 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 6906 psubb xmm1, xmm2 6907 movdqu xmmword ptr [r8 + rdi], xmm0 6908 movdqu xmmword ptr [r8 + rdi + 16], xmm1 6909 .LBB0_403: 6910 cmp rsi, r10 6911 jne .LBB0_404 6912 jmp .LBB0_1013 6913 .LBB0_225: 6914 xor edi, edi 6915 .LBB0_228: 6916 test r9b, 1 6917 je .LBB0_230 6918 # %bb.229: 6919 movdqu xmm0, xmmword ptr [rdx + rdi] 6920 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 6921 movdqu xmm2, xmmword ptr [rcx + rdi] 6922 paddb xmm2, xmm0 6923 movdqu xmm0, xmmword ptr [rcx + rdi + 16] 6924 paddb xmm0, xmm1 6925 movdqu xmmword ptr [r8 + rdi], xmm2 6926 movdqu xmmword ptr [r8 + rdi + 16], xmm0 6927 .LBB0_230: 6928 cmp rsi, r10 6929 je .LBB0_1013 6930 jmp .LBB0_231 6931 .LBB0_571: 6932 xor edi, edi 6933 .LBB0_574: 6934 test r9b, 1 6935 je .LBB0_576 6936 # %bb.575: 6937 movdqu xmm0, xmmword ptr [rdx + rdi] 6938 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 6939 movdqu xmm2, xmmword ptr [rcx + rdi] 6940 psubb xmm0, xmm2 6941 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 6942 psubb xmm1, xmm2 6943 movdqu xmmword ptr [r8 + rdi], xmm0 6944 movdqu xmmword ptr [r8 + rdi + 16], xmm1 6945 .LBB0_576: 6946 cmp rsi, r10 6947 jne .LBB0_577 6948 jmp .LBB0_1013 6949 .LBB0_811: 6950 xor edi, edi 6951 .LBB0_814: 6952 test r9b, 1 6953 je .LBB0_816 6954 # %bb.815: 6955 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 6956 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 6957 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 6958 pmulld xmm2, xmm0 6959 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 6960 pmulld xmm0, xmm1 6961 movdqu xmmword ptr [r8 + 4*rdi], xmm2 6962 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 6963 .LBB0_816: 6964 cmp rsi, r10 6965 jne .LBB0_817 6966 jmp .LBB0_1013 6967 .LBB0_961: 6968 xor edi, edi 6969 .LBB0_964: 6970 test r9b, 1 6971 je .LBB0_966 
6972 # %bb.965: 6973 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 6974 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 6975 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 6976 pmulld xmm2, xmm0 6977 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 6978 pmulld xmm0, xmm1 6979 movdqu xmmword ptr [r8 + 4*rdi], xmm2 6980 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 6981 .LBB0_966: 6982 cmp rsi, r10 6983 jne .LBB0_967 6984 jmp .LBB0_1013 6985 .LBB0_126: 6986 xor edi, edi 6987 .LBB0_129: 6988 test r9b, 1 6989 je .LBB0_131 6990 # %bb.130: 6991 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 6992 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 6993 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 6994 paddd xmm2, xmm0 6995 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 6996 paddd xmm0, xmm1 6997 movdqu xmmword ptr [r8 + 4*rdi], xmm2 6998 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 6999 .LBB0_131: 7000 cmp rsi, r10 7001 je .LBB0_1013 7002 jmp .LBB0_132 7003 .LBB0_472: 7004 xor edi, edi 7005 .LBB0_475: 7006 test r9b, 1 7007 je .LBB0_477 7008 # %bb.476: 7009 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 7010 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 7011 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 7012 psubd xmm0, xmm2 7013 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 7014 psubd xmm1, xmm2 7015 movdqu xmmword ptr [r8 + 4*rdi], xmm0 7016 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1 7017 .LBB0_477: 7018 cmp rsi, r10 7019 jne .LBB0_478 7020 jmp .LBB0_1013 7021 .LBB0_299: 7022 xor edi, edi 7023 .LBB0_302: 7024 test r9b, 1 7025 je .LBB0_304 7026 # %bb.303: 7027 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 7028 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 7029 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 7030 paddd xmm2, xmm0 7031 movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16] 7032 paddd xmm0, xmm1 7033 movdqu xmmword ptr [r8 + 4*rdi], xmm2 7034 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 7035 .LBB0_304: 7036 cmp rsi, r10 7037 je .LBB0_1013 7038 jmp .LBB0_305 7039 .LBB0_645: 7040 xor edi, edi 7041 .LBB0_648: 7042 test r9b, 1 7043 je .LBB0_650 7044 # %bb.649: 
7045 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 7046 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 7047 movdqu xmm2, xmmword ptr [rcx + 4*rdi] 7048 psubd xmm0, xmm2 7049 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 7050 psubd xmm1, xmm2 7051 movdqu xmmword ptr [r8 + 4*rdi], xmm0 7052 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1 7053 .LBB0_650: 7054 cmp rsi, r10 7055 jne .LBB0_651 7056 jmp .LBB0_1013 7057 .Lfunc_end0: 7058 .size arithmetic_binary_sse4, .Lfunc_end0-arithmetic_binary_sse4 7059 # -- End function 7060 .section .rodata.cst16,"aM",@progbits,16 7061 .p2align 4 # -- Begin function arithmetic_arr_scalar_sse4 7062 .LCPI1_0: 7063 .short 255 # 0xff 7064 .short 255 # 0xff 7065 .short 255 # 0xff 7066 .short 255 # 0xff 7067 .short 255 # 0xff 7068 .short 255 # 0xff 7069 .short 255 # 0xff 7070 .short 255 # 0xff 7071 .text 7072 .globl arithmetic_arr_scalar_sse4 7073 .p2align 4, 0x90 7074 .type arithmetic_arr_scalar_sse4,@function 7075 arithmetic_arr_scalar_sse4: # @arithmetic_arr_scalar_sse4 7076 # %bb.0: 7077 push rbp 7078 mov rbp, rsp 7079 and rsp, -8 7080 cmp sil, 20 7081 jg .LBB1_12 7082 # %bb.1: 7083 test sil, sil 7084 je .LBB1_23 7085 # %bb.2: 7086 cmp sil, 1 7087 je .LBB1_31 7088 # %bb.3: 7089 cmp sil, 2 7090 jne .LBB1_1069 7091 # %bb.4: 7092 cmp edi, 6 7093 jg .LBB1_55 7094 # %bb.5: 7095 cmp edi, 3 7096 jle .LBB1_97 7097 # %bb.6: 7098 cmp edi, 4 7099 je .LBB1_157 7100 # %bb.7: 7101 cmp edi, 5 7102 je .LBB1_160 7103 # %bb.8: 7104 cmp edi, 6 7105 jne .LBB1_1069 7106 # %bb.9: 7107 test r9d, r9d 7108 jle .LBB1_1069 7109 # %bb.10: 7110 mov eax, dword ptr [rcx] 7111 mov r10d, r9d 7112 cmp r9d, 8 7113 jb .LBB1_11 7114 # %bb.265: 7115 lea rcx, [rdx + 4*r10] 7116 cmp rcx, r8 7117 jbe .LBB1_453 7118 # %bb.266: 7119 lea rcx, [r8 + 4*r10] 7120 cmp rcx, rdx 7121 jbe .LBB1_453 7122 .LBB1_11: 7123 xor esi, esi 7124 .LBB1_625: 7125 mov r9, rsi 7126 not r9 7127 add r9, r10 7128 mov rdi, r10 7129 and rdi, 3 7130 je .LBB1_627 7131 .LBB1_626: # =>This Inner Loop Header: Depth=1 7132 mov 
ecx, dword ptr [rdx + 4*rsi] 7133 imul ecx, eax 7134 mov dword ptr [r8 + 4*rsi], ecx 7135 add rsi, 1 7136 add rdi, -1 7137 jne .LBB1_626 7138 .LBB1_627: 7139 cmp r9, 3 7140 jb .LBB1_1069 7141 .LBB1_628: # =>This Inner Loop Header: Depth=1 7142 mov ecx, dword ptr [rdx + 4*rsi] 7143 imul ecx, eax 7144 mov dword ptr [r8 + 4*rsi], ecx 7145 mov ecx, dword ptr [rdx + 4*rsi + 4] 7146 imul ecx, eax 7147 mov dword ptr [r8 + 4*rsi + 4], ecx 7148 mov ecx, dword ptr [rdx + 4*rsi + 8] 7149 imul ecx, eax 7150 mov dword ptr [r8 + 4*rsi + 8], ecx 7151 mov ecx, dword ptr [rdx + 4*rsi + 12] 7152 imul ecx, eax 7153 mov dword ptr [r8 + 4*rsi + 12], ecx 7154 add rsi, 4 7155 cmp r10, rsi 7156 jne .LBB1_628 7157 jmp .LBB1_1069 7158 .LBB1_12: 7159 cmp sil, 21 7160 je .LBB1_39 7161 # %bb.13: 7162 cmp sil, 22 7163 je .LBB1_47 7164 # %bb.14: 7165 cmp sil, 23 7166 jne .LBB1_1069 7167 # %bb.15: 7168 cmp edi, 6 7169 jg .LBB1_62 7170 # %bb.16: 7171 cmp edi, 3 7172 jle .LBB1_102 7173 # %bb.17: 7174 cmp edi, 4 7175 je .LBB1_163 7176 # %bb.18: 7177 cmp edi, 5 7178 je .LBB1_166 7179 # %bb.19: 7180 cmp edi, 6 7181 jne .LBB1_1069 7182 # %bb.20: 7183 test r9d, r9d 7184 jle .LBB1_1069 7185 # %bb.21: 7186 mov eax, dword ptr [rcx] 7187 mov r10d, r9d 7188 cmp r9d, 8 7189 jb .LBB1_22 7190 # %bb.268: 7191 lea rcx, [rdx + 4*r10] 7192 cmp rcx, r8 7193 jbe .LBB1_456 7194 # %bb.269: 7195 lea rcx, [r8 + 4*r10] 7196 cmp rcx, rdx 7197 jbe .LBB1_456 7198 .LBB1_22: 7199 xor esi, esi 7200 .LBB1_633: 7201 mov r9, rsi 7202 not r9 7203 add r9, r10 7204 mov rdi, r10 7205 and rdi, 3 7206 je .LBB1_635 7207 .LBB1_634: # =>This Inner Loop Header: Depth=1 7208 mov ecx, dword ptr [rdx + 4*rsi] 7209 imul ecx, eax 7210 mov dword ptr [r8 + 4*rsi], ecx 7211 add rsi, 1 7212 add rdi, -1 7213 jne .LBB1_634 7214 .LBB1_635: 7215 cmp r9, 3 7216 jb .LBB1_1069 7217 .LBB1_636: # =>This Inner Loop Header: Depth=1 7218 mov ecx, dword ptr [rdx + 4*rsi] 7219 imul ecx, eax 7220 mov dword ptr [r8 + 4*rsi], ecx 7221 mov ecx, dword ptr [rdx + 4*rsi 
+ 4] 7222 imul ecx, eax 7223 mov dword ptr [r8 + 4*rsi + 4], ecx 7224 mov ecx, dword ptr [rdx + 4*rsi + 8] 7225 imul ecx, eax 7226 mov dword ptr [r8 + 4*rsi + 8], ecx 7227 mov ecx, dword ptr [rdx + 4*rsi + 12] 7228 imul ecx, eax 7229 mov dword ptr [r8 + 4*rsi + 12], ecx 7230 add rsi, 4 7231 cmp r10, rsi 7232 jne .LBB1_636 7233 jmp .LBB1_1069 7234 .LBB1_23: 7235 cmp edi, 6 7236 jg .LBB1_69 7237 # %bb.24: 7238 cmp edi, 3 7239 jle .LBB1_107 7240 # %bb.25: 7241 cmp edi, 4 7242 je .LBB1_169 7243 # %bb.26: 7244 cmp edi, 5 7245 je .LBB1_172 7246 # %bb.27: 7247 cmp edi, 6 7248 jne .LBB1_1069 7249 # %bb.28: 7250 test r9d, r9d 7251 jle .LBB1_1069 7252 # %bb.29: 7253 mov eax, dword ptr [rcx] 7254 mov r10d, r9d 7255 cmp r9d, 8 7256 jb .LBB1_30 7257 # %bb.271: 7258 lea rcx, [rdx + 4*r10] 7259 cmp rcx, r8 7260 jbe .LBB1_459 7261 # %bb.272: 7262 lea rcx, [r8 + 4*r10] 7263 cmp rcx, rdx 7264 jbe .LBB1_459 7265 .LBB1_30: 7266 xor esi, esi 7267 .LBB1_641: 7268 mov r9, rsi 7269 not r9 7270 add r9, r10 7271 mov rdi, r10 7272 and rdi, 3 7273 je .LBB1_643 7274 .LBB1_642: # =>This Inner Loop Header: Depth=1 7275 mov ecx, dword ptr [rdx + 4*rsi] 7276 add ecx, eax 7277 mov dword ptr [r8 + 4*rsi], ecx 7278 add rsi, 1 7279 add rdi, -1 7280 jne .LBB1_642 7281 .LBB1_643: 7282 cmp r9, 3 7283 jb .LBB1_1069 7284 .LBB1_644: # =>This Inner Loop Header: Depth=1 7285 mov ecx, dword ptr [rdx + 4*rsi] 7286 add ecx, eax 7287 mov dword ptr [r8 + 4*rsi], ecx 7288 mov ecx, dword ptr [rdx + 4*rsi + 4] 7289 add ecx, eax 7290 mov dword ptr [r8 + 4*rsi + 4], ecx 7291 mov ecx, dword ptr [rdx + 4*rsi + 8] 7292 add ecx, eax 7293 mov dword ptr [r8 + 4*rsi + 8], ecx 7294 mov ecx, dword ptr [rdx + 4*rsi + 12] 7295 add ecx, eax 7296 mov dword ptr [r8 + 4*rsi + 12], ecx 7297 add rsi, 4 7298 cmp r10, rsi 7299 jne .LBB1_644 7300 jmp .LBB1_1069 7301 .LBB1_31: 7302 cmp edi, 6 7303 jg .LBB1_76 7304 # %bb.32: 7305 cmp edi, 3 7306 jle .LBB1_112 7307 # %bb.33: 7308 cmp edi, 4 7309 je .LBB1_175 7310 # %bb.34: 7311 cmp edi, 5 
7312 je .LBB1_178 7313 # %bb.35: 7314 cmp edi, 6 7315 jne .LBB1_1069 7316 # %bb.36: 7317 test r9d, r9d 7318 jle .LBB1_1069 7319 # %bb.37: 7320 mov eax, dword ptr [rcx] 7321 mov r10d, r9d 7322 cmp r9d, 8 7323 jb .LBB1_38 7324 # %bb.274: 7325 lea rcx, [rdx + 4*r10] 7326 cmp rcx, r8 7327 jbe .LBB1_462 7328 # %bb.275: 7329 lea rcx, [r8 + 4*r10] 7330 cmp rcx, rdx 7331 jbe .LBB1_462 7332 .LBB1_38: 7333 xor esi, esi 7334 .LBB1_649: 7335 mov r9, rsi 7336 not r9 7337 add r9, r10 7338 mov rdi, r10 7339 and rdi, 3 7340 je .LBB1_651 7341 .LBB1_650: # =>This Inner Loop Header: Depth=1 7342 mov ecx, dword ptr [rdx + 4*rsi] 7343 sub ecx, eax 7344 mov dword ptr [r8 + 4*rsi], ecx 7345 add rsi, 1 7346 add rdi, -1 7347 jne .LBB1_650 7348 .LBB1_651: 7349 cmp r9, 3 7350 jb .LBB1_1069 7351 .LBB1_652: # =>This Inner Loop Header: Depth=1 7352 mov ecx, dword ptr [rdx + 4*rsi] 7353 sub ecx, eax 7354 mov dword ptr [r8 + 4*rsi], ecx 7355 mov ecx, dword ptr [rdx + 4*rsi + 4] 7356 sub ecx, eax 7357 mov dword ptr [r8 + 4*rsi + 4], ecx 7358 mov ecx, dword ptr [rdx + 4*rsi + 8] 7359 sub ecx, eax 7360 mov dword ptr [r8 + 4*rsi + 8], ecx 7361 mov ecx, dword ptr [rdx + 4*rsi + 12] 7362 sub ecx, eax 7363 mov dword ptr [r8 + 4*rsi + 12], ecx 7364 add rsi, 4 7365 cmp r10, rsi 7366 jne .LBB1_652 7367 jmp .LBB1_1069 7368 .LBB1_39: 7369 cmp edi, 6 7370 jg .LBB1_83 7371 # %bb.40: 7372 cmp edi, 3 7373 jle .LBB1_117 7374 # %bb.41: 7375 cmp edi, 4 7376 je .LBB1_181 7377 # %bb.42: 7378 cmp edi, 5 7379 je .LBB1_184 7380 # %bb.43: 7381 cmp edi, 6 7382 jne .LBB1_1069 7383 # %bb.44: 7384 test r9d, r9d 7385 jle .LBB1_1069 7386 # %bb.45: 7387 mov eax, dword ptr [rcx] 7388 mov r10d, r9d 7389 cmp r9d, 8 7390 jb .LBB1_46 7391 # %bb.277: 7392 lea rcx, [rdx + 4*r10] 7393 cmp rcx, r8 7394 jbe .LBB1_465 7395 # %bb.278: 7396 lea rcx, [r8 + 4*r10] 7397 cmp rcx, rdx 7398 jbe .LBB1_465 7399 .LBB1_46: 7400 xor esi, esi 7401 .LBB1_657: 7402 mov r9, rsi 7403 not r9 7404 add r9, r10 7405 mov rdi, r10 7406 and rdi, 3 7407 je 
.LBB1_659 7408 .LBB1_658: # =>This Inner Loop Header: Depth=1 7409 mov ecx, dword ptr [rdx + 4*rsi] 7410 add ecx, eax 7411 mov dword ptr [r8 + 4*rsi], ecx 7412 add rsi, 1 7413 add rdi, -1 7414 jne .LBB1_658 7415 .LBB1_659: 7416 cmp r9, 3 7417 jb .LBB1_1069 7418 .LBB1_660: # =>This Inner Loop Header: Depth=1 7419 mov ecx, dword ptr [rdx + 4*rsi] 7420 add ecx, eax 7421 mov dword ptr [r8 + 4*rsi], ecx 7422 mov ecx, dword ptr [rdx + 4*rsi + 4] 7423 add ecx, eax 7424 mov dword ptr [r8 + 4*rsi + 4], ecx 7425 mov ecx, dword ptr [rdx + 4*rsi + 8] 7426 add ecx, eax 7427 mov dword ptr [r8 + 4*rsi + 8], ecx 7428 mov ecx, dword ptr [rdx + 4*rsi + 12] 7429 add ecx, eax 7430 mov dword ptr [r8 + 4*rsi + 12], ecx 7431 add rsi, 4 7432 cmp r10, rsi 7433 jne .LBB1_660 7434 jmp .LBB1_1069 7435 .LBB1_47: 7436 cmp edi, 6 7437 jg .LBB1_90 7438 # %bb.48: 7439 cmp edi, 3 7440 jle .LBB1_122 7441 # %bb.49: 7442 cmp edi, 4 7443 je .LBB1_187 7444 # %bb.50: 7445 cmp edi, 5 7446 je .LBB1_190 7447 # %bb.51: 7448 cmp edi, 6 7449 jne .LBB1_1069 7450 # %bb.52: 7451 test r9d, r9d 7452 jle .LBB1_1069 7453 # %bb.53: 7454 mov eax, dword ptr [rcx] 7455 mov r10d, r9d 7456 cmp r9d, 8 7457 jb .LBB1_54 7458 # %bb.280: 7459 lea rcx, [rdx + 4*r10] 7460 cmp rcx, r8 7461 jbe .LBB1_468 7462 # %bb.281: 7463 lea rcx, [r8 + 4*r10] 7464 cmp rcx, rdx 7465 jbe .LBB1_468 7466 .LBB1_54: 7467 xor esi, esi 7468 .LBB1_665: 7469 mov r9, rsi 7470 not r9 7471 add r9, r10 7472 mov rdi, r10 7473 and rdi, 3 7474 je .LBB1_667 7475 .LBB1_666: # =>This Inner Loop Header: Depth=1 7476 mov ecx, dword ptr [rdx + 4*rsi] 7477 sub ecx, eax 7478 mov dword ptr [r8 + 4*rsi], ecx 7479 add rsi, 1 7480 add rdi, -1 7481 jne .LBB1_666 7482 .LBB1_667: 7483 cmp r9, 3 7484 jb .LBB1_1069 7485 .LBB1_668: # =>This Inner Loop Header: Depth=1 7486 mov ecx, dword ptr [rdx + 4*rsi] 7487 sub ecx, eax 7488 mov dword ptr [r8 + 4*rsi], ecx 7489 mov ecx, dword ptr [rdx + 4*rsi + 4] 7490 sub ecx, eax 7491 mov dword ptr [r8 + 4*rsi + 4], ecx 7492 mov ecx, dword 
ptr [rdx + 4*rsi + 8] 7493 sub ecx, eax 7494 mov dword ptr [r8 + 4*rsi + 8], ecx 7495 mov ecx, dword ptr [rdx + 4*rsi + 12] 7496 sub ecx, eax 7497 mov dword ptr [r8 + 4*rsi + 12], ecx 7498 add rsi, 4 7499 cmp r10, rsi 7500 jne .LBB1_668 7501 jmp .LBB1_1069 7502 .LBB1_55: 7503 cmp edi, 8 7504 jle .LBB1_127 7505 # %bb.56: 7506 cmp edi, 9 7507 je .LBB1_193 7508 # %bb.57: 7509 cmp edi, 11 7510 je .LBB1_196 7511 # %bb.58: 7512 cmp edi, 12 7513 jne .LBB1_1069 7514 # %bb.59: 7515 test r9d, r9d 7516 jle .LBB1_1069 7517 # %bb.60: 7518 movsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero 7519 mov eax, r9d 7520 cmp r9d, 4 7521 jb .LBB1_61 7522 # %bb.283: 7523 lea rcx, [rdx + 8*rax] 7524 cmp rcx, r8 7525 jbe .LBB1_471 7526 # %bb.284: 7527 lea rcx, [r8 + 8*rax] 7528 cmp rcx, rdx 7529 jbe .LBB1_471 7530 .LBB1_61: 7531 xor ecx, ecx 7532 .LBB1_673: 7533 mov rsi, rcx 7534 not rsi 7535 add rsi, rax 7536 mov rdi, rax 7537 and rdi, 3 7538 je .LBB1_675 7539 .LBB1_674: # =>This Inner Loop Header: Depth=1 7540 movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero 7541 mulsd xmm1, xmm0 7542 movsd qword ptr [r8 + 8*rcx], xmm1 7543 add rcx, 1 7544 add rdi, -1 7545 jne .LBB1_674 7546 .LBB1_675: 7547 cmp rsi, 3 7548 jb .LBB1_1069 7549 .LBB1_676: # =>This Inner Loop Header: Depth=1 7550 movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero 7551 mulsd xmm1, xmm0 7552 movsd qword ptr [r8 + 8*rcx], xmm1 7553 movsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero 7554 mulsd xmm1, xmm0 7555 movsd qword ptr [r8 + 8*rcx + 8], xmm1 7556 movsd xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero 7557 mulsd xmm1, xmm0 7558 movsd qword ptr [r8 + 8*rcx + 16], xmm1 7559 movsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero 7560 mulsd xmm1, xmm0 7561 movsd qword ptr [r8 + 8*rcx + 24], xmm1 7562 add rcx, 4 7563 cmp rax, rcx 7564 jne .LBB1_676 7565 jmp .LBB1_1069 7566 .LBB1_62: 7567 cmp edi, 8 7568 jle .LBB1_132 7569 # %bb.63: 7570 cmp edi, 9 7571 je .LBB1_199 7572 # %bb.64: 7573 cmp edi, 
11 7574 je .LBB1_202 7575 # %bb.65: 7576 cmp edi, 12 7577 jne .LBB1_1069 7578 # %bb.66: 7579 test r9d, r9d 7580 jle .LBB1_1069 7581 # %bb.67: 7582 movsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero 7583 mov eax, r9d 7584 cmp r9d, 4 7585 jb .LBB1_68 7586 # %bb.286: 7587 lea rcx, [rdx + 8*rax] 7588 cmp rcx, r8 7589 jbe .LBB1_474 7590 # %bb.287: 7591 lea rcx, [r8 + 8*rax] 7592 cmp rcx, rdx 7593 jbe .LBB1_474 7594 .LBB1_68: 7595 xor ecx, ecx 7596 .LBB1_681: 7597 mov rsi, rcx 7598 not rsi 7599 add rsi, rax 7600 mov rdi, rax 7601 and rdi, 3 7602 je .LBB1_683 7603 .LBB1_682: # =>This Inner Loop Header: Depth=1 7604 movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero 7605 mulsd xmm1, xmm0 7606 movsd qword ptr [r8 + 8*rcx], xmm1 7607 add rcx, 1 7608 add rdi, -1 7609 jne .LBB1_682 7610 .LBB1_683: 7611 cmp rsi, 3 7612 jb .LBB1_1069 7613 .LBB1_684: # =>This Inner Loop Header: Depth=1 7614 movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero 7615 mulsd xmm1, xmm0 7616 movsd qword ptr [r8 + 8*rcx], xmm1 7617 movsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero 7618 mulsd xmm1, xmm0 7619 movsd qword ptr [r8 + 8*rcx + 8], xmm1 7620 movsd xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero 7621 mulsd xmm1, xmm0 7622 movsd qword ptr [r8 + 8*rcx + 16], xmm1 7623 movsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero 7624 mulsd xmm1, xmm0 7625 movsd qword ptr [r8 + 8*rcx + 24], xmm1 7626 add rcx, 4 7627 cmp rax, rcx 7628 jne .LBB1_684 7629 jmp .LBB1_1069 7630 .LBB1_69: 7631 cmp edi, 8 7632 jle .LBB1_137 7633 # %bb.70: 7634 cmp edi, 9 7635 je .LBB1_205 7636 # %bb.71: 7637 cmp edi, 11 7638 je .LBB1_208 7639 # %bb.72: 7640 cmp edi, 12 7641 jne .LBB1_1069 7642 # %bb.73: 7643 test r9d, r9d 7644 jle .LBB1_1069 7645 # %bb.74: 7646 movsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero 7647 mov eax, r9d 7648 cmp r9d, 4 7649 jb .LBB1_75 7650 # %bb.289: 7651 lea rcx, [rdx + 8*rax] 7652 cmp rcx, r8 7653 jbe .LBB1_477 7654 # %bb.290: 7655 lea rcx, [r8 + 8*rax] 7656 cmp rcx, 
rdx 7657 jbe .LBB1_477 7658 .LBB1_75: 7659 xor ecx, ecx 7660 .LBB1_689: 7661 mov rsi, rcx 7662 not rsi 7663 add rsi, rax 7664 mov rdi, rax 7665 and rdi, 3 7666 je .LBB1_691 7667 .LBB1_690: # =>This Inner Loop Header: Depth=1 7668 movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero 7669 addsd xmm1, xmm0 7670 movsd qword ptr [r8 + 8*rcx], xmm1 7671 add rcx, 1 7672 add rdi, -1 7673 jne .LBB1_690 7674 .LBB1_691: 7675 cmp rsi, 3 7676 jb .LBB1_1069 7677 .LBB1_692: # =>This Inner Loop Header: Depth=1 7678 movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero 7679 addsd xmm1, xmm0 7680 movsd qword ptr [r8 + 8*rcx], xmm1 7681 movsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero 7682 addsd xmm1, xmm0 7683 movsd qword ptr [r8 + 8*rcx + 8], xmm1 7684 movsd xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero 7685 addsd xmm1, xmm0 7686 movsd qword ptr [r8 + 8*rcx + 16], xmm1 7687 movsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero 7688 addsd xmm1, xmm0 7689 movsd qword ptr [r8 + 8*rcx + 24], xmm1 7690 add rcx, 4 7691 cmp rax, rcx 7692 jne .LBB1_692 7693 jmp .LBB1_1069 7694 .LBB1_76: 7695 cmp edi, 8 7696 jle .LBB1_142 7697 # %bb.77: 7698 cmp edi, 9 7699 je .LBB1_211 7700 # %bb.78: 7701 cmp edi, 11 7702 je .LBB1_214 7703 # %bb.79: 7704 cmp edi, 12 7705 jne .LBB1_1069 7706 # %bb.80: 7707 test r9d, r9d 7708 jle .LBB1_1069 7709 # %bb.81: 7710 movsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero 7711 mov eax, r9d 7712 cmp r9d, 4 7713 jb .LBB1_82 7714 # %bb.292: 7715 lea rcx, [rdx + 8*rax] 7716 cmp rcx, r8 7717 jbe .LBB1_480 7718 # %bb.293: 7719 lea rcx, [r8 + 8*rax] 7720 cmp rcx, rdx 7721 jbe .LBB1_480 7722 .LBB1_82: 7723 xor ecx, ecx 7724 .LBB1_697: 7725 mov rsi, rcx 7726 not rsi 7727 add rsi, rax 7728 mov rdi, rax 7729 and rdi, 3 7730 je .LBB1_699 7731 .LBB1_698: # =>This Inner Loop Header: Depth=1 7732 movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero 7733 subsd xmm1, xmm0 7734 movsd qword ptr [r8 + 8*rcx], xmm1 7735 add rcx, 1 7736 add rdi, -1 7737 
jne .LBB1_698 7738 .LBB1_699: 7739 cmp rsi, 3 7740 jb .LBB1_1069 7741 .LBB1_700: # =>This Inner Loop Header: Depth=1 7742 movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero 7743 subsd xmm1, xmm0 7744 movsd qword ptr [r8 + 8*rcx], xmm1 7745 movsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero 7746 subsd xmm1, xmm0 7747 movsd qword ptr [r8 + 8*rcx + 8], xmm1 7748 movsd xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero 7749 subsd xmm1, xmm0 7750 movsd qword ptr [r8 + 8*rcx + 16], xmm1 7751 movsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero 7752 subsd xmm1, xmm0 7753 movsd qword ptr [r8 + 8*rcx + 24], xmm1 7754 add rcx, 4 7755 cmp rax, rcx 7756 jne .LBB1_700 7757 jmp .LBB1_1069 7758 .LBB1_83: 7759 cmp edi, 8 7760 jle .LBB1_147 7761 # %bb.84: 7762 cmp edi, 9 7763 je .LBB1_217 7764 # %bb.85: 7765 cmp edi, 11 7766 je .LBB1_220 7767 # %bb.86: 7768 cmp edi, 12 7769 jne .LBB1_1069 7770 # %bb.87: 7771 test r9d, r9d 7772 jle .LBB1_1069 7773 # %bb.88: 7774 movsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero 7775 mov eax, r9d 7776 cmp r9d, 4 7777 jb .LBB1_89 7778 # %bb.295: 7779 lea rcx, [rdx + 8*rax] 7780 cmp rcx, r8 7781 jbe .LBB1_483 7782 # %bb.296: 7783 lea rcx, [r8 + 8*rax] 7784 cmp rcx, rdx 7785 jbe .LBB1_483 7786 .LBB1_89: 7787 xor ecx, ecx 7788 .LBB1_705: 7789 mov rsi, rcx 7790 not rsi 7791 add rsi, rax 7792 mov rdi, rax 7793 and rdi, 3 7794 je .LBB1_707 7795 .LBB1_706: # =>This Inner Loop Header: Depth=1 7796 movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero 7797 addsd xmm1, xmm0 7798 movsd qword ptr [r8 + 8*rcx], xmm1 7799 add rcx, 1 7800 add rdi, -1 7801 jne .LBB1_706 7802 .LBB1_707: 7803 cmp rsi, 3 7804 jb .LBB1_1069 7805 .LBB1_708: # =>This Inner Loop Header: Depth=1 7806 movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero 7807 addsd xmm1, xmm0 7808 movsd qword ptr [r8 + 8*rcx], xmm1 7809 movsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero 7810 addsd xmm1, xmm0 7811 movsd qword ptr [r8 + 8*rcx + 8], xmm1 7812 movsd 
xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero 7813 addsd xmm1, xmm0 7814 movsd qword ptr [r8 + 8*rcx + 16], xmm1 7815 movsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero 7816 addsd xmm1, xmm0 7817 movsd qword ptr [r8 + 8*rcx + 24], xmm1 7818 add rcx, 4 7819 cmp rax, rcx 7820 jne .LBB1_708 7821 jmp .LBB1_1069 7822 .LBB1_90: 7823 cmp edi, 8 7824 jle .LBB1_152 7825 # %bb.91: 7826 cmp edi, 9 7827 je .LBB1_223 7828 # %bb.92: 7829 cmp edi, 11 7830 je .LBB1_226 7831 # %bb.93: 7832 cmp edi, 12 7833 jne .LBB1_1069 7834 # %bb.94: 7835 test r9d, r9d 7836 jle .LBB1_1069 7837 # %bb.95: 7838 movsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero 7839 mov eax, r9d 7840 cmp r9d, 4 7841 jb .LBB1_96 7842 # %bb.298: 7843 lea rcx, [rdx + 8*rax] 7844 cmp rcx, r8 7845 jbe .LBB1_486 7846 # %bb.299: 7847 lea rcx, [r8 + 8*rax] 7848 cmp rcx, rdx 7849 jbe .LBB1_486 7850 .LBB1_96: 7851 xor ecx, ecx 7852 .LBB1_713: 7853 mov rsi, rcx 7854 not rsi 7855 add rsi, rax 7856 mov rdi, rax 7857 and rdi, 3 7858 je .LBB1_715 7859 .LBB1_714: # =>This Inner Loop Header: Depth=1 7860 movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero 7861 subsd xmm1, xmm0 7862 movsd qword ptr [r8 + 8*rcx], xmm1 7863 add rcx, 1 7864 add rdi, -1 7865 jne .LBB1_714 7866 .LBB1_715: 7867 cmp rsi, 3 7868 jb .LBB1_1069 7869 .LBB1_716: # =>This Inner Loop Header: Depth=1 7870 movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero 7871 subsd xmm1, xmm0 7872 movsd qword ptr [r8 + 8*rcx], xmm1 7873 movsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero 7874 subsd xmm1, xmm0 7875 movsd qword ptr [r8 + 8*rcx + 8], xmm1 7876 movsd xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero 7877 subsd xmm1, xmm0 7878 movsd qword ptr [r8 + 8*rcx + 16], xmm1 7879 movsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero 7880 subsd xmm1, xmm0 7881 movsd qword ptr [r8 + 8*rcx + 24], xmm1 7882 add rcx, 4 7883 cmp rax, rcx 7884 jne .LBB1_716 7885 jmp .LBB1_1069 7886 .LBB1_97: 7887 cmp edi, 2 7888 je .LBB1_229 7889 # 
%bb.98: 7890 cmp edi, 3 7891 jne .LBB1_1069 7892 # %bb.99: 7893 test r9d, r9d 7894 jle .LBB1_1069 7895 # %bb.100: 7896 mov cl, byte ptr [rcx] 7897 mov r10d, r9d 7898 cmp r9d, 32 7899 jb .LBB1_101 7900 # %bb.301: 7901 lea rax, [rdx + r10] 7902 cmp rax, r8 7903 jbe .LBB1_489 7904 # %bb.302: 7905 lea rax, [r8 + r10] 7906 cmp rax, rdx 7907 jbe .LBB1_489 7908 .LBB1_101: 7909 xor edi, edi 7910 .LBB1_721: 7911 mov r9, rdi 7912 not r9 7913 add r9, r10 7914 mov rsi, r10 7915 and rsi, 3 7916 je .LBB1_723 7917 .LBB1_722: # =>This Inner Loop Header: Depth=1 7918 movzx eax, byte ptr [rdx + rdi] 7919 mul cl 7920 mov byte ptr [r8 + rdi], al 7921 add rdi, 1 7922 add rsi, -1 7923 jne .LBB1_722 7924 .LBB1_723: 7925 cmp r9, 3 7926 jb .LBB1_1069 7927 .LBB1_724: # =>This Inner Loop Header: Depth=1 7928 movzx eax, byte ptr [rdx + rdi] 7929 mul cl 7930 mov byte ptr [r8 + rdi], al 7931 movzx eax, byte ptr [rdx + rdi + 1] 7932 mul cl 7933 mov byte ptr [r8 + rdi + 1], al 7934 movzx eax, byte ptr [rdx + rdi + 2] 7935 mul cl 7936 mov byte ptr [r8 + rdi + 2], al 7937 movzx eax, byte ptr [rdx + rdi + 3] 7938 mul cl 7939 mov byte ptr [r8 + rdi + 3], al 7940 add rdi, 4 7941 cmp r10, rdi 7942 jne .LBB1_724 7943 jmp .LBB1_1069 7944 .LBB1_102: 7945 cmp edi, 2 7946 je .LBB1_232 7947 # %bb.103: 7948 cmp edi, 3 7949 jne .LBB1_1069 7950 # %bb.104: 7951 test r9d, r9d 7952 jle .LBB1_1069 7953 # %bb.105: 7954 mov cl, byte ptr [rcx] 7955 mov r10d, r9d 7956 cmp r9d, 32 7957 jb .LBB1_106 7958 # %bb.304: 7959 lea rax, [rdx + r10] 7960 cmp rax, r8 7961 jbe .LBB1_492 7962 # %bb.305: 7963 lea rax, [r8 + r10] 7964 cmp rax, rdx 7965 jbe .LBB1_492 7966 .LBB1_106: 7967 xor edi, edi 7968 .LBB1_729: 7969 mov r9, rdi 7970 not r9 7971 add r9, r10 7972 mov rsi, r10 7973 and rsi, 3 7974 je .LBB1_731 7975 .LBB1_730: # =>This Inner Loop Header: Depth=1 7976 movzx eax, byte ptr [rdx + rdi] 7977 mul cl 7978 mov byte ptr [r8 + rdi], al 7979 add rdi, 1 7980 add rsi, -1 7981 jne .LBB1_730 7982 .LBB1_731: 7983 cmp r9, 3 7984 jb 
.LBB1_1069 7985 .LBB1_732: # =>This Inner Loop Header: Depth=1 7986 movzx eax, byte ptr [rdx + rdi] 7987 mul cl 7988 mov byte ptr [r8 + rdi], al 7989 movzx eax, byte ptr [rdx + rdi + 1] 7990 mul cl 7991 mov byte ptr [r8 + rdi + 1], al 7992 movzx eax, byte ptr [rdx + rdi + 2] 7993 mul cl 7994 mov byte ptr [r8 + rdi + 2], al 7995 movzx eax, byte ptr [rdx + rdi + 3] 7996 mul cl 7997 mov byte ptr [r8 + rdi + 3], al 7998 add rdi, 4 7999 cmp r10, rdi 8000 jne .LBB1_732 8001 jmp .LBB1_1069 8002 .LBB1_107: 8003 cmp edi, 2 8004 je .LBB1_235 8005 # %bb.108: 8006 cmp edi, 3 8007 jne .LBB1_1069 8008 # %bb.109: 8009 test r9d, r9d 8010 jle .LBB1_1069 8011 # %bb.110: 8012 mov al, byte ptr [rcx] 8013 mov r10d, r9d 8014 cmp r9d, 32 8015 jb .LBB1_111 8016 # %bb.307: 8017 lea rcx, [rdx + r10] 8018 cmp rcx, r8 8019 jbe .LBB1_495 8020 # %bb.308: 8021 lea rcx, [r8 + r10] 8022 cmp rcx, rdx 8023 jbe .LBB1_495 8024 .LBB1_111: 8025 xor esi, esi 8026 .LBB1_737: 8027 mov r9, rsi 8028 not r9 8029 add r9, r10 8030 mov rdi, r10 8031 and rdi, 3 8032 je .LBB1_739 8033 .LBB1_738: # =>This Inner Loop Header: Depth=1 8034 movzx ecx, byte ptr [rdx + rsi] 8035 add cl, al 8036 mov byte ptr [r8 + rsi], cl 8037 add rsi, 1 8038 add rdi, -1 8039 jne .LBB1_738 8040 .LBB1_739: 8041 cmp r9, 3 8042 jb .LBB1_1069 8043 .LBB1_740: # =>This Inner Loop Header: Depth=1 8044 movzx ecx, byte ptr [rdx + rsi] 8045 add cl, al 8046 mov byte ptr [r8 + rsi], cl 8047 movzx ecx, byte ptr [rdx + rsi + 1] 8048 add cl, al 8049 mov byte ptr [r8 + rsi + 1], cl 8050 movzx ecx, byte ptr [rdx + rsi + 2] 8051 add cl, al 8052 mov byte ptr [r8 + rsi + 2], cl 8053 movzx ecx, byte ptr [rdx + rsi + 3] 8054 add cl, al 8055 mov byte ptr [r8 + rsi + 3], cl 8056 add rsi, 4 8057 cmp r10, rsi 8058 jne .LBB1_740 8059 jmp .LBB1_1069 8060 .LBB1_112: 8061 cmp edi, 2 8062 je .LBB1_238 8063 # %bb.113: 8064 cmp edi, 3 8065 jne .LBB1_1069 8066 # %bb.114: 8067 test r9d, r9d 8068 jle .LBB1_1069 8069 # %bb.115: 8070 mov al, byte ptr [rcx] 8071 mov r10d, r9d 
8072 cmp r9d, 32 8073 jb .LBB1_116 8074 # %bb.310: 8075 lea rcx, [rdx + r10] 8076 cmp rcx, r8 8077 jbe .LBB1_498 8078 # %bb.311: 8079 lea rcx, [r8 + r10] 8080 cmp rcx, rdx 8081 jbe .LBB1_498 8082 .LBB1_116: 8083 xor esi, esi 8084 .LBB1_745: 8085 mov r9, rsi 8086 not r9 8087 add r9, r10 8088 mov rdi, r10 8089 and rdi, 3 8090 je .LBB1_747 8091 .LBB1_746: # =>This Inner Loop Header: Depth=1 8092 movzx ecx, byte ptr [rdx + rsi] 8093 sub cl, al 8094 mov byte ptr [r8 + rsi], cl 8095 add rsi, 1 8096 add rdi, -1 8097 jne .LBB1_746 8098 .LBB1_747: 8099 cmp r9, 3 8100 jb .LBB1_1069 8101 .LBB1_748: # =>This Inner Loop Header: Depth=1 8102 movzx ecx, byte ptr [rdx + rsi] 8103 sub cl, al 8104 mov byte ptr [r8 + rsi], cl 8105 movzx ecx, byte ptr [rdx + rsi + 1] 8106 sub cl, al 8107 mov byte ptr [r8 + rsi + 1], cl 8108 movzx ecx, byte ptr [rdx + rsi + 2] 8109 sub cl, al 8110 mov byte ptr [r8 + rsi + 2], cl 8111 movzx ecx, byte ptr [rdx + rsi + 3] 8112 sub cl, al 8113 mov byte ptr [r8 + rsi + 3], cl 8114 add rsi, 4 8115 cmp r10, rsi 8116 jne .LBB1_748 8117 jmp .LBB1_1069 8118 .LBB1_117: 8119 cmp edi, 2 8120 je .LBB1_241 8121 # %bb.118: 8122 cmp edi, 3 8123 jne .LBB1_1069 8124 # %bb.119: 8125 test r9d, r9d 8126 jle .LBB1_1069 8127 # %bb.120: 8128 mov al, byte ptr [rcx] 8129 mov r10d, r9d 8130 cmp r9d, 32 8131 jb .LBB1_121 8132 # %bb.313: 8133 lea rcx, [rdx + r10] 8134 cmp rcx, r8 8135 jbe .LBB1_501 8136 # %bb.314: 8137 lea rcx, [r8 + r10] 8138 cmp rcx, rdx 8139 jbe .LBB1_501 8140 .LBB1_121: 8141 xor esi, esi 8142 .LBB1_753: 8143 mov r9, rsi 8144 not r9 8145 add r9, r10 8146 mov rdi, r10 8147 and rdi, 3 8148 je .LBB1_755 8149 .LBB1_754: # =>This Inner Loop Header: Depth=1 8150 movzx ecx, byte ptr [rdx + rsi] 8151 add cl, al 8152 mov byte ptr [r8 + rsi], cl 8153 add rsi, 1 8154 add rdi, -1 8155 jne .LBB1_754 8156 .LBB1_755: 8157 cmp r9, 3 8158 jb .LBB1_1069 8159 .LBB1_756: # =>This Inner Loop Header: Depth=1 8160 movzx ecx, byte ptr [rdx + rsi] 8161 add cl, al 8162 mov byte ptr [r8 + 
rsi], cl 8163 movzx ecx, byte ptr [rdx + rsi + 1] 8164 add cl, al 8165 mov byte ptr [r8 + rsi + 1], cl 8166 movzx ecx, byte ptr [rdx + rsi + 2] 8167 add cl, al 8168 mov byte ptr [r8 + rsi + 2], cl 8169 movzx ecx, byte ptr [rdx + rsi + 3] 8170 add cl, al 8171 mov byte ptr [r8 + rsi + 3], cl 8172 add rsi, 4 8173 cmp r10, rsi 8174 jne .LBB1_756 8175 jmp .LBB1_1069 8176 .LBB1_122: 8177 cmp edi, 2 8178 je .LBB1_244 8179 # %bb.123: 8180 cmp edi, 3 8181 jne .LBB1_1069 8182 # %bb.124: 8183 test r9d, r9d 8184 jle .LBB1_1069 8185 # %bb.125: 8186 mov al, byte ptr [rcx] 8187 mov r10d, r9d 8188 cmp r9d, 32 8189 jb .LBB1_126 8190 # %bb.316: 8191 lea rcx, [rdx + r10] 8192 cmp rcx, r8 8193 jbe .LBB1_504 8194 # %bb.317: 8195 lea rcx, [r8 + r10] 8196 cmp rcx, rdx 8197 jbe .LBB1_504 8198 .LBB1_126: 8199 xor esi, esi 8200 .LBB1_761: 8201 mov r9, rsi 8202 not r9 8203 add r9, r10 8204 mov rdi, r10 8205 and rdi, 3 8206 je .LBB1_763 8207 .LBB1_762: # =>This Inner Loop Header: Depth=1 8208 movzx ecx, byte ptr [rdx + rsi] 8209 sub cl, al 8210 mov byte ptr [r8 + rsi], cl 8211 add rsi, 1 8212 add rdi, -1 8213 jne .LBB1_762 8214 .LBB1_763: 8215 cmp r9, 3 8216 jb .LBB1_1069 8217 .LBB1_764: # =>This Inner Loop Header: Depth=1 8218 movzx ecx, byte ptr [rdx + rsi] 8219 sub cl, al 8220 mov byte ptr [r8 + rsi], cl 8221 movzx ecx, byte ptr [rdx + rsi + 1] 8222 sub cl, al 8223 mov byte ptr [r8 + rsi + 1], cl 8224 movzx ecx, byte ptr [rdx + rsi + 2] 8225 sub cl, al 8226 mov byte ptr [r8 + rsi + 2], cl 8227 movzx ecx, byte ptr [rdx + rsi + 3] 8228 sub cl, al 8229 mov byte ptr [r8 + rsi + 3], cl 8230 add rsi, 4 8231 cmp r10, rsi 8232 jne .LBB1_764 8233 jmp .LBB1_1069 8234 .LBB1_127: 8235 cmp edi, 7 8236 je .LBB1_247 8237 # %bb.128: 8238 cmp edi, 8 8239 jne .LBB1_1069 8240 # %bb.129: 8241 test r9d, r9d 8242 jle .LBB1_1069 8243 # %bb.130: 8244 mov rax, qword ptr [rcx] 8245 mov esi, r9d 8246 lea rdi, [rsi - 1] 8247 mov r9d, esi 8248 and r9d, 3 8249 cmp rdi, 3 8250 jae .LBB1_319 8251 # %bb.131: 8252 xor edi, 
edi 8253 jmp .LBB1_321 8254 .LBB1_132: 8255 cmp edi, 7 8256 je .LBB1_250 8257 # %bb.133: 8258 cmp edi, 8 8259 jne .LBB1_1069 8260 # %bb.134: 8261 test r9d, r9d 8262 jle .LBB1_1069 8263 # %bb.135: 8264 mov rax, qword ptr [rcx] 8265 mov esi, r9d 8266 lea rdi, [rsi - 1] 8267 mov r9d, esi 8268 and r9d, 3 8269 cmp rdi, 3 8270 jae .LBB1_324 8271 # %bb.136: 8272 xor edi, edi 8273 jmp .LBB1_326 8274 .LBB1_137: 8275 cmp edi, 7 8276 je .LBB1_253 8277 # %bb.138: 8278 cmp edi, 8 8279 jne .LBB1_1069 8280 # %bb.139: 8281 test r9d, r9d 8282 jle .LBB1_1069 8283 # %bb.140: 8284 mov rax, qword ptr [rcx] 8285 mov r10d, r9d 8286 cmp r9d, 4 8287 jb .LBB1_141 8288 # %bb.329: 8289 lea rcx, [rdx + 8*r10] 8290 cmp rcx, r8 8291 jbe .LBB1_507 8292 # %bb.330: 8293 lea rcx, [r8 + 8*r10] 8294 cmp rcx, rdx 8295 jbe .LBB1_507 8296 .LBB1_141: 8297 xor esi, esi 8298 .LBB1_769: 8299 mov r9, rsi 8300 not r9 8301 add r9, r10 8302 mov rdi, r10 8303 and rdi, 3 8304 je .LBB1_771 8305 .LBB1_770: # =>This Inner Loop Header: Depth=1 8306 mov rcx, qword ptr [rdx + 8*rsi] 8307 add rcx, rax 8308 mov qword ptr [r8 + 8*rsi], rcx 8309 add rsi, 1 8310 add rdi, -1 8311 jne .LBB1_770 8312 .LBB1_771: 8313 cmp r9, 3 8314 jb .LBB1_1069 8315 .LBB1_772: # =>This Inner Loop Header: Depth=1 8316 mov rcx, qword ptr [rdx + 8*rsi] 8317 add rcx, rax 8318 mov qword ptr [r8 + 8*rsi], rcx 8319 mov rcx, qword ptr [rdx + 8*rsi + 8] 8320 add rcx, rax 8321 mov qword ptr [r8 + 8*rsi + 8], rcx 8322 mov rcx, qword ptr [rdx + 8*rsi + 16] 8323 add rcx, rax 8324 mov qword ptr [r8 + 8*rsi + 16], rcx 8325 mov rcx, qword ptr [rdx + 8*rsi + 24] 8326 add rcx, rax 8327 mov qword ptr [r8 + 8*rsi + 24], rcx 8328 add rsi, 4 8329 cmp r10, rsi 8330 jne .LBB1_772 8331 jmp .LBB1_1069 8332 .LBB1_142: 8333 cmp edi, 7 8334 je .LBB1_256 8335 # %bb.143: 8336 cmp edi, 8 8337 jne .LBB1_1069 8338 # %bb.144: 8339 test r9d, r9d 8340 jle .LBB1_1069 8341 # %bb.145: 8342 mov rax, qword ptr [rcx] 8343 mov r10d, r9d 8344 cmp r9d, 4 8345 jb .LBB1_146 8346 # %bb.332: 
8347 lea rcx, [rdx + 8*r10] 8348 cmp rcx, r8 8349 jbe .LBB1_510 8350 # %bb.333: 8351 lea rcx, [r8 + 8*r10] 8352 cmp rcx, rdx 8353 jbe .LBB1_510 8354 .LBB1_146: 8355 xor esi, esi 8356 .LBB1_777: 8357 mov r9, rsi 8358 not r9 8359 add r9, r10 8360 mov rdi, r10 8361 and rdi, 3 8362 je .LBB1_779 8363 .LBB1_778: # =>This Inner Loop Header: Depth=1 8364 mov rcx, qword ptr [rdx + 8*rsi] 8365 sub rcx, rax 8366 mov qword ptr [r8 + 8*rsi], rcx 8367 add rsi, 1 8368 add rdi, -1 8369 jne .LBB1_778 8370 .LBB1_779: 8371 cmp r9, 3 8372 jb .LBB1_1069 8373 .LBB1_780: # =>This Inner Loop Header: Depth=1 8374 mov rcx, qword ptr [rdx + 8*rsi] 8375 sub rcx, rax 8376 mov qword ptr [r8 + 8*rsi], rcx 8377 mov rcx, qword ptr [rdx + 8*rsi + 8] 8378 sub rcx, rax 8379 mov qword ptr [r8 + 8*rsi + 8], rcx 8380 mov rcx, qword ptr [rdx + 8*rsi + 16] 8381 sub rcx, rax 8382 mov qword ptr [r8 + 8*rsi + 16], rcx 8383 mov rcx, qword ptr [rdx + 8*rsi + 24] 8384 sub rcx, rax 8385 mov qword ptr [r8 + 8*rsi + 24], rcx 8386 add rsi, 4 8387 cmp r10, rsi 8388 jne .LBB1_780 8389 jmp .LBB1_1069 8390 .LBB1_147: 8391 cmp edi, 7 8392 je .LBB1_259 8393 # %bb.148: 8394 cmp edi, 8 8395 jne .LBB1_1069 8396 # %bb.149: 8397 test r9d, r9d 8398 jle .LBB1_1069 8399 # %bb.150: 8400 mov rax, qword ptr [rcx] 8401 mov r10d, r9d 8402 cmp r9d, 4 8403 jb .LBB1_151 8404 # %bb.335: 8405 lea rcx, [rdx + 8*r10] 8406 cmp rcx, r8 8407 jbe .LBB1_513 8408 # %bb.336: 8409 lea rcx, [r8 + 8*r10] 8410 cmp rcx, rdx 8411 jbe .LBB1_513 8412 .LBB1_151: 8413 xor esi, esi 8414 .LBB1_785: 8415 mov r9, rsi 8416 not r9 8417 add r9, r10 8418 mov rdi, r10 8419 and rdi, 3 8420 je .LBB1_787 8421 .LBB1_786: # =>This Inner Loop Header: Depth=1 8422 mov rcx, qword ptr [rdx + 8*rsi] 8423 add rcx, rax 8424 mov qword ptr [r8 + 8*rsi], rcx 8425 add rsi, 1 8426 add rdi, -1 8427 jne .LBB1_786 8428 .LBB1_787: 8429 cmp r9, 3 8430 jb .LBB1_1069 8431 .LBB1_788: # =>This Inner Loop Header: Depth=1 8432 mov rcx, qword ptr [rdx + 8*rsi] 8433 add rcx, rax 8434 mov qword 
ptr [r8 + 8*rsi], rcx 8435 mov rcx, qword ptr [rdx + 8*rsi + 8] 8436 add rcx, rax 8437 mov qword ptr [r8 + 8*rsi + 8], rcx 8438 mov rcx, qword ptr [rdx + 8*rsi + 16] 8439 add rcx, rax 8440 mov qword ptr [r8 + 8*rsi + 16], rcx 8441 mov rcx, qword ptr [rdx + 8*rsi + 24] 8442 add rcx, rax 8443 mov qword ptr [r8 + 8*rsi + 24], rcx 8444 add rsi, 4 8445 cmp r10, rsi 8446 jne .LBB1_788 8447 jmp .LBB1_1069 8448 .LBB1_152: 8449 cmp edi, 7 8450 je .LBB1_262 8451 # %bb.153: 8452 cmp edi, 8 8453 jne .LBB1_1069 8454 # %bb.154: 8455 test r9d, r9d 8456 jle .LBB1_1069 8457 # %bb.155: 8458 mov rax, qword ptr [rcx] 8459 mov r10d, r9d 8460 cmp r9d, 4 8461 jb .LBB1_156 8462 # %bb.338: 8463 lea rcx, [rdx + 8*r10] 8464 cmp rcx, r8 8465 jbe .LBB1_516 8466 # %bb.339: 8467 lea rcx, [r8 + 8*r10] 8468 cmp rcx, rdx 8469 jbe .LBB1_516 8470 .LBB1_156: 8471 xor esi, esi 8472 .LBB1_793: 8473 mov r9, rsi 8474 not r9 8475 add r9, r10 8476 mov rdi, r10 8477 and rdi, 3 8478 je .LBB1_795 8479 .LBB1_794: # =>This Inner Loop Header: Depth=1 8480 mov rcx, qword ptr [rdx + 8*rsi] 8481 sub rcx, rax 8482 mov qword ptr [r8 + 8*rsi], rcx 8483 add rsi, 1 8484 add rdi, -1 8485 jne .LBB1_794 8486 .LBB1_795: 8487 cmp r9, 3 8488 jb .LBB1_1069 8489 .LBB1_796: # =>This Inner Loop Header: Depth=1 8490 mov rcx, qword ptr [rdx + 8*rsi] 8491 sub rcx, rax 8492 mov qword ptr [r8 + 8*rsi], rcx 8493 mov rcx, qword ptr [rdx + 8*rsi + 8] 8494 sub rcx, rax 8495 mov qword ptr [r8 + 8*rsi + 8], rcx 8496 mov rcx, qword ptr [rdx + 8*rsi + 16] 8497 sub rcx, rax 8498 mov qword ptr [r8 + 8*rsi + 16], rcx 8499 mov rcx, qword ptr [rdx + 8*rsi + 24] 8500 sub rcx, rax 8501 mov qword ptr [r8 + 8*rsi + 24], rcx 8502 add rsi, 4 8503 cmp r10, rsi 8504 jne .LBB1_796 8505 jmp .LBB1_1069 8506 .LBB1_157: 8507 test r9d, r9d 8508 jle .LBB1_1069 8509 # %bb.158: 8510 movzx eax, word ptr [rcx] 8511 mov r10d, r9d 8512 cmp r9d, 16 8513 jb .LBB1_159 8514 # %bb.341: 8515 lea rcx, [rdx + 2*r10] 8516 cmp rcx, r8 8517 jbe .LBB1_519 8518 # %bb.342: 8519 lea 
rcx, [r8 + 2*r10] 8520 cmp rcx, rdx 8521 jbe .LBB1_519 8522 .LBB1_159: 8523 xor esi, esi 8524 .LBB1_801: 8525 mov r9, rsi 8526 not r9 8527 add r9, r10 8528 mov rdi, r10 8529 and rdi, 3 8530 je .LBB1_803 8531 .LBB1_802: # =>This Inner Loop Header: Depth=1 8532 movzx ecx, word ptr [rdx + 2*rsi] 8533 imul cx, ax 8534 mov word ptr [r8 + 2*rsi], cx 8535 add rsi, 1 8536 add rdi, -1 8537 jne .LBB1_802 8538 .LBB1_803: 8539 cmp r9, 3 8540 jb .LBB1_1069 8541 .LBB1_804: # =>This Inner Loop Header: Depth=1 8542 movzx ecx, word ptr [rdx + 2*rsi] 8543 imul cx, ax 8544 mov word ptr [r8 + 2*rsi], cx 8545 movzx ecx, word ptr [rdx + 2*rsi + 2] 8546 imul cx, ax 8547 mov word ptr [r8 + 2*rsi + 2], cx 8548 movzx ecx, word ptr [rdx + 2*rsi + 4] 8549 imul cx, ax 8550 mov word ptr [r8 + 2*rsi + 4], cx 8551 movzx ecx, word ptr [rdx + 2*rsi + 6] 8552 imul cx, ax 8553 mov word ptr [r8 + 2*rsi + 6], cx 8554 add rsi, 4 8555 cmp r10, rsi 8556 jne .LBB1_804 8557 jmp .LBB1_1069 8558 .LBB1_160: 8559 test r9d, r9d 8560 jle .LBB1_1069 8561 # %bb.161: 8562 movzx eax, word ptr [rcx] 8563 mov r10d, r9d 8564 cmp r9d, 16 8565 jb .LBB1_162 8566 # %bb.344: 8567 lea rcx, [rdx + 2*r10] 8568 cmp rcx, r8 8569 jbe .LBB1_522 8570 # %bb.345: 8571 lea rcx, [r8 + 2*r10] 8572 cmp rcx, rdx 8573 jbe .LBB1_522 8574 .LBB1_162: 8575 xor esi, esi 8576 .LBB1_809: 8577 mov r9, rsi 8578 not r9 8579 add r9, r10 8580 mov rdi, r10 8581 and rdi, 3 8582 je .LBB1_811 8583 .LBB1_810: # =>This Inner Loop Header: Depth=1 8584 movzx ecx, word ptr [rdx + 2*rsi] 8585 imul cx, ax 8586 mov word ptr [r8 + 2*rsi], cx 8587 add rsi, 1 8588 add rdi, -1 8589 jne .LBB1_810 8590 .LBB1_811: 8591 cmp r9, 3 8592 jb .LBB1_1069 8593 .LBB1_812: # =>This Inner Loop Header: Depth=1 8594 movzx ecx, word ptr [rdx + 2*rsi] 8595 imul cx, ax 8596 mov word ptr [r8 + 2*rsi], cx 8597 movzx ecx, word ptr [rdx + 2*rsi + 2] 8598 imul cx, ax 8599 mov word ptr [r8 + 2*rsi + 2], cx 8600 movzx ecx, word ptr [rdx + 2*rsi + 4] 8601 imul cx, ax 8602 mov word ptr [r8 + 
2*rsi + 4], cx 8603 movzx ecx, word ptr [rdx + 2*rsi + 6] 8604 imul cx, ax 8605 mov word ptr [r8 + 2*rsi + 6], cx 8606 add rsi, 4 8607 cmp r10, rsi 8608 jne .LBB1_812 8609 jmp .LBB1_1069 8610 .LBB1_163: 8611 test r9d, r9d 8612 jle .LBB1_1069 8613 # %bb.164: 8614 movzx eax, word ptr [rcx] 8615 mov r10d, r9d 8616 cmp r9d, 16 8617 jb .LBB1_165 8618 # %bb.347: 8619 lea rcx, [rdx + 2*r10] 8620 cmp rcx, r8 8621 jbe .LBB1_525 8622 # %bb.348: 8623 lea rcx, [r8 + 2*r10] 8624 cmp rcx, rdx 8625 jbe .LBB1_525 8626 .LBB1_165: 8627 xor esi, esi 8628 .LBB1_817: 8629 mov r9, rsi 8630 not r9 8631 add r9, r10 8632 mov rdi, r10 8633 and rdi, 3 8634 je .LBB1_819 8635 .LBB1_818: # =>This Inner Loop Header: Depth=1 8636 movzx ecx, word ptr [rdx + 2*rsi] 8637 imul cx, ax 8638 mov word ptr [r8 + 2*rsi], cx 8639 add rsi, 1 8640 add rdi, -1 8641 jne .LBB1_818 8642 .LBB1_819: 8643 cmp r9, 3 8644 jb .LBB1_1069 8645 .LBB1_820: # =>This Inner Loop Header: Depth=1 8646 movzx ecx, word ptr [rdx + 2*rsi] 8647 imul cx, ax 8648 mov word ptr [r8 + 2*rsi], cx 8649 movzx ecx, word ptr [rdx + 2*rsi + 2] 8650 imul cx, ax 8651 mov word ptr [r8 + 2*rsi + 2], cx 8652 movzx ecx, word ptr [rdx + 2*rsi + 4] 8653 imul cx, ax 8654 mov word ptr [r8 + 2*rsi + 4], cx 8655 movzx ecx, word ptr [rdx + 2*rsi + 6] 8656 imul cx, ax 8657 mov word ptr [r8 + 2*rsi + 6], cx 8658 add rsi, 4 8659 cmp r10, rsi 8660 jne .LBB1_820 8661 jmp .LBB1_1069 8662 .LBB1_166: 8663 test r9d, r9d 8664 jle .LBB1_1069 8665 # %bb.167: 8666 movzx eax, word ptr [rcx] 8667 mov r10d, r9d 8668 cmp r9d, 16 8669 jb .LBB1_168 8670 # %bb.350: 8671 lea rcx, [rdx + 2*r10] 8672 cmp rcx, r8 8673 jbe .LBB1_528 8674 # %bb.351: 8675 lea rcx, [r8 + 2*r10] 8676 cmp rcx, rdx 8677 jbe .LBB1_528 8678 .LBB1_168: 8679 xor esi, esi 8680 .LBB1_825: 8681 mov r9, rsi 8682 not r9 8683 add r9, r10 8684 mov rdi, r10 8685 and rdi, 3 8686 je .LBB1_827 8687 .LBB1_826: # =>This Inner Loop Header: Depth=1 8688 movzx ecx, word ptr [rdx + 2*rsi] 8689 imul cx, ax 8690 mov word ptr 
[r8 + 2*rsi], cx 8691 add rsi, 1 8692 add rdi, -1 8693 jne .LBB1_826 8694 .LBB1_827: 8695 cmp r9, 3 8696 jb .LBB1_1069 8697 .LBB1_828: # =>This Inner Loop Header: Depth=1 8698 movzx ecx, word ptr [rdx + 2*rsi] 8699 imul cx, ax 8700 mov word ptr [r8 + 2*rsi], cx 8701 movzx ecx, word ptr [rdx + 2*rsi + 2] 8702 imul cx, ax 8703 mov word ptr [r8 + 2*rsi + 2], cx 8704 movzx ecx, word ptr [rdx + 2*rsi + 4] 8705 imul cx, ax 8706 mov word ptr [r8 + 2*rsi + 4], cx 8707 movzx ecx, word ptr [rdx + 2*rsi + 6] 8708 imul cx, ax 8709 mov word ptr [r8 + 2*rsi + 6], cx 8710 add rsi, 4 8711 cmp r10, rsi 8712 jne .LBB1_828 8713 jmp .LBB1_1069 8714 .LBB1_169: 8715 test r9d, r9d 8716 jle .LBB1_1069 8717 # %bb.170: 8718 movzx eax, word ptr [rcx] 8719 mov r10d, r9d 8720 cmp r9d, 16 8721 jb .LBB1_171 8722 # %bb.353: 8723 lea rcx, [rdx + 2*r10] 8724 cmp rcx, r8 8725 jbe .LBB1_531 8726 # %bb.354: 8727 lea rcx, [r8 + 2*r10] 8728 cmp rcx, rdx 8729 jbe .LBB1_531 8730 .LBB1_171: 8731 xor esi, esi 8732 .LBB1_833: 8733 mov r9, rsi 8734 not r9 8735 add r9, r10 8736 mov rdi, r10 8737 and rdi, 3 8738 je .LBB1_835 8739 .LBB1_834: # =>This Inner Loop Header: Depth=1 8740 movzx ecx, word ptr [rdx + 2*rsi] 8741 add cx, ax 8742 mov word ptr [r8 + 2*rsi], cx 8743 add rsi, 1 8744 add rdi, -1 8745 jne .LBB1_834 8746 .LBB1_835: 8747 cmp r9, 3 8748 jb .LBB1_1069 8749 .LBB1_836: # =>This Inner Loop Header: Depth=1 8750 movzx ecx, word ptr [rdx + 2*rsi] 8751 add cx, ax 8752 mov word ptr [r8 + 2*rsi], cx 8753 movzx ecx, word ptr [rdx + 2*rsi + 2] 8754 add cx, ax 8755 mov word ptr [r8 + 2*rsi + 2], cx 8756 movzx ecx, word ptr [rdx + 2*rsi + 4] 8757 add cx, ax 8758 mov word ptr [r8 + 2*rsi + 4], cx 8759 movzx ecx, word ptr [rdx + 2*rsi + 6] 8760 add cx, ax 8761 mov word ptr [r8 + 2*rsi + 6], cx 8762 add rsi, 4 8763 cmp r10, rsi 8764 jne .LBB1_836 8765 jmp .LBB1_1069 8766 .LBB1_172: 8767 test r9d, r9d 8768 jle .LBB1_1069 8769 # %bb.173: 8770 movzx eax, word ptr [rcx] 8771 mov r10d, r9d 8772 cmp r9d, 16 8773 jb 
.LBB1_174 8774 # %bb.356: 8775 lea rcx, [rdx + 2*r10] 8776 cmp rcx, r8 8777 jbe .LBB1_534 8778 # %bb.357: 8779 lea rcx, [r8 + 2*r10] 8780 cmp rcx, rdx 8781 jbe .LBB1_534 8782 .LBB1_174: 8783 xor esi, esi 8784 .LBB1_841: 8785 mov r9, rsi 8786 not r9 8787 add r9, r10 8788 mov rdi, r10 8789 and rdi, 3 8790 je .LBB1_843 8791 .LBB1_842: # =>This Inner Loop Header: Depth=1 8792 movzx ecx, word ptr [rdx + 2*rsi] 8793 add cx, ax 8794 mov word ptr [r8 + 2*rsi], cx 8795 add rsi, 1 8796 add rdi, -1 8797 jne .LBB1_842 8798 .LBB1_843: 8799 cmp r9, 3 8800 jb .LBB1_1069 8801 .LBB1_844: # =>This Inner Loop Header: Depth=1 8802 movzx ecx, word ptr [rdx + 2*rsi] 8803 add cx, ax 8804 mov word ptr [r8 + 2*rsi], cx 8805 movzx ecx, word ptr [rdx + 2*rsi + 2] 8806 add cx, ax 8807 mov word ptr [r8 + 2*rsi + 2], cx 8808 movzx ecx, word ptr [rdx + 2*rsi + 4] 8809 add cx, ax 8810 mov word ptr [r8 + 2*rsi + 4], cx 8811 movzx ecx, word ptr [rdx + 2*rsi + 6] 8812 add cx, ax 8813 mov word ptr [r8 + 2*rsi + 6], cx 8814 add rsi, 4 8815 cmp r10, rsi 8816 jne .LBB1_844 8817 jmp .LBB1_1069 8818 .LBB1_175: 8819 test r9d, r9d 8820 jle .LBB1_1069 8821 # %bb.176: 8822 movzx eax, word ptr [rcx] 8823 mov r10d, r9d 8824 cmp r9d, 16 8825 jb .LBB1_177 8826 # %bb.359: 8827 lea rcx, [rdx + 2*r10] 8828 cmp rcx, r8 8829 jbe .LBB1_537 8830 # %bb.360: 8831 lea rcx, [r8 + 2*r10] 8832 cmp rcx, rdx 8833 jbe .LBB1_537 8834 .LBB1_177: 8835 xor esi, esi 8836 .LBB1_849: 8837 mov r9, rsi 8838 not r9 8839 add r9, r10 8840 mov rdi, r10 8841 and rdi, 3 8842 je .LBB1_851 8843 .LBB1_850: # =>This Inner Loop Header: Depth=1 8844 movzx ecx, word ptr [rdx + 2*rsi] 8845 sub ecx, eax 8846 mov word ptr [r8 + 2*rsi], cx 8847 add rsi, 1 8848 add rdi, -1 8849 jne .LBB1_850 8850 .LBB1_851: 8851 cmp r9, 3 8852 jb .LBB1_1069 8853 .LBB1_852: # =>This Inner Loop Header: Depth=1 8854 movzx ecx, word ptr [rdx + 2*rsi] 8855 sub ecx, eax 8856 mov word ptr [r8 + 2*rsi], cx 8857 movzx ecx, word ptr [rdx + 2*rsi + 2] 8858 sub ecx, eax 8859 mov word 
ptr [r8 + 2*rsi + 2], cx 8860 movzx ecx, word ptr [rdx + 2*rsi + 4] 8861 sub ecx, eax 8862 mov word ptr [r8 + 2*rsi + 4], cx 8863 movzx ecx, word ptr [rdx + 2*rsi + 6] 8864 sub ecx, eax 8865 mov word ptr [r8 + 2*rsi + 6], cx 8866 add rsi, 4 8867 cmp r10, rsi 8868 jne .LBB1_852 8869 jmp .LBB1_1069 8870 .LBB1_178: 8871 test r9d, r9d 8872 jle .LBB1_1069 8873 # %bb.179: 8874 movzx eax, word ptr [rcx] 8875 mov r10d, r9d 8876 cmp r9d, 16 8877 jb .LBB1_180 8878 # %bb.362: 8879 lea rcx, [rdx + 2*r10] 8880 cmp rcx, r8 8881 jbe .LBB1_540 8882 # %bb.363: 8883 lea rcx, [r8 + 2*r10] 8884 cmp rcx, rdx 8885 jbe .LBB1_540 8886 .LBB1_180: 8887 xor esi, esi 8888 .LBB1_857: 8889 mov r9, rsi 8890 not r9 8891 add r9, r10 8892 mov rdi, r10 8893 and rdi, 3 8894 je .LBB1_859 8895 .LBB1_858: # =>This Inner Loop Header: Depth=1 8896 movzx ecx, word ptr [rdx + 2*rsi] 8897 sub ecx, eax 8898 mov word ptr [r8 + 2*rsi], cx 8899 add rsi, 1 8900 add rdi, -1 8901 jne .LBB1_858 8902 .LBB1_859: 8903 cmp r9, 3 8904 jb .LBB1_1069 8905 .LBB1_860: # =>This Inner Loop Header: Depth=1 8906 movzx ecx, word ptr [rdx + 2*rsi] 8907 sub ecx, eax 8908 mov word ptr [r8 + 2*rsi], cx 8909 movzx ecx, word ptr [rdx + 2*rsi + 2] 8910 sub ecx, eax 8911 mov word ptr [r8 + 2*rsi + 2], cx 8912 movzx ecx, word ptr [rdx + 2*rsi + 4] 8913 sub ecx, eax 8914 mov word ptr [r8 + 2*rsi + 4], cx 8915 movzx ecx, word ptr [rdx + 2*rsi + 6] 8916 sub ecx, eax 8917 mov word ptr [r8 + 2*rsi + 6], cx 8918 add rsi, 4 8919 cmp r10, rsi 8920 jne .LBB1_860 8921 jmp .LBB1_1069 8922 .LBB1_181: 8923 test r9d, r9d 8924 jle .LBB1_1069 8925 # %bb.182: 8926 movzx eax, word ptr [rcx] 8927 mov r10d, r9d 8928 cmp r9d, 16 8929 jb .LBB1_183 8930 # %bb.365: 8931 lea rcx, [rdx + 2*r10] 8932 cmp rcx, r8 8933 jbe .LBB1_543 8934 # %bb.366: 8935 lea rcx, [r8 + 2*r10] 8936 cmp rcx, rdx 8937 jbe .LBB1_543 8938 .LBB1_183: 8939 xor esi, esi 8940 .LBB1_865: 8941 mov r9, rsi 8942 not r9 8943 add r9, r10 8944 mov rdi, r10 8945 and rdi, 3 8946 je .LBB1_867 8947 
.LBB1_866: # =>This Inner Loop Header: Depth=1 8948 movzx ecx, word ptr [rdx + 2*rsi] 8949 add cx, ax 8950 mov word ptr [r8 + 2*rsi], cx 8951 add rsi, 1 8952 add rdi, -1 8953 jne .LBB1_866 8954 .LBB1_867: 8955 cmp r9, 3 8956 jb .LBB1_1069 8957 .LBB1_868: # =>This Inner Loop Header: Depth=1 8958 movzx ecx, word ptr [rdx + 2*rsi] 8959 add cx, ax 8960 mov word ptr [r8 + 2*rsi], cx 8961 movzx ecx, word ptr [rdx + 2*rsi + 2] 8962 add cx, ax 8963 mov word ptr [r8 + 2*rsi + 2], cx 8964 movzx ecx, word ptr [rdx + 2*rsi + 4] 8965 add cx, ax 8966 mov word ptr [r8 + 2*rsi + 4], cx 8967 movzx ecx, word ptr [rdx + 2*rsi + 6] 8968 add cx, ax 8969 mov word ptr [r8 + 2*rsi + 6], cx 8970 add rsi, 4 8971 cmp r10, rsi 8972 jne .LBB1_868 8973 jmp .LBB1_1069 8974 .LBB1_184: 8975 test r9d, r9d 8976 jle .LBB1_1069 8977 # %bb.185: 8978 movzx eax, word ptr [rcx] 8979 mov r10d, r9d 8980 cmp r9d, 16 8981 jb .LBB1_186 8982 # %bb.368: 8983 lea rcx, [rdx + 2*r10] 8984 cmp rcx, r8 8985 jbe .LBB1_546 8986 # %bb.369: 8987 lea rcx, [r8 + 2*r10] 8988 cmp rcx, rdx 8989 jbe .LBB1_546 8990 .LBB1_186: 8991 xor esi, esi 8992 .LBB1_873: 8993 mov r9, rsi 8994 not r9 8995 add r9, r10 8996 mov rdi, r10 8997 and rdi, 3 8998 je .LBB1_875 8999 .LBB1_874: # =>This Inner Loop Header: Depth=1 9000 movzx ecx, word ptr [rdx + 2*rsi] 9001 add cx, ax 9002 mov word ptr [r8 + 2*rsi], cx 9003 add rsi, 1 9004 add rdi, -1 9005 jne .LBB1_874 9006 .LBB1_875: 9007 cmp r9, 3 9008 jb .LBB1_1069 9009 .LBB1_876: # =>This Inner Loop Header: Depth=1 9010 movzx ecx, word ptr [rdx + 2*rsi] 9011 add cx, ax 9012 mov word ptr [r8 + 2*rsi], cx 9013 movzx ecx, word ptr [rdx + 2*rsi + 2] 9014 add cx, ax 9015 mov word ptr [r8 + 2*rsi + 2], cx 9016 movzx ecx, word ptr [rdx + 2*rsi + 4] 9017 add cx, ax 9018 mov word ptr [r8 + 2*rsi + 4], cx 9019 movzx ecx, word ptr [rdx + 2*rsi + 6] 9020 add cx, ax 9021 mov word ptr [r8 + 2*rsi + 6], cx 9022 add rsi, 4 9023 cmp r10, rsi 9024 jne .LBB1_876 9025 jmp .LBB1_1069 9026 .LBB1_187: 9027 test r9d, r9d 
9028 jle .LBB1_1069 9029 # %bb.188: 9030 movzx eax, word ptr [rcx] 9031 mov r10d, r9d 9032 cmp r9d, 16 9033 jb .LBB1_189 9034 # %bb.371: 9035 lea rcx, [rdx + 2*r10] 9036 cmp rcx, r8 9037 jbe .LBB1_549 9038 # %bb.372: 9039 lea rcx, [r8 + 2*r10] 9040 cmp rcx, rdx 9041 jbe .LBB1_549 9042 .LBB1_189: 9043 xor esi, esi 9044 .LBB1_881: 9045 mov r9, rsi 9046 not r9 9047 add r9, r10 9048 mov rdi, r10 9049 and rdi, 3 9050 je .LBB1_883 9051 .LBB1_882: # =>This Inner Loop Header: Depth=1 9052 movzx ecx, word ptr [rdx + 2*rsi] 9053 sub ecx, eax 9054 mov word ptr [r8 + 2*rsi], cx 9055 add rsi, 1 9056 add rdi, -1 9057 jne .LBB1_882 9058 .LBB1_883: 9059 cmp r9, 3 9060 jb .LBB1_1069 9061 .LBB1_884: # =>This Inner Loop Header: Depth=1 9062 movzx ecx, word ptr [rdx + 2*rsi] 9063 sub ecx, eax 9064 mov word ptr [r8 + 2*rsi], cx 9065 movzx ecx, word ptr [rdx + 2*rsi + 2] 9066 sub ecx, eax 9067 mov word ptr [r8 + 2*rsi + 2], cx 9068 movzx ecx, word ptr [rdx + 2*rsi + 4] 9069 sub ecx, eax 9070 mov word ptr [r8 + 2*rsi + 4], cx 9071 movzx ecx, word ptr [rdx + 2*rsi + 6] 9072 sub ecx, eax 9073 mov word ptr [r8 + 2*rsi + 6], cx 9074 add rsi, 4 9075 cmp r10, rsi 9076 jne .LBB1_884 9077 jmp .LBB1_1069 9078 .LBB1_190: 9079 test r9d, r9d 9080 jle .LBB1_1069 9081 # %bb.191: 9082 movzx eax, word ptr [rcx] 9083 mov r10d, r9d 9084 cmp r9d, 16 9085 jb .LBB1_192 9086 # %bb.374: 9087 lea rcx, [rdx + 2*r10] 9088 cmp rcx, r8 9089 jbe .LBB1_552 9090 # %bb.375: 9091 lea rcx, [r8 + 2*r10] 9092 cmp rcx, rdx 9093 jbe .LBB1_552 9094 .LBB1_192: 9095 xor esi, esi 9096 .LBB1_889: 9097 mov r9, rsi 9098 not r9 9099 add r9, r10 9100 mov rdi, r10 9101 and rdi, 3 9102 je .LBB1_891 9103 .LBB1_890: # =>This Inner Loop Header: Depth=1 9104 movzx ecx, word ptr [rdx + 2*rsi] 9105 sub ecx, eax 9106 mov word ptr [r8 + 2*rsi], cx 9107 add rsi, 1 9108 add rdi, -1 9109 jne .LBB1_890 9110 .LBB1_891: 9111 cmp r9, 3 9112 jb .LBB1_1069 9113 .LBB1_892: # =>This Inner Loop Header: Depth=1 9114 movzx ecx, word ptr [rdx + 2*rsi] 9115 
sub ecx, eax 9116 mov word ptr [r8 + 2*rsi], cx 9117 movzx ecx, word ptr [rdx + 2*rsi + 2] 9118 sub ecx, eax 9119 mov word ptr [r8 + 2*rsi + 2], cx 9120 movzx ecx, word ptr [rdx + 2*rsi + 4] 9121 sub ecx, eax 9122 mov word ptr [r8 + 2*rsi + 4], cx 9123 movzx ecx, word ptr [rdx + 2*rsi + 6] 9124 sub ecx, eax 9125 mov word ptr [r8 + 2*rsi + 6], cx 9126 add rsi, 4 9127 cmp r10, rsi 9128 jne .LBB1_892 9129 jmp .LBB1_1069 9130 .LBB1_193: 9131 test r9d, r9d 9132 jle .LBB1_1069 9133 # %bb.194: 9134 mov rax, qword ptr [rcx] 9135 mov esi, r9d 9136 lea rdi, [rsi - 1] 9137 mov r9d, esi 9138 and r9d, 3 9139 cmp rdi, 3 9140 jae .LBB1_377 9141 # %bb.195: 9142 xor edi, edi 9143 jmp .LBB1_379 9144 .LBB1_196: 9145 test r9d, r9d 9146 jle .LBB1_1069 9147 # %bb.197: 9148 movss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero 9149 mov eax, r9d 9150 cmp r9d, 8 9151 jb .LBB1_198 9152 # %bb.382: 9153 lea rcx, [rdx + 4*rax] 9154 cmp rcx, r8 9155 jbe .LBB1_555 9156 # %bb.383: 9157 lea rcx, [r8 + 4*rax] 9158 cmp rcx, rdx 9159 jbe .LBB1_555 9160 .LBB1_198: 9161 xor ecx, ecx 9162 .LBB1_897: 9163 mov rsi, rcx 9164 not rsi 9165 add rsi, rax 9166 mov rdi, rax 9167 and rdi, 3 9168 je .LBB1_899 9169 .LBB1_898: # =>This Inner Loop Header: Depth=1 9170 movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero 9171 mulss xmm1, xmm0 9172 movss dword ptr [r8 + 4*rcx], xmm1 9173 add rcx, 1 9174 add rdi, -1 9175 jne .LBB1_898 9176 .LBB1_899: 9177 cmp rsi, 3 9178 jb .LBB1_1069 9179 .LBB1_900: # =>This Inner Loop Header: Depth=1 9180 movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero 9181 mulss xmm1, xmm0 9182 movss dword ptr [r8 + 4*rcx], xmm1 9183 movss xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero 9184 mulss xmm1, xmm0 9185 movss dword ptr [r8 + 4*rcx + 4], xmm1 9186 movss xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero 9187 mulss xmm1, xmm0 9188 movss dword ptr [r8 + 4*rcx + 8], xmm1 9189 movss xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = 
mem[0],zero,zero,zero 9190 mulss xmm1, xmm0 9191 movss dword ptr [r8 + 4*rcx + 12], xmm1 9192 add rcx, 4 9193 cmp rax, rcx 9194 jne .LBB1_900 9195 jmp .LBB1_1069 9196 .LBB1_199: 9197 test r9d, r9d 9198 jle .LBB1_1069 9199 # %bb.200: 9200 mov rax, qword ptr [rcx] 9201 mov esi, r9d 9202 lea rdi, [rsi - 1] 9203 mov r9d, esi 9204 and r9d, 3 9205 cmp rdi, 3 9206 jae .LBB1_385 9207 # %bb.201: 9208 xor edi, edi 9209 jmp .LBB1_387 9210 .LBB1_202: 9211 test r9d, r9d 9212 jle .LBB1_1069 9213 # %bb.203: 9214 movss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero 9215 mov eax, r9d 9216 cmp r9d, 8 9217 jb .LBB1_204 9218 # %bb.390: 9219 lea rcx, [rdx + 4*rax] 9220 cmp rcx, r8 9221 jbe .LBB1_558 9222 # %bb.391: 9223 lea rcx, [r8 + 4*rax] 9224 cmp rcx, rdx 9225 jbe .LBB1_558 9226 .LBB1_204: 9227 xor ecx, ecx 9228 .LBB1_905: 9229 mov rsi, rcx 9230 not rsi 9231 add rsi, rax 9232 mov rdi, rax 9233 and rdi, 3 9234 je .LBB1_907 9235 .LBB1_906: # =>This Inner Loop Header: Depth=1 9236 movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero 9237 mulss xmm1, xmm0 9238 movss dword ptr [r8 + 4*rcx], xmm1 9239 add rcx, 1 9240 add rdi, -1 9241 jne .LBB1_906 9242 .LBB1_907: 9243 cmp rsi, 3 9244 jb .LBB1_1069 9245 .LBB1_908: # =>This Inner Loop Header: Depth=1 9246 movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero 9247 mulss xmm1, xmm0 9248 movss dword ptr [r8 + 4*rcx], xmm1 9249 movss xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero 9250 mulss xmm1, xmm0 9251 movss dword ptr [r8 + 4*rcx + 4], xmm1 9252 movss xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero 9253 mulss xmm1, xmm0 9254 movss dword ptr [r8 + 4*rcx + 8], xmm1 9255 movss xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero 9256 mulss xmm1, xmm0 9257 movss dword ptr [r8 + 4*rcx + 12], xmm1 9258 add rcx, 4 9259 cmp rax, rcx 9260 jne .LBB1_908 9261 jmp .LBB1_1069 9262 .LBB1_205: 9263 test r9d, r9d 9264 jle .LBB1_1069 9265 # %bb.206: 9266 mov rax, qword ptr 
[rcx] 9267 mov r10d, r9d 9268 cmp r9d, 4 9269 jb .LBB1_207 9270 # %bb.393: 9271 lea rcx, [rdx + 8*r10] 9272 cmp rcx, r8 9273 jbe .LBB1_561 9274 # %bb.394: 9275 lea rcx, [r8 + 8*r10] 9276 cmp rcx, rdx 9277 jbe .LBB1_561 9278 .LBB1_207: 9279 xor esi, esi 9280 .LBB1_913: 9281 mov r9, rsi 9282 not r9 9283 add r9, r10 9284 mov rdi, r10 9285 and rdi, 3 9286 je .LBB1_915 9287 .LBB1_914: # =>This Inner Loop Header: Depth=1 9288 mov rcx, qword ptr [rdx + 8*rsi] 9289 add rcx, rax 9290 mov qword ptr [r8 + 8*rsi], rcx 9291 add rsi, 1 9292 add rdi, -1 9293 jne .LBB1_914 9294 .LBB1_915: 9295 cmp r9, 3 9296 jb .LBB1_1069 9297 .LBB1_916: # =>This Inner Loop Header: Depth=1 9298 mov rcx, qword ptr [rdx + 8*rsi] 9299 add rcx, rax 9300 mov qword ptr [r8 + 8*rsi], rcx 9301 mov rcx, qword ptr [rdx + 8*rsi + 8] 9302 add rcx, rax 9303 mov qword ptr [r8 + 8*rsi + 8], rcx 9304 mov rcx, qword ptr [rdx + 8*rsi + 16] 9305 add rcx, rax 9306 mov qword ptr [r8 + 8*rsi + 16], rcx 9307 mov rcx, qword ptr [rdx + 8*rsi + 24] 9308 add rcx, rax 9309 mov qword ptr [r8 + 8*rsi + 24], rcx 9310 add rsi, 4 9311 cmp r10, rsi 9312 jne .LBB1_916 9313 jmp .LBB1_1069 9314 .LBB1_208: 9315 test r9d, r9d 9316 jle .LBB1_1069 9317 # %bb.209: 9318 movss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero 9319 mov eax, r9d 9320 cmp r9d, 8 9321 jb .LBB1_210 9322 # %bb.396: 9323 lea rcx, [rdx + 4*rax] 9324 cmp rcx, r8 9325 jbe .LBB1_564 9326 # %bb.397: 9327 lea rcx, [r8 + 4*rax] 9328 cmp rcx, rdx 9329 jbe .LBB1_564 9330 .LBB1_210: 9331 xor ecx, ecx 9332 .LBB1_921: 9333 mov rsi, rcx 9334 not rsi 9335 add rsi, rax 9336 mov rdi, rax 9337 and rdi, 3 9338 je .LBB1_923 9339 .LBB1_922: # =>This Inner Loop Header: Depth=1 9340 movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero 9341 addss xmm1, xmm0 9342 movss dword ptr [r8 + 4*rcx], xmm1 9343 add rcx, 1 9344 add rdi, -1 9345 jne .LBB1_922 9346 .LBB1_923: 9347 cmp rsi, 3 9348 jb .LBB1_1069 9349 .LBB1_924: # =>This Inner Loop Header: Depth=1 9350 movss xmm1, 
dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero 9351 addss xmm1, xmm0 9352 movss dword ptr [r8 + 4*rcx], xmm1 9353 movss xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero 9354 addss xmm1, xmm0 9355 movss dword ptr [r8 + 4*rcx + 4], xmm1 9356 movss xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero 9357 addss xmm1, xmm0 9358 movss dword ptr [r8 + 4*rcx + 8], xmm1 9359 movss xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero 9360 addss xmm1, xmm0 9361 movss dword ptr [r8 + 4*rcx + 12], xmm1 9362 add rcx, 4 9363 cmp rax, rcx 9364 jne .LBB1_924 9365 jmp .LBB1_1069 9366 .LBB1_211: 9367 test r9d, r9d 9368 jle .LBB1_1069 9369 # %bb.212: 9370 mov rax, qword ptr [rcx] 9371 mov r10d, r9d 9372 cmp r9d, 4 9373 jb .LBB1_213 9374 # %bb.399: 9375 lea rcx, [rdx + 8*r10] 9376 cmp rcx, r8 9377 jbe .LBB1_567 9378 # %bb.400: 9379 lea rcx, [r8 + 8*r10] 9380 cmp rcx, rdx 9381 jbe .LBB1_567 9382 .LBB1_213: 9383 xor esi, esi 9384 .LBB1_929: 9385 mov r9, rsi 9386 not r9 9387 add r9, r10 9388 mov rdi, r10 9389 and rdi, 3 9390 je .LBB1_931 9391 .LBB1_930: # =>This Inner Loop Header: Depth=1 9392 mov rcx, qword ptr [rdx + 8*rsi] 9393 sub rcx, rax 9394 mov qword ptr [r8 + 8*rsi], rcx 9395 add rsi, 1 9396 add rdi, -1 9397 jne .LBB1_930 9398 .LBB1_931: 9399 cmp r9, 3 9400 jb .LBB1_1069 9401 .LBB1_932: # =>This Inner Loop Header: Depth=1 9402 mov rcx, qword ptr [rdx + 8*rsi] 9403 sub rcx, rax 9404 mov qword ptr [r8 + 8*rsi], rcx 9405 mov rcx, qword ptr [rdx + 8*rsi + 8] 9406 sub rcx, rax 9407 mov qword ptr [r8 + 8*rsi + 8], rcx 9408 mov rcx, qword ptr [rdx + 8*rsi + 16] 9409 sub rcx, rax 9410 mov qword ptr [r8 + 8*rsi + 16], rcx 9411 mov rcx, qword ptr [rdx + 8*rsi + 24] 9412 sub rcx, rax 9413 mov qword ptr [r8 + 8*rsi + 24], rcx 9414 add rsi, 4 9415 cmp r10, rsi 9416 jne .LBB1_932 9417 jmp .LBB1_1069 9418 .LBB1_214: 9419 test r9d, r9d 9420 jle .LBB1_1069 9421 # %bb.215: 9422 movss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero 9423 mov 
eax, r9d 9424 cmp r9d, 8 9425 jb .LBB1_216 9426 # %bb.402: 9427 lea rcx, [rdx + 4*rax] 9428 cmp rcx, r8 9429 jbe .LBB1_570 9430 # %bb.403: 9431 lea rcx, [r8 + 4*rax] 9432 cmp rcx, rdx 9433 jbe .LBB1_570 9434 .LBB1_216: 9435 xor ecx, ecx 9436 .LBB1_937: 9437 mov rsi, rcx 9438 not rsi 9439 add rsi, rax 9440 mov rdi, rax 9441 and rdi, 3 9442 je .LBB1_939 9443 .LBB1_938: # =>This Inner Loop Header: Depth=1 9444 movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero 9445 subss xmm1, xmm0 9446 movss dword ptr [r8 + 4*rcx], xmm1 9447 add rcx, 1 9448 add rdi, -1 9449 jne .LBB1_938 9450 .LBB1_939: 9451 cmp rsi, 3 9452 jb .LBB1_1069 9453 .LBB1_940: # =>This Inner Loop Header: Depth=1 9454 movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero 9455 subss xmm1, xmm0 9456 movss dword ptr [r8 + 4*rcx], xmm1 9457 movss xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero 9458 subss xmm1, xmm0 9459 movss dword ptr [r8 + 4*rcx + 4], xmm1 9460 movss xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero 9461 subss xmm1, xmm0 9462 movss dword ptr [r8 + 4*rcx + 8], xmm1 9463 movss xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero 9464 subss xmm1, xmm0 9465 movss dword ptr [r8 + 4*rcx + 12], xmm1 9466 add rcx, 4 9467 cmp rax, rcx 9468 jne .LBB1_940 9469 jmp .LBB1_1069 9470 .LBB1_217: 9471 test r9d, r9d 9472 jle .LBB1_1069 9473 # %bb.218: 9474 mov rax, qword ptr [rcx] 9475 mov r10d, r9d 9476 cmp r9d, 4 9477 jb .LBB1_219 9478 # %bb.405: 9479 lea rcx, [rdx + 8*r10] 9480 cmp rcx, r8 9481 jbe .LBB1_573 9482 # %bb.406: 9483 lea rcx, [r8 + 8*r10] 9484 cmp rcx, rdx 9485 jbe .LBB1_573 9486 .LBB1_219: 9487 xor esi, esi 9488 .LBB1_945: 9489 mov r9, rsi 9490 not r9 9491 add r9, r10 9492 mov rdi, r10 9493 and rdi, 3 9494 je .LBB1_947 9495 .LBB1_946: # =>This Inner Loop Header: Depth=1 9496 mov rcx, qword ptr [rdx + 8*rsi] 9497 add rcx, rax 9498 mov qword ptr [r8 + 8*rsi], rcx 9499 add rsi, 1 9500 add rdi, -1 9501 jne .LBB1_946 9502 
.LBB1_947: 9503 cmp r9, 3 9504 jb .LBB1_1069 9505 .LBB1_948: # =>This Inner Loop Header: Depth=1 9506 mov rcx, qword ptr [rdx + 8*rsi] 9507 add rcx, rax 9508 mov qword ptr [r8 + 8*rsi], rcx 9509 mov rcx, qword ptr [rdx + 8*rsi + 8] 9510 add rcx, rax 9511 mov qword ptr [r8 + 8*rsi + 8], rcx 9512 mov rcx, qword ptr [rdx + 8*rsi + 16] 9513 add rcx, rax 9514 mov qword ptr [r8 + 8*rsi + 16], rcx 9515 mov rcx, qword ptr [rdx + 8*rsi + 24] 9516 add rcx, rax 9517 mov qword ptr [r8 + 8*rsi + 24], rcx 9518 add rsi, 4 9519 cmp r10, rsi 9520 jne .LBB1_948 9521 jmp .LBB1_1069 9522 .LBB1_220: 9523 test r9d, r9d 9524 jle .LBB1_1069 9525 # %bb.221: 9526 movss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero 9527 mov eax, r9d 9528 cmp r9d, 8 9529 jb .LBB1_222 9530 # %bb.408: 9531 lea rcx, [rdx + 4*rax] 9532 cmp rcx, r8 9533 jbe .LBB1_576 9534 # %bb.409: 9535 lea rcx, [r8 + 4*rax] 9536 cmp rcx, rdx 9537 jbe .LBB1_576 9538 .LBB1_222: 9539 xor ecx, ecx 9540 .LBB1_953: 9541 mov rsi, rcx 9542 not rsi 9543 add rsi, rax 9544 mov rdi, rax 9545 and rdi, 3 9546 je .LBB1_955 9547 .LBB1_954: # =>This Inner Loop Header: Depth=1 9548 movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero 9549 addss xmm1, xmm0 9550 movss dword ptr [r8 + 4*rcx], xmm1 9551 add rcx, 1 9552 add rdi, -1 9553 jne .LBB1_954 9554 .LBB1_955: 9555 cmp rsi, 3 9556 jb .LBB1_1069 9557 .LBB1_956: # =>This Inner Loop Header: Depth=1 9558 movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero 9559 addss xmm1, xmm0 9560 movss dword ptr [r8 + 4*rcx], xmm1 9561 movss xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero 9562 addss xmm1, xmm0 9563 movss dword ptr [r8 + 4*rcx + 4], xmm1 9564 movss xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero 9565 addss xmm1, xmm0 9566 movss dword ptr [r8 + 4*rcx + 8], xmm1 9567 movss xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero 9568 addss xmm1, xmm0 9569 movss dword ptr [r8 + 4*rcx + 12], xmm1 9570 add rcx, 4 9571 cmp 
rax, rcx 9572 jne .LBB1_956 9573 jmp .LBB1_1069 9574 .LBB1_223: 9575 test r9d, r9d 9576 jle .LBB1_1069 9577 # %bb.224: 9578 mov rax, qword ptr [rcx] 9579 mov r10d, r9d 9580 cmp r9d, 4 9581 jb .LBB1_225 9582 # %bb.411: 9583 lea rcx, [rdx + 8*r10] 9584 cmp rcx, r8 9585 jbe .LBB1_579 9586 # %bb.412: 9587 lea rcx, [r8 + 8*r10] 9588 cmp rcx, rdx 9589 jbe .LBB1_579 9590 .LBB1_225: 9591 xor esi, esi 9592 .LBB1_961: 9593 mov r9, rsi 9594 not r9 9595 add r9, r10 9596 mov rdi, r10 9597 and rdi, 3 9598 je .LBB1_963 9599 .LBB1_962: # =>This Inner Loop Header: Depth=1 9600 mov rcx, qword ptr [rdx + 8*rsi] 9601 sub rcx, rax 9602 mov qword ptr [r8 + 8*rsi], rcx 9603 add rsi, 1 9604 add rdi, -1 9605 jne .LBB1_962 9606 .LBB1_963: 9607 cmp r9, 3 9608 jb .LBB1_1069 9609 .LBB1_964: # =>This Inner Loop Header: Depth=1 9610 mov rcx, qword ptr [rdx + 8*rsi] 9611 sub rcx, rax 9612 mov qword ptr [r8 + 8*rsi], rcx 9613 mov rcx, qword ptr [rdx + 8*rsi + 8] 9614 sub rcx, rax 9615 mov qword ptr [r8 + 8*rsi + 8], rcx 9616 mov rcx, qword ptr [rdx + 8*rsi + 16] 9617 sub rcx, rax 9618 mov qword ptr [r8 + 8*rsi + 16], rcx 9619 mov rcx, qword ptr [rdx + 8*rsi + 24] 9620 sub rcx, rax 9621 mov qword ptr [r8 + 8*rsi + 24], rcx 9622 add rsi, 4 9623 cmp r10, rsi 9624 jne .LBB1_964 9625 jmp .LBB1_1069 9626 .LBB1_226: 9627 test r9d, r9d 9628 jle .LBB1_1069 9629 # %bb.227: 9630 movss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero 9631 mov eax, r9d 9632 cmp r9d, 8 9633 jb .LBB1_228 9634 # %bb.414: 9635 lea rcx, [rdx + 4*rax] 9636 cmp rcx, r8 9637 jbe .LBB1_582 9638 # %bb.415: 9639 lea rcx, [r8 + 4*rax] 9640 cmp rcx, rdx 9641 jbe .LBB1_582 9642 .LBB1_228: 9643 xor ecx, ecx 9644 .LBB1_969: 9645 mov rsi, rcx 9646 not rsi 9647 add rsi, rax 9648 mov rdi, rax 9649 and rdi, 3 9650 je .LBB1_971 9651 .LBB1_970: # =>This Inner Loop Header: Depth=1 9652 movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero 9653 subss xmm1, xmm0 9654 movss dword ptr [r8 + 4*rcx], xmm1 9655 add rcx, 1 9656 add rdi, -1 
9657 jne .LBB1_970 9658 .LBB1_971: 9659 cmp rsi, 3 9660 jb .LBB1_1069 9661 .LBB1_972: # =>This Inner Loop Header: Depth=1 9662 movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero 9663 subss xmm1, xmm0 9664 movss dword ptr [r8 + 4*rcx], xmm1 9665 movss xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero 9666 subss xmm1, xmm0 9667 movss dword ptr [r8 + 4*rcx + 4], xmm1 9668 movss xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero 9669 subss xmm1, xmm0 9670 movss dword ptr [r8 + 4*rcx + 8], xmm1 9671 movss xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero 9672 subss xmm1, xmm0 9673 movss dword ptr [r8 + 4*rcx + 12], xmm1 9674 add rcx, 4 9675 cmp rax, rcx 9676 jne .LBB1_972 9677 jmp .LBB1_1069 9678 .LBB1_229: 9679 test r9d, r9d 9680 jle .LBB1_1069 9681 # %bb.230: 9682 mov cl, byte ptr [rcx] 9683 mov r10d, r9d 9684 cmp r9d, 32 9685 jb .LBB1_231 9686 # %bb.417: 9687 lea rax, [rdx + r10] 9688 cmp rax, r8 9689 jbe .LBB1_585 9690 # %bb.418: 9691 lea rax, [r8 + r10] 9692 cmp rax, rdx 9693 jbe .LBB1_585 9694 .LBB1_231: 9695 xor edi, edi 9696 .LBB1_977: 9697 mov r9, rdi 9698 not r9 9699 add r9, r10 9700 mov rsi, r10 9701 and rsi, 3 9702 je .LBB1_979 9703 .LBB1_978: # =>This Inner Loop Header: Depth=1 9704 movzx eax, byte ptr [rdx + rdi] 9705 mul cl 9706 mov byte ptr [r8 + rdi], al 9707 add rdi, 1 9708 add rsi, -1 9709 jne .LBB1_978 9710 .LBB1_979: 9711 cmp r9, 3 9712 jb .LBB1_1069 9713 .LBB1_980: # =>This Inner Loop Header: Depth=1 9714 movzx eax, byte ptr [rdx + rdi] 9715 mul cl 9716 mov byte ptr [r8 + rdi], al 9717 movzx eax, byte ptr [rdx + rdi + 1] 9718 mul cl 9719 mov byte ptr [r8 + rdi + 1], al 9720 movzx eax, byte ptr [rdx + rdi + 2] 9721 mul cl 9722 mov byte ptr [r8 + rdi + 2], al 9723 movzx eax, byte ptr [rdx + rdi + 3] 9724 mul cl 9725 mov byte ptr [r8 + rdi + 3], al 9726 add rdi, 4 9727 cmp r10, rdi 9728 jne .LBB1_980 9729 jmp .LBB1_1069 9730 .LBB1_232: 9731 test r9d, r9d 9732 jle .LBB1_1069 9733 # %bb.233: 
9734 mov cl, byte ptr [rcx] 9735 mov r10d, r9d 9736 cmp r9d, 32 9737 jb .LBB1_234 9738 # %bb.420: 9739 lea rax, [rdx + r10] 9740 cmp rax, r8 9741 jbe .LBB1_588 9742 # %bb.421: 9743 lea rax, [r8 + r10] 9744 cmp rax, rdx 9745 jbe .LBB1_588 9746 .LBB1_234: 9747 xor edi, edi 9748 .LBB1_985: 9749 mov r9, rdi 9750 not r9 9751 add r9, r10 9752 mov rsi, r10 9753 and rsi, 3 9754 je .LBB1_987 9755 .LBB1_986: # =>This Inner Loop Header: Depth=1 9756 movzx eax, byte ptr [rdx + rdi] 9757 mul cl 9758 mov byte ptr [r8 + rdi], al 9759 add rdi, 1 9760 add rsi, -1 9761 jne .LBB1_986 9762 .LBB1_987: 9763 cmp r9, 3 9764 jb .LBB1_1069 9765 .LBB1_988: # =>This Inner Loop Header: Depth=1 9766 movzx eax, byte ptr [rdx + rdi] 9767 mul cl 9768 mov byte ptr [r8 + rdi], al 9769 movzx eax, byte ptr [rdx + rdi + 1] 9770 mul cl 9771 mov byte ptr [r8 + rdi + 1], al 9772 movzx eax, byte ptr [rdx + rdi + 2] 9773 mul cl 9774 mov byte ptr [r8 + rdi + 2], al 9775 movzx eax, byte ptr [rdx + rdi + 3] 9776 mul cl 9777 mov byte ptr [r8 + rdi + 3], al 9778 add rdi, 4 9779 cmp r10, rdi 9780 jne .LBB1_988 9781 jmp .LBB1_1069 9782 .LBB1_235: 9783 test r9d, r9d 9784 jle .LBB1_1069 9785 # %bb.236: 9786 mov al, byte ptr [rcx] 9787 mov r10d, r9d 9788 cmp r9d, 32 9789 jb .LBB1_237 9790 # %bb.423: 9791 lea rcx, [rdx + r10] 9792 cmp rcx, r8 9793 jbe .LBB1_591 9794 # %bb.424: 9795 lea rcx, [r8 + r10] 9796 cmp rcx, rdx 9797 jbe .LBB1_591 9798 .LBB1_237: 9799 xor esi, esi 9800 .LBB1_993: 9801 mov r9, rsi 9802 not r9 9803 add r9, r10 9804 mov rdi, r10 9805 and rdi, 3 9806 je .LBB1_995 9807 .LBB1_994: # =>This Inner Loop Header: Depth=1 9808 movzx ecx, byte ptr [rdx + rsi] 9809 add cl, al 9810 mov byte ptr [r8 + rsi], cl 9811 add rsi, 1 9812 add rdi, -1 9813 jne .LBB1_994 9814 .LBB1_995: 9815 cmp r9, 3 9816 jb .LBB1_1069 9817 .LBB1_996: # =>This Inner Loop Header: Depth=1 9818 movzx ecx, byte ptr [rdx + rsi] 9819 add cl, al 9820 mov byte ptr [r8 + rsi], cl 9821 movzx ecx, byte ptr [rdx + rsi + 1] 9822 add cl, al 9823 mov 
byte ptr [r8 + rsi + 1], cl 9824 movzx ecx, byte ptr [rdx + rsi + 2] 9825 add cl, al 9826 mov byte ptr [r8 + rsi + 2], cl 9827 movzx ecx, byte ptr [rdx + rsi + 3] 9828 add cl, al 9829 mov byte ptr [r8 + rsi + 3], cl 9830 add rsi, 4 9831 cmp r10, rsi 9832 jne .LBB1_996 9833 jmp .LBB1_1069 9834 .LBB1_238: 9835 test r9d, r9d 9836 jle .LBB1_1069 9837 # %bb.239: 9838 mov al, byte ptr [rcx] 9839 mov r10d, r9d 9840 cmp r9d, 32 9841 jb .LBB1_240 9842 # %bb.426: 9843 lea rcx, [rdx + r10] 9844 cmp rcx, r8 9845 jbe .LBB1_594 9846 # %bb.427: 9847 lea rcx, [r8 + r10] 9848 cmp rcx, rdx 9849 jbe .LBB1_594 9850 .LBB1_240: 9851 xor esi, esi 9852 .LBB1_1001: 9853 mov r9, rsi 9854 not r9 9855 add r9, r10 9856 mov rdi, r10 9857 and rdi, 3 9858 je .LBB1_1003 9859 .LBB1_1002: # =>This Inner Loop Header: Depth=1 9860 movzx ecx, byte ptr [rdx + rsi] 9861 sub cl, al 9862 mov byte ptr [r8 + rsi], cl 9863 add rsi, 1 9864 add rdi, -1 9865 jne .LBB1_1002 9866 .LBB1_1003: 9867 cmp r9, 3 9868 jb .LBB1_1069 9869 .LBB1_1004: # =>This Inner Loop Header: Depth=1 9870 movzx ecx, byte ptr [rdx + rsi] 9871 sub cl, al 9872 mov byte ptr [r8 + rsi], cl 9873 movzx ecx, byte ptr [rdx + rsi + 1] 9874 sub cl, al 9875 mov byte ptr [r8 + rsi + 1], cl 9876 movzx ecx, byte ptr [rdx + rsi + 2] 9877 sub cl, al 9878 mov byte ptr [r8 + rsi + 2], cl 9879 movzx ecx, byte ptr [rdx + rsi + 3] 9880 sub cl, al 9881 mov byte ptr [r8 + rsi + 3], cl 9882 add rsi, 4 9883 cmp r10, rsi 9884 jne .LBB1_1004 9885 jmp .LBB1_1069 9886 .LBB1_241: 9887 test r9d, r9d 9888 jle .LBB1_1069 9889 # %bb.242: 9890 mov al, byte ptr [rcx] 9891 mov r10d, r9d 9892 cmp r9d, 32 9893 jb .LBB1_243 9894 # %bb.429: 9895 lea rcx, [rdx + r10] 9896 cmp rcx, r8 9897 jbe .LBB1_597 9898 # %bb.430: 9899 lea rcx, [r8 + r10] 9900 cmp rcx, rdx 9901 jbe .LBB1_597 9902 .LBB1_243: 9903 xor esi, esi 9904 .LBB1_1009: 9905 mov r9, rsi 9906 not r9 9907 add r9, r10 9908 mov rdi, r10 9909 and rdi, 3 9910 je .LBB1_1011 9911 .LBB1_1010: # =>This Inner Loop Header: Depth=1 
9912 movzx ecx, byte ptr [rdx + rsi] 9913 add cl, al 9914 mov byte ptr [r8 + rsi], cl 9915 add rsi, 1 9916 add rdi, -1 9917 jne .LBB1_1010 9918 .LBB1_1011: 9919 cmp r9, 3 9920 jb .LBB1_1069 9921 .LBB1_1012: # =>This Inner Loop Header: Depth=1 9922 movzx ecx, byte ptr [rdx + rsi] 9923 add cl, al 9924 mov byte ptr [r8 + rsi], cl 9925 movzx ecx, byte ptr [rdx + rsi + 1] 9926 add cl, al 9927 mov byte ptr [r8 + rsi + 1], cl 9928 movzx ecx, byte ptr [rdx + rsi + 2] 9929 add cl, al 9930 mov byte ptr [r8 + rsi + 2], cl 9931 movzx ecx, byte ptr [rdx + rsi + 3] 9932 add cl, al 9933 mov byte ptr [r8 + rsi + 3], cl 9934 add rsi, 4 9935 cmp r10, rsi 9936 jne .LBB1_1012 9937 jmp .LBB1_1069 9938 .LBB1_244: 9939 test r9d, r9d 9940 jle .LBB1_1069 9941 # %bb.245: 9942 mov al, byte ptr [rcx] 9943 mov r10d, r9d 9944 cmp r9d, 32 9945 jb .LBB1_246 9946 # %bb.432: 9947 lea rcx, [rdx + r10] 9948 cmp rcx, r8 9949 jbe .LBB1_600 9950 # %bb.433: 9951 lea rcx, [r8 + r10] 9952 cmp rcx, rdx 9953 jbe .LBB1_600 9954 .LBB1_246: 9955 xor esi, esi 9956 .LBB1_1017: 9957 mov r9, rsi 9958 not r9 9959 add r9, r10 9960 mov rdi, r10 9961 and rdi, 3 9962 je .LBB1_1019 9963 .LBB1_1018: # =>This Inner Loop Header: Depth=1 9964 movzx ecx, byte ptr [rdx + rsi] 9965 sub cl, al 9966 mov byte ptr [r8 + rsi], cl 9967 add rsi, 1 9968 add rdi, -1 9969 jne .LBB1_1018 9970 .LBB1_1019: 9971 cmp r9, 3 9972 jb .LBB1_1069 9973 .LBB1_1020: # =>This Inner Loop Header: Depth=1 9974 movzx ecx, byte ptr [rdx + rsi] 9975 sub cl, al 9976 mov byte ptr [r8 + rsi], cl 9977 movzx ecx, byte ptr [rdx + rsi + 1] 9978 sub cl, al 9979 mov byte ptr [r8 + rsi + 1], cl 9980 movzx ecx, byte ptr [rdx + rsi + 2] 9981 sub cl, al 9982 mov byte ptr [r8 + rsi + 2], cl 9983 movzx ecx, byte ptr [rdx + rsi + 3] 9984 sub cl, al 9985 mov byte ptr [r8 + rsi + 3], cl 9986 add rsi, 4 9987 cmp r10, rsi 9988 jne .LBB1_1020 9989 jmp .LBB1_1069 9990 .LBB1_247: 9991 test r9d, r9d 9992 jle .LBB1_1069 9993 # %bb.248: 9994 mov eax, dword ptr [rcx] 9995 mov r10d, 
r9d 9996 cmp r9d, 8 9997 jb .LBB1_249 9998 # %bb.435: 9999 lea rcx, [rdx + 4*r10] 10000 cmp rcx, r8 10001 jbe .LBB1_603 10002 # %bb.436: 10003 lea rcx, [r8 + 4*r10] 10004 cmp rcx, rdx 10005 jbe .LBB1_603 10006 .LBB1_249: 10007 xor esi, esi 10008 .LBB1_1025: 10009 mov r9, rsi 10010 not r9 10011 add r9, r10 10012 mov rdi, r10 10013 and rdi, 3 10014 je .LBB1_1027 10015 .LBB1_1026: # =>This Inner Loop Header: Depth=1 10016 mov ecx, dword ptr [rdx + 4*rsi] 10017 imul ecx, eax 10018 mov dword ptr [r8 + 4*rsi], ecx 10019 add rsi, 1 10020 add rdi, -1 10021 jne .LBB1_1026 10022 .LBB1_1027: 10023 cmp r9, 3 10024 jb .LBB1_1069 10025 .LBB1_1028: # =>This Inner Loop Header: Depth=1 10026 mov ecx, dword ptr [rdx + 4*rsi] 10027 imul ecx, eax 10028 mov dword ptr [r8 + 4*rsi], ecx 10029 mov ecx, dword ptr [rdx + 4*rsi + 4] 10030 imul ecx, eax 10031 mov dword ptr [r8 + 4*rsi + 4], ecx 10032 mov ecx, dword ptr [rdx + 4*rsi + 8] 10033 imul ecx, eax 10034 mov dword ptr [r8 + 4*rsi + 8], ecx 10035 mov ecx, dword ptr [rdx + 4*rsi + 12] 10036 imul ecx, eax 10037 mov dword ptr [r8 + 4*rsi + 12], ecx 10038 add rsi, 4 10039 cmp r10, rsi 10040 jne .LBB1_1028 10041 jmp .LBB1_1069 10042 .LBB1_250: 10043 test r9d, r9d 10044 jle .LBB1_1069 10045 # %bb.251: 10046 mov eax, dword ptr [rcx] 10047 mov r10d, r9d 10048 cmp r9d, 8 10049 jb .LBB1_252 10050 # %bb.438: 10051 lea rcx, [rdx + 4*r10] 10052 cmp rcx, r8 10053 jbe .LBB1_606 10054 # %bb.439: 10055 lea rcx, [r8 + 4*r10] 10056 cmp rcx, rdx 10057 jbe .LBB1_606 10058 .LBB1_252: 10059 xor esi, esi 10060 .LBB1_1033: 10061 mov r9, rsi 10062 not r9 10063 add r9, r10 10064 mov rdi, r10 10065 and rdi, 3 10066 je .LBB1_1035 10067 .LBB1_1034: # =>This Inner Loop Header: Depth=1 10068 mov ecx, dword ptr [rdx + 4*rsi] 10069 imul ecx, eax 10070 mov dword ptr [r8 + 4*rsi], ecx 10071 add rsi, 1 10072 add rdi, -1 10073 jne .LBB1_1034 10074 .LBB1_1035: 10075 cmp r9, 3 10076 jb .LBB1_1069 10077 .LBB1_1036: # =>This Inner Loop Header: Depth=1 10078 mov ecx, dword ptr 
[rdx + 4*rsi] 10079 imul ecx, eax 10080 mov dword ptr [r8 + 4*rsi], ecx 10081 mov ecx, dword ptr [rdx + 4*rsi + 4] 10082 imul ecx, eax 10083 mov dword ptr [r8 + 4*rsi + 4], ecx 10084 mov ecx, dword ptr [rdx + 4*rsi + 8] 10085 imul ecx, eax 10086 mov dword ptr [r8 + 4*rsi + 8], ecx 10087 mov ecx, dword ptr [rdx + 4*rsi + 12] 10088 imul ecx, eax 10089 mov dword ptr [r8 + 4*rsi + 12], ecx 10090 add rsi, 4 10091 cmp r10, rsi 10092 jne .LBB1_1036 10093 jmp .LBB1_1069 10094 .LBB1_253: 10095 test r9d, r9d 10096 jle .LBB1_1069 10097 # %bb.254: 10098 mov eax, dword ptr [rcx] 10099 mov r10d, r9d 10100 cmp r9d, 8 10101 jb .LBB1_255 10102 # %bb.441: 10103 lea rcx, [rdx + 4*r10] 10104 cmp rcx, r8 10105 jbe .LBB1_609 10106 # %bb.442: 10107 lea rcx, [r8 + 4*r10] 10108 cmp rcx, rdx 10109 jbe .LBB1_609 10110 .LBB1_255: 10111 xor esi, esi 10112 .LBB1_1041: 10113 mov r9, rsi 10114 not r9 10115 add r9, r10 10116 mov rdi, r10 10117 and rdi, 3 10118 je .LBB1_1043 10119 .LBB1_1042: # =>This Inner Loop Header: Depth=1 10120 mov ecx, dword ptr [rdx + 4*rsi] 10121 add ecx, eax 10122 mov dword ptr [r8 + 4*rsi], ecx 10123 add rsi, 1 10124 add rdi, -1 10125 jne .LBB1_1042 10126 .LBB1_1043: 10127 cmp r9, 3 10128 jb .LBB1_1069 10129 .LBB1_1044: # =>This Inner Loop Header: Depth=1 10130 mov ecx, dword ptr [rdx + 4*rsi] 10131 add ecx, eax 10132 mov dword ptr [r8 + 4*rsi], ecx 10133 mov ecx, dword ptr [rdx + 4*rsi + 4] 10134 add ecx, eax 10135 mov dword ptr [r8 + 4*rsi + 4], ecx 10136 mov ecx, dword ptr [rdx + 4*rsi + 8] 10137 add ecx, eax 10138 mov dword ptr [r8 + 4*rsi + 8], ecx 10139 mov ecx, dword ptr [rdx + 4*rsi + 12] 10140 add ecx, eax 10141 mov dword ptr [r8 + 4*rsi + 12], ecx 10142 add rsi, 4 10143 cmp r10, rsi 10144 jne .LBB1_1044 10145 jmp .LBB1_1069 10146 .LBB1_256: 10147 test r9d, r9d 10148 jle .LBB1_1069 10149 # %bb.257: 10150 mov eax, dword ptr [rcx] 10151 mov r10d, r9d 10152 cmp r9d, 8 10153 jb .LBB1_258 10154 # %bb.444: 10155 lea rcx, [rdx + 4*r10] 10156 cmp rcx, r8 10157 jbe 
.LBB1_612 10158 # %bb.445: 10159 lea rcx, [r8 + 4*r10] 10160 cmp rcx, rdx 10161 jbe .LBB1_612 10162 .LBB1_258: 10163 xor esi, esi 10164 .LBB1_1049: 10165 mov r9, rsi 10166 not r9 10167 add r9, r10 10168 mov rdi, r10 10169 and rdi, 3 10170 je .LBB1_1051 10171 .LBB1_1050: # =>This Inner Loop Header: Depth=1 10172 mov ecx, dword ptr [rdx + 4*rsi] 10173 sub ecx, eax 10174 mov dword ptr [r8 + 4*rsi], ecx 10175 add rsi, 1 10176 add rdi, -1 10177 jne .LBB1_1050 10178 .LBB1_1051: 10179 cmp r9, 3 10180 jb .LBB1_1069 10181 .LBB1_1052: # =>This Inner Loop Header: Depth=1 10182 mov ecx, dword ptr [rdx + 4*rsi] 10183 sub ecx, eax 10184 mov dword ptr [r8 + 4*rsi], ecx 10185 mov ecx, dword ptr [rdx + 4*rsi + 4] 10186 sub ecx, eax 10187 mov dword ptr [r8 + 4*rsi + 4], ecx 10188 mov ecx, dword ptr [rdx + 4*rsi + 8] 10189 sub ecx, eax 10190 mov dword ptr [r8 + 4*rsi + 8], ecx 10191 mov ecx, dword ptr [rdx + 4*rsi + 12] 10192 sub ecx, eax 10193 mov dword ptr [r8 + 4*rsi + 12], ecx 10194 add rsi, 4 10195 cmp r10, rsi 10196 jne .LBB1_1052 10197 jmp .LBB1_1069 10198 .LBB1_259: 10199 test r9d, r9d 10200 jle .LBB1_1069 10201 # %bb.260: 10202 mov eax, dword ptr [rcx] 10203 mov r10d, r9d 10204 cmp r9d, 8 10205 jb .LBB1_261 10206 # %bb.447: 10207 lea rcx, [rdx + 4*r10] 10208 cmp rcx, r8 10209 jbe .LBB1_615 10210 # %bb.448: 10211 lea rcx, [r8 + 4*r10] 10212 cmp rcx, rdx 10213 jbe .LBB1_615 10214 .LBB1_261: 10215 xor esi, esi 10216 .LBB1_1057: 10217 mov r9, rsi 10218 not r9 10219 add r9, r10 10220 mov rdi, r10 10221 and rdi, 3 10222 je .LBB1_1059 10223 .LBB1_1058: # =>This Inner Loop Header: Depth=1 10224 mov ecx, dword ptr [rdx + 4*rsi] 10225 add ecx, eax 10226 mov dword ptr [r8 + 4*rsi], ecx 10227 add rsi, 1 10228 add rdi, -1 10229 jne .LBB1_1058 10230 .LBB1_1059: 10231 cmp r9, 3 10232 jb .LBB1_1069 10233 .LBB1_1060: # =>This Inner Loop Header: Depth=1 10234 mov ecx, dword ptr [rdx + 4*rsi] 10235 add ecx, eax 10236 mov dword ptr [r8 + 4*rsi], ecx 10237 mov ecx, dword ptr [rdx + 4*rsi + 4] 
10238 add ecx, eax 10239 mov dword ptr [r8 + 4*rsi + 4], ecx 10240 mov ecx, dword ptr [rdx + 4*rsi + 8] 10241 add ecx, eax 10242 mov dword ptr [r8 + 4*rsi + 8], ecx 10243 mov ecx, dword ptr [rdx + 4*rsi + 12] 10244 add ecx, eax 10245 mov dword ptr [r8 + 4*rsi + 12], ecx 10246 add rsi, 4 10247 cmp r10, rsi 10248 jne .LBB1_1060 10249 jmp .LBB1_1069 10250 .LBB1_262: 10251 test r9d, r9d 10252 jle .LBB1_1069 10253 # %bb.263: 10254 mov eax, dword ptr [rcx] 10255 mov r10d, r9d 10256 cmp r9d, 8 10257 jb .LBB1_264 10258 # %bb.450: 10259 lea rcx, [rdx + 4*r10] 10260 cmp rcx, r8 10261 jbe .LBB1_618 10262 # %bb.451: 10263 lea rcx, [r8 + 4*r10] 10264 cmp rcx, rdx 10265 jbe .LBB1_618 10266 .LBB1_264: 10267 xor esi, esi 10268 .LBB1_1065: 10269 mov r9, rsi 10270 not r9 10271 add r9, r10 10272 mov rdi, r10 10273 and rdi, 3 10274 je .LBB1_1067 10275 .LBB1_1066: # =>This Inner Loop Header: Depth=1 10276 mov ecx, dword ptr [rdx + 4*rsi] 10277 sub ecx, eax 10278 mov dword ptr [r8 + 4*rsi], ecx 10279 add rsi, 1 10280 add rdi, -1 10281 jne .LBB1_1066 10282 .LBB1_1067: 10283 cmp r9, 3 10284 jb .LBB1_1069 10285 .LBB1_1068: # =>This Inner Loop Header: Depth=1 10286 mov ecx, dword ptr [rdx + 4*rsi] 10287 sub ecx, eax 10288 mov dword ptr [r8 + 4*rsi], ecx 10289 mov ecx, dword ptr [rdx + 4*rsi + 4] 10290 sub ecx, eax 10291 mov dword ptr [r8 + 4*rsi + 4], ecx 10292 mov ecx, dword ptr [rdx + 4*rsi + 8] 10293 sub ecx, eax 10294 mov dword ptr [r8 + 4*rsi + 8], ecx 10295 mov ecx, dword ptr [rdx + 4*rsi + 12] 10296 sub ecx, eax 10297 mov dword ptr [r8 + 4*rsi + 12], ecx 10298 add rsi, 4 10299 cmp r10, rsi 10300 jne .LBB1_1068 10301 jmp .LBB1_1069 10302 .LBB1_319: 10303 and esi, -4 10304 xor edi, edi 10305 .LBB1_320: # =>This Inner Loop Header: Depth=1 10306 mov rcx, qword ptr [rdx + 8*rdi] 10307 imul rcx, rax 10308 mov qword ptr [r8 + 8*rdi], rcx 10309 mov rcx, qword ptr [rdx + 8*rdi + 8] 10310 imul rcx, rax 10311 mov qword ptr [r8 + 8*rdi + 8], rcx 10312 mov rcx, qword ptr [rdx + 8*rdi + 16] 10313 
imul rcx, rax 10314 mov qword ptr [r8 + 8*rdi + 16], rcx 10315 mov rcx, qword ptr [rdx + 8*rdi + 24] 10316 imul rcx, rax 10317 mov qword ptr [r8 + 8*rdi + 24], rcx 10318 add rdi, 4 10319 cmp rsi, rdi 10320 jne .LBB1_320 10321 .LBB1_321: 10322 test r9, r9 10323 je .LBB1_1069 10324 # %bb.322: 10325 lea rsi, [r8 + 8*rdi] 10326 lea rdx, [rdx + 8*rdi] 10327 xor edi, edi 10328 .LBB1_323: # =>This Inner Loop Header: Depth=1 10329 mov rcx, qword ptr [rdx + 8*rdi] 10330 imul rcx, rax 10331 mov qword ptr [rsi + 8*rdi], rcx 10332 add rdi, 1 10333 cmp r9, rdi 10334 jne .LBB1_323 10335 jmp .LBB1_1069 10336 .LBB1_324: 10337 and esi, -4 10338 xor edi, edi 10339 .LBB1_325: # =>This Inner Loop Header: Depth=1 10340 mov rcx, qword ptr [rdx + 8*rdi] 10341 imul rcx, rax 10342 mov qword ptr [r8 + 8*rdi], rcx 10343 mov rcx, qword ptr [rdx + 8*rdi + 8] 10344 imul rcx, rax 10345 mov qword ptr [r8 + 8*rdi + 8], rcx 10346 mov rcx, qword ptr [rdx + 8*rdi + 16] 10347 imul rcx, rax 10348 mov qword ptr [r8 + 8*rdi + 16], rcx 10349 mov rcx, qword ptr [rdx + 8*rdi + 24] 10350 imul rcx, rax 10351 mov qword ptr [r8 + 8*rdi + 24], rcx 10352 add rdi, 4 10353 cmp rsi, rdi 10354 jne .LBB1_325 10355 .LBB1_326: 10356 test r9, r9 10357 je .LBB1_1069 10358 # %bb.327: 10359 lea rsi, [r8 + 8*rdi] 10360 lea rdx, [rdx + 8*rdi] 10361 xor edi, edi 10362 .LBB1_328: # =>This Inner Loop Header: Depth=1 10363 mov rcx, qword ptr [rdx + 8*rdi] 10364 imul rcx, rax 10365 mov qword ptr [rsi + 8*rdi], rcx 10366 add rdi, 1 10367 cmp r9, rdi 10368 jne .LBB1_328 10369 jmp .LBB1_1069 10370 .LBB1_377: 10371 and esi, -4 10372 xor edi, edi 10373 .LBB1_378: # =>This Inner Loop Header: Depth=1 10374 mov rcx, qword ptr [rdx + 8*rdi] 10375 imul rcx, rax 10376 mov qword ptr [r8 + 8*rdi], rcx 10377 mov rcx, qword ptr [rdx + 8*rdi + 8] 10378 imul rcx, rax 10379 mov qword ptr [r8 + 8*rdi + 8], rcx 10380 mov rcx, qword ptr [rdx + 8*rdi + 16] 10381 imul rcx, rax 10382 mov qword ptr [r8 + 8*rdi + 16], rcx 10383 mov rcx, qword ptr [rdx + 
8*rdi + 24] 10384 imul rcx, rax 10385 mov qword ptr [r8 + 8*rdi + 24], rcx 10386 add rdi, 4 10387 cmp rsi, rdi 10388 jne .LBB1_378 10389 .LBB1_379: 10390 test r9, r9 10391 je .LBB1_1069 10392 # %bb.380: 10393 lea rsi, [r8 + 8*rdi] 10394 lea rdx, [rdx + 8*rdi] 10395 xor edi, edi 10396 .LBB1_381: # =>This Inner Loop Header: Depth=1 10397 mov rcx, qword ptr [rdx + 8*rdi] 10398 imul rcx, rax 10399 mov qword ptr [rsi + 8*rdi], rcx 10400 add rdi, 1 10401 cmp r9, rdi 10402 jne .LBB1_381 10403 jmp .LBB1_1069 10404 .LBB1_385: 10405 and esi, -4 10406 xor edi, edi 10407 .LBB1_386: # =>This Inner Loop Header: Depth=1 10408 mov rcx, qword ptr [rdx + 8*rdi] 10409 imul rcx, rax 10410 mov qword ptr [r8 + 8*rdi], rcx 10411 mov rcx, qword ptr [rdx + 8*rdi + 8] 10412 imul rcx, rax 10413 mov qword ptr [r8 + 8*rdi + 8], rcx 10414 mov rcx, qword ptr [rdx + 8*rdi + 16] 10415 imul rcx, rax 10416 mov qword ptr [r8 + 8*rdi + 16], rcx 10417 mov rcx, qword ptr [rdx + 8*rdi + 24] 10418 imul rcx, rax 10419 mov qword ptr [r8 + 8*rdi + 24], rcx 10420 add rdi, 4 10421 cmp rsi, rdi 10422 jne .LBB1_386 10423 .LBB1_387: 10424 test r9, r9 10425 je .LBB1_1069 10426 # %bb.388: 10427 lea rsi, [r8 + 8*rdi] 10428 lea rdx, [rdx + 8*rdi] 10429 xor edi, edi 10430 .LBB1_389: # =>This Inner Loop Header: Depth=1 10431 mov rcx, qword ptr [rdx + 8*rdi] 10432 imul rcx, rax 10433 mov qword ptr [rsi + 8*rdi], rcx 10434 add rdi, 1 10435 cmp r9, rdi 10436 jne .LBB1_389 10437 .LBB1_1069: 10438 mov rsp, rbp 10439 pop rbp 10440 ret 10441 .LBB1_453: 10442 mov esi, r10d 10443 and esi, -8 10444 movd xmm0, eax 10445 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 10446 lea rcx, [rsi - 8] 10447 mov r9, rcx 10448 shr r9, 3 10449 add r9, 1 10450 test rcx, rcx 10451 je .LBB1_621 10452 # %bb.454: 10453 mov rcx, r9 10454 and rcx, -2 10455 neg rcx 10456 xor edi, edi 10457 .LBB1_455: # =>This Inner Loop Header: Depth=1 10458 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 10459 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 10460 pmulld xmm1, xmm0 
10461 pmulld xmm2, xmm0 10462 movdqu xmmword ptr [r8 + 4*rdi], xmm1 10463 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 10464 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] 10465 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] 10466 pmulld xmm1, xmm0 10467 pmulld xmm2, xmm0 10468 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 10469 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 10470 add rdi, 16 10471 add rcx, 2 10472 jne .LBB1_455 10473 jmp .LBB1_622 10474 .LBB1_456: 10475 mov esi, r10d 10476 and esi, -8 10477 movd xmm0, eax 10478 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 10479 lea rcx, [rsi - 8] 10480 mov r9, rcx 10481 shr r9, 3 10482 add r9, 1 10483 test rcx, rcx 10484 je .LBB1_629 10485 # %bb.457: 10486 mov rcx, r9 10487 and rcx, -2 10488 neg rcx 10489 xor edi, edi 10490 .LBB1_458: # =>This Inner Loop Header: Depth=1 10491 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 10492 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 10493 pmulld xmm1, xmm0 10494 pmulld xmm2, xmm0 10495 movdqu xmmword ptr [r8 + 4*rdi], xmm1 10496 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 10497 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] 10498 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] 10499 pmulld xmm1, xmm0 10500 pmulld xmm2, xmm0 10501 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 10502 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 10503 add rdi, 16 10504 add rcx, 2 10505 jne .LBB1_458 10506 jmp .LBB1_630 10507 .LBB1_459: 10508 mov esi, r10d 10509 and esi, -8 10510 movd xmm0, eax 10511 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 10512 lea rcx, [rsi - 8] 10513 mov r9, rcx 10514 shr r9, 3 10515 add r9, 1 10516 test rcx, rcx 10517 je .LBB1_637 10518 # %bb.460: 10519 mov rcx, r9 10520 and rcx, -2 10521 neg rcx 10522 xor edi, edi 10523 .LBB1_461: # =>This Inner Loop Header: Depth=1 10524 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 10525 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 10526 paddd xmm1, xmm0 10527 paddd xmm2, xmm0 10528 movdqu xmmword ptr [r8 + 4*rdi], xmm1 10529 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 10530 movdqu 
xmm1, xmmword ptr [rdx + 4*rdi + 32] 10531 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] 10532 paddd xmm1, xmm0 10533 paddd xmm2, xmm0 10534 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 10535 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 10536 add rdi, 16 10537 add rcx, 2 10538 jne .LBB1_461 10539 jmp .LBB1_638 10540 .LBB1_462: 10541 mov esi, r10d 10542 and esi, -8 10543 movd xmm0, eax 10544 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 10545 lea rcx, [rsi - 8] 10546 mov r9, rcx 10547 shr r9, 3 10548 add r9, 1 10549 test rcx, rcx 10550 je .LBB1_645 10551 # %bb.463: 10552 mov rcx, r9 10553 and rcx, -2 10554 neg rcx 10555 xor edi, edi 10556 .LBB1_464: # =>This Inner Loop Header: Depth=1 10557 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 10558 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 10559 psubd xmm1, xmm0 10560 psubd xmm2, xmm0 10561 movdqu xmmword ptr [r8 + 4*rdi], xmm1 10562 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 10563 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] 10564 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] 10565 psubd xmm1, xmm0 10566 psubd xmm2, xmm0 10567 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 10568 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 10569 add rdi, 16 10570 add rcx, 2 10571 jne .LBB1_464 10572 jmp .LBB1_646 10573 .LBB1_465: 10574 mov esi, r10d 10575 and esi, -8 10576 movd xmm0, eax 10577 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 10578 lea rcx, [rsi - 8] 10579 mov r9, rcx 10580 shr r9, 3 10581 add r9, 1 10582 test rcx, rcx 10583 je .LBB1_653 10584 # %bb.466: 10585 mov rcx, r9 10586 and rcx, -2 10587 neg rcx 10588 xor edi, edi 10589 .LBB1_467: # =>This Inner Loop Header: Depth=1 10590 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 10591 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 10592 paddd xmm1, xmm0 10593 paddd xmm2, xmm0 10594 movdqu xmmword ptr [r8 + 4*rdi], xmm1 10595 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 10596 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] 10597 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] 10598 paddd xmm1, xmm0 10599 paddd xmm2, xmm0 
10600 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 10601 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 10602 add rdi, 16 10603 add rcx, 2 10604 jne .LBB1_467 10605 jmp .LBB1_654 10606 .LBB1_468: 10607 mov esi, r10d 10608 and esi, -8 10609 movd xmm0, eax 10610 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 10611 lea rcx, [rsi - 8] 10612 mov r9, rcx 10613 shr r9, 3 10614 add r9, 1 10615 test rcx, rcx 10616 je .LBB1_661 10617 # %bb.469: 10618 mov rcx, r9 10619 and rcx, -2 10620 neg rcx 10621 xor edi, edi 10622 .LBB1_470: # =>This Inner Loop Header: Depth=1 10623 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 10624 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 10625 psubd xmm1, xmm0 10626 psubd xmm2, xmm0 10627 movdqu xmmword ptr [r8 + 4*rdi], xmm1 10628 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 10629 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] 10630 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] 10631 psubd xmm1, xmm0 10632 psubd xmm2, xmm0 10633 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 10634 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 10635 add rdi, 16 10636 add rcx, 2 10637 jne .LBB1_470 10638 jmp .LBB1_662 10639 .LBB1_471: 10640 mov ecx, eax 10641 and ecx, -4 10642 movddup xmm1, xmm0 # xmm1 = xmm0[0,0] 10643 lea rsi, [rcx - 4] 10644 mov r9, rsi 10645 shr r9, 2 10646 add r9, 1 10647 test rsi, rsi 10648 je .LBB1_669 10649 # %bb.472: 10650 mov rsi, r9 10651 and rsi, -2 10652 neg rsi 10653 xor edi, edi 10654 .LBB1_473: # =>This Inner Loop Header: Depth=1 10655 movupd xmm2, xmmword ptr [rdx + 8*rdi] 10656 movupd xmm3, xmmword ptr [rdx + 8*rdi + 16] 10657 mulpd xmm2, xmm1 10658 mulpd xmm3, xmm1 10659 movupd xmmword ptr [r8 + 8*rdi], xmm2 10660 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 10661 movupd xmm2, xmmword ptr [rdx + 8*rdi + 32] 10662 movupd xmm3, xmmword ptr [rdx + 8*rdi + 48] 10663 mulpd xmm2, xmm1 10664 mulpd xmm3, xmm1 10665 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 10666 movupd xmmword ptr [r8 + 8*rdi + 48], xmm3 10667 add rdi, 8 10668 add rsi, 2 10669 jne .LBB1_473 10670 jmp 
.LBB1_670 10671 .LBB1_474: 10672 mov ecx, eax 10673 and ecx, -4 10674 movddup xmm1, xmm0 # xmm1 = xmm0[0,0] 10675 lea rsi, [rcx - 4] 10676 mov r9, rsi 10677 shr r9, 2 10678 add r9, 1 10679 test rsi, rsi 10680 je .LBB1_677 10681 # %bb.475: 10682 mov rsi, r9 10683 and rsi, -2 10684 neg rsi 10685 xor edi, edi 10686 .LBB1_476: # =>This Inner Loop Header: Depth=1 10687 movupd xmm2, xmmword ptr [rdx + 8*rdi] 10688 movupd xmm3, xmmword ptr [rdx + 8*rdi + 16] 10689 mulpd xmm2, xmm1 10690 mulpd xmm3, xmm1 10691 movupd xmmword ptr [r8 + 8*rdi], xmm2 10692 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 10693 movupd xmm2, xmmword ptr [rdx + 8*rdi + 32] 10694 movupd xmm3, xmmword ptr [rdx + 8*rdi + 48] 10695 mulpd xmm2, xmm1 10696 mulpd xmm3, xmm1 10697 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 10698 movupd xmmword ptr [r8 + 8*rdi + 48], xmm3 10699 add rdi, 8 10700 add rsi, 2 10701 jne .LBB1_476 10702 jmp .LBB1_678 10703 .LBB1_477: 10704 mov ecx, eax 10705 and ecx, -4 10706 movddup xmm1, xmm0 # xmm1 = xmm0[0,0] 10707 lea rsi, [rcx - 4] 10708 mov r9, rsi 10709 shr r9, 2 10710 add r9, 1 10711 test rsi, rsi 10712 je .LBB1_685 10713 # %bb.478: 10714 mov rsi, r9 10715 and rsi, -2 10716 neg rsi 10717 xor edi, edi 10718 .LBB1_479: # =>This Inner Loop Header: Depth=1 10719 movupd xmm2, xmmword ptr [rdx + 8*rdi] 10720 movupd xmm3, xmmword ptr [rdx + 8*rdi + 16] 10721 addpd xmm2, xmm1 10722 addpd xmm3, xmm1 10723 movupd xmmword ptr [r8 + 8*rdi], xmm2 10724 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 10725 movupd xmm2, xmmword ptr [rdx + 8*rdi + 32] 10726 movupd xmm3, xmmword ptr [rdx + 8*rdi + 48] 10727 addpd xmm2, xmm1 10728 addpd xmm3, xmm1 10729 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 10730 movupd xmmword ptr [r8 + 8*rdi + 48], xmm3 10731 add rdi, 8 10732 add rsi, 2 10733 jne .LBB1_479 10734 jmp .LBB1_686 10735 .LBB1_480: 10736 mov ecx, eax 10737 and ecx, -4 10738 movddup xmm1, xmm0 # xmm1 = xmm0[0,0] 10739 lea rsi, [rcx - 4] 10740 mov r9, rsi 10741 shr r9, 2 10742 add r9, 1 10743 test 
rsi, rsi 10744 je .LBB1_693 10745 # %bb.481: 10746 mov rsi, r9 10747 and rsi, -2 10748 neg rsi 10749 xor edi, edi 10750 .LBB1_482: # =>This Inner Loop Header: Depth=1 10751 movupd xmm2, xmmword ptr [rdx + 8*rdi] 10752 movupd xmm3, xmmword ptr [rdx + 8*rdi + 16] 10753 subpd xmm2, xmm1 10754 subpd xmm3, xmm1 10755 movupd xmmword ptr [r8 + 8*rdi], xmm2 10756 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 10757 movupd xmm2, xmmword ptr [rdx + 8*rdi + 32] 10758 movupd xmm3, xmmword ptr [rdx + 8*rdi + 48] 10759 subpd xmm2, xmm1 10760 subpd xmm3, xmm1 10761 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 10762 movupd xmmword ptr [r8 + 8*rdi + 48], xmm3 10763 add rdi, 8 10764 add rsi, 2 10765 jne .LBB1_482 10766 jmp .LBB1_694 10767 .LBB1_483: 10768 mov ecx, eax 10769 and ecx, -4 10770 movddup xmm1, xmm0 # xmm1 = xmm0[0,0] 10771 lea rsi, [rcx - 4] 10772 mov r9, rsi 10773 shr r9, 2 10774 add r9, 1 10775 test rsi, rsi 10776 je .LBB1_701 10777 # %bb.484: 10778 mov rsi, r9 10779 and rsi, -2 10780 neg rsi 10781 xor edi, edi 10782 .LBB1_485: # =>This Inner Loop Header: Depth=1 10783 movupd xmm2, xmmword ptr [rdx + 8*rdi] 10784 movupd xmm3, xmmword ptr [rdx + 8*rdi + 16] 10785 addpd xmm2, xmm1 10786 addpd xmm3, xmm1 10787 movupd xmmword ptr [r8 + 8*rdi], xmm2 10788 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 10789 movupd xmm2, xmmword ptr [rdx + 8*rdi + 32] 10790 movupd xmm3, xmmword ptr [rdx + 8*rdi + 48] 10791 addpd xmm2, xmm1 10792 addpd xmm3, xmm1 10793 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 10794 movupd xmmword ptr [r8 + 8*rdi + 48], xmm3 10795 add rdi, 8 10796 add rsi, 2 10797 jne .LBB1_485 10798 jmp .LBB1_702 10799 .LBB1_486: 10800 mov ecx, eax 10801 and ecx, -4 10802 movddup xmm1, xmm0 # xmm1 = xmm0[0,0] 10803 lea rsi, [rcx - 4] 10804 mov r9, rsi 10805 shr r9, 2 10806 add r9, 1 10807 test rsi, rsi 10808 je .LBB1_709 10809 # %bb.487: 10810 mov rsi, r9 10811 and rsi, -2 10812 neg rsi 10813 xor edi, edi 10814 .LBB1_488: # =>This Inner Loop Header: Depth=1 10815 movupd xmm2, xmmword 
ptr [rdx + 8*rdi] 10816 movupd xmm3, xmmword ptr [rdx + 8*rdi + 16] 10817 subpd xmm2, xmm1 10818 subpd xmm3, xmm1 10819 movupd xmmword ptr [r8 + 8*rdi], xmm2 10820 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 10821 movupd xmm2, xmmword ptr [rdx + 8*rdi + 32] 10822 movupd xmm3, xmmword ptr [rdx + 8*rdi + 48] 10823 subpd xmm2, xmm1 10824 subpd xmm3, xmm1 10825 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 10826 movupd xmmword ptr [r8 + 8*rdi + 48], xmm3 10827 add rdi, 8 10828 add rsi, 2 10829 jne .LBB1_488 10830 jmp .LBB1_710 10831 .LBB1_489: 10832 mov edi, r10d 10833 and edi, -32 10834 movzx eax, cl 10835 movd xmm0, eax 10836 pxor xmm1, xmm1 10837 pshufb xmm0, xmm1 10838 lea rax, [rdi - 32] 10839 mov r9, rax 10840 shr r9, 5 10841 add r9, 1 10842 pmovzxbw xmm1, xmm0 # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 10843 test rax, rax 10844 je .LBB1_717 10845 # %bb.490: 10846 mov rsi, r9 10847 and rsi, -2 10848 neg rsi 10849 xor eax, eax 10850 movdqa xmm2, xmm0 10851 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 10852 movdqa xmm3, xmmword ptr [rip + .LCPI1_0] # xmm3 = [255,255,255,255,255,255,255,255] 10853 movdqa xmm4, xmm0 10854 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 10855 .LBB1_491: # =>This Inner Loop Header: Depth=1 10856 movdqu xmm5, xmmword ptr [rdx + rax] 10857 movdqu xmm6, xmmword ptr [rdx + rax + 16] 10858 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 10859 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 10860 pmullw xmm5, xmm2 10861 pand xmm5, xmm3 10862 pmullw xmm7, xmm1 10863 pand xmm7, xmm3 10864 packuswb xmm7, xmm5 10865 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 10866 punpckhbw xmm6, xmm6 # xmm6 = 
xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 10867 pmullw xmm6, xmm4 10868 pand xmm6, xmm3 10869 pmullw xmm5, xmm1 10870 pand xmm5, xmm3 10871 packuswb xmm5, xmm6 10872 movdqu xmmword ptr [r8 + rax], xmm7 10873 movdqu xmmword ptr [r8 + rax + 16], xmm5 10874 movdqu xmm5, xmmword ptr [rdx + rax + 32] 10875 movdqu xmm6, xmmword ptr [rdx + rax + 48] 10876 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 10877 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 10878 pmullw xmm5, xmm2 10879 pand xmm5, xmm3 10880 pmullw xmm7, xmm1 10881 pand xmm7, xmm3 10882 packuswb xmm7, xmm5 10883 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 10884 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 10885 pmullw xmm6, xmm4 10886 pand xmm6, xmm3 10887 pmullw xmm5, xmm1 10888 pand xmm5, xmm3 10889 packuswb xmm5, xmm6 10890 movdqu xmmword ptr [r8 + rax + 32], xmm7 10891 movdqu xmmword ptr [r8 + rax + 48], xmm5 10892 add rax, 64 10893 add rsi, 2 10894 jne .LBB1_491 10895 jmp .LBB1_718 10896 .LBB1_492: 10897 mov edi, r10d 10898 and edi, -32 10899 movzx eax, cl 10900 movd xmm0, eax 10901 pxor xmm1, xmm1 10902 pshufb xmm0, xmm1 10903 lea rax, [rdi - 32] 10904 mov r9, rax 10905 shr r9, 5 10906 add r9, 1 10907 pmovzxbw xmm1, xmm0 # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 10908 test rax, rax 10909 je .LBB1_725 10910 # %bb.493: 10911 mov rsi, r9 10912 and rsi, -2 10913 neg rsi 10914 xor eax, eax 10915 movdqa xmm2, xmm0 10916 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 10917 movdqa xmm3, xmmword ptr [rip + .LCPI1_0] # xmm3 = [255,255,255,255,255,255,255,255] 10918 movdqa xmm4, xmm0 10919 punpckhbw xmm4, xmm4 # xmm4 = 
xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 10920 .LBB1_494: # =>This Inner Loop Header: Depth=1 10921 movdqu xmm5, xmmword ptr [rdx + rax] 10922 movdqu xmm6, xmmword ptr [rdx + rax + 16] 10923 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 10924 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 10925 pmullw xmm5, xmm2 10926 pand xmm5, xmm3 10927 pmullw xmm7, xmm1 10928 pand xmm7, xmm3 10929 packuswb xmm7, xmm5 10930 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 10931 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 10932 pmullw xmm6, xmm4 10933 pand xmm6, xmm3 10934 pmullw xmm5, xmm1 10935 pand xmm5, xmm3 10936 packuswb xmm5, xmm6 10937 movdqu xmmword ptr [r8 + rax], xmm7 10938 movdqu xmmword ptr [r8 + rax + 16], xmm5 10939 movdqu xmm5, xmmword ptr [rdx + rax + 32] 10940 movdqu xmm6, xmmword ptr [rdx + rax + 48] 10941 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 10942 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 10943 pmullw xmm5, xmm2 10944 pand xmm5, xmm3 10945 pmullw xmm7, xmm1 10946 pand xmm7, xmm3 10947 packuswb xmm7, xmm5 10948 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 10949 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 10950 pmullw xmm6, xmm4 10951 pand xmm6, xmm3 10952 pmullw xmm5, xmm1 10953 pand xmm5, xmm3 10954 packuswb xmm5, xmm6 10955 movdqu xmmword ptr [r8 + rax + 32], xmm7 10956 movdqu xmmword ptr [r8 + rax + 48], xmm5 10957 add rax, 64 10958 add rsi, 2 10959 jne .LBB1_494 10960 jmp .LBB1_726 10961 .LBB1_495: 10962 mov esi, r10d 10963 and esi, -32 10964 movzx ecx, al 
10965 movd xmm0, ecx 10966 pxor xmm1, xmm1 10967 pshufb xmm0, xmm1 10968 lea rcx, [rsi - 32] 10969 mov r9, rcx 10970 shr r9, 5 10971 add r9, 1 10972 test rcx, rcx 10973 je .LBB1_733 10974 # %bb.496: 10975 mov rcx, r9 10976 and rcx, -2 10977 neg rcx 10978 xor edi, edi 10979 .LBB1_497: # =>This Inner Loop Header: Depth=1 10980 movdqu xmm1, xmmword ptr [rdx + rdi] 10981 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 10982 paddb xmm1, xmm0 10983 paddb xmm2, xmm0 10984 movdqu xmmword ptr [r8 + rdi], xmm1 10985 movdqu xmmword ptr [r8 + rdi + 16], xmm2 10986 movdqu xmm1, xmmword ptr [rdx + rdi + 32] 10987 movdqu xmm2, xmmword ptr [rdx + rdi + 48] 10988 paddb xmm1, xmm0 10989 paddb xmm2, xmm0 10990 movdqu xmmword ptr [r8 + rdi + 32], xmm1 10991 movdqu xmmword ptr [r8 + rdi + 48], xmm2 10992 add rdi, 64 10993 add rcx, 2 10994 jne .LBB1_497 10995 jmp .LBB1_734 10996 .LBB1_498: 10997 mov esi, r10d 10998 and esi, -32 10999 movzx ecx, al 11000 movd xmm0, ecx 11001 pxor xmm1, xmm1 11002 pshufb xmm0, xmm1 11003 lea rcx, [rsi - 32] 11004 mov r9, rcx 11005 shr r9, 5 11006 add r9, 1 11007 test rcx, rcx 11008 je .LBB1_741 11009 # %bb.499: 11010 mov rcx, r9 11011 and rcx, -2 11012 neg rcx 11013 xor edi, edi 11014 .LBB1_500: # =>This Inner Loop Header: Depth=1 11015 movdqu xmm1, xmmword ptr [rdx + rdi] 11016 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 11017 psubb xmm1, xmm0 11018 psubb xmm2, xmm0 11019 movdqu xmmword ptr [r8 + rdi], xmm1 11020 movdqu xmmword ptr [r8 + rdi + 16], xmm2 11021 movdqu xmm1, xmmword ptr [rdx + rdi + 32] 11022 movdqu xmm2, xmmword ptr [rdx + rdi + 48] 11023 psubb xmm1, xmm0 11024 psubb xmm2, xmm0 11025 movdqu xmmword ptr [r8 + rdi + 32], xmm1 11026 movdqu xmmword ptr [r8 + rdi + 48], xmm2 11027 add rdi, 64 11028 add rcx, 2 11029 jne .LBB1_500 11030 jmp .LBB1_742 11031 .LBB1_501: 11032 mov esi, r10d 11033 and esi, -32 11034 movzx ecx, al 11035 movd xmm0, ecx 11036 pxor xmm1, xmm1 11037 pshufb xmm0, xmm1 11038 lea rcx, [rsi - 32] 11039 mov r9, rcx 11040 shr r9, 5 
11041 add r9, 1 11042 test rcx, rcx 11043 je .LBB1_749 11044 # %bb.502: 11045 mov rcx, r9 11046 and rcx, -2 11047 neg rcx 11048 xor edi, edi 11049 .LBB1_503: # =>This Inner Loop Header: Depth=1 11050 movdqu xmm1, xmmword ptr [rdx + rdi] 11051 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 11052 paddb xmm1, xmm0 11053 paddb xmm2, xmm0 11054 movdqu xmmword ptr [r8 + rdi], xmm1 11055 movdqu xmmword ptr [r8 + rdi + 16], xmm2 11056 movdqu xmm1, xmmword ptr [rdx + rdi + 32] 11057 movdqu xmm2, xmmword ptr [rdx + rdi + 48] 11058 paddb xmm1, xmm0 11059 paddb xmm2, xmm0 11060 movdqu xmmword ptr [r8 + rdi + 32], xmm1 11061 movdqu xmmword ptr [r8 + rdi + 48], xmm2 11062 add rdi, 64 11063 add rcx, 2 11064 jne .LBB1_503 11065 jmp .LBB1_750 11066 .LBB1_504: 11067 mov esi, r10d 11068 and esi, -32 11069 movzx ecx, al 11070 movd xmm0, ecx 11071 pxor xmm1, xmm1 11072 pshufb xmm0, xmm1 11073 lea rcx, [rsi - 32] 11074 mov r9, rcx 11075 shr r9, 5 11076 add r9, 1 11077 test rcx, rcx 11078 je .LBB1_757 11079 # %bb.505: 11080 mov rcx, r9 11081 and rcx, -2 11082 neg rcx 11083 xor edi, edi 11084 .LBB1_506: # =>This Inner Loop Header: Depth=1 11085 movdqu xmm1, xmmword ptr [rdx + rdi] 11086 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 11087 psubb xmm1, xmm0 11088 psubb xmm2, xmm0 11089 movdqu xmmword ptr [r8 + rdi], xmm1 11090 movdqu xmmword ptr [r8 + rdi + 16], xmm2 11091 movdqu xmm1, xmmword ptr [rdx + rdi + 32] 11092 movdqu xmm2, xmmword ptr [rdx + rdi + 48] 11093 psubb xmm1, xmm0 11094 psubb xmm2, xmm0 11095 movdqu xmmword ptr [r8 + rdi + 32], xmm1 11096 movdqu xmmword ptr [r8 + rdi + 48], xmm2 11097 add rdi, 64 11098 add rcx, 2 11099 jne .LBB1_506 11100 jmp .LBB1_758 11101 .LBB1_507: 11102 mov esi, r10d 11103 and esi, -4 11104 movq xmm0, rax 11105 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 11106 lea rcx, [rsi - 4] 11107 mov r9, rcx 11108 shr r9, 2 11109 add r9, 1 11110 test rcx, rcx 11111 je .LBB1_765 11112 # %bb.508: 11113 mov rcx, r9 11114 and rcx, -2 11115 neg rcx 11116 xor edi, edi 11117 
.LBB1_509: # =>This Inner Loop Header: Depth=1 11118 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 11119 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 11120 paddq xmm1, xmm0 11121 paddq xmm2, xmm0 11122 movdqu xmmword ptr [r8 + 8*rdi], xmm1 11123 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 11124 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] 11125 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] 11126 paddq xmm1, xmm0 11127 paddq xmm2, xmm0 11128 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1 11129 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2 11130 add rdi, 8 11131 add rcx, 2 11132 jne .LBB1_509 11133 jmp .LBB1_766 11134 .LBB1_510: 11135 mov esi, r10d 11136 and esi, -4 11137 movq xmm0, rax 11138 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 11139 lea rcx, [rsi - 4] 11140 mov r9, rcx 11141 shr r9, 2 11142 add r9, 1 11143 test rcx, rcx 11144 je .LBB1_773 11145 # %bb.511: 11146 mov rcx, r9 11147 and rcx, -2 11148 neg rcx 11149 xor edi, edi 11150 .LBB1_512: # =>This Inner Loop Header: Depth=1 11151 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 11152 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 11153 psubq xmm1, xmm0 11154 psubq xmm2, xmm0 11155 movdqu xmmword ptr [r8 + 8*rdi], xmm1 11156 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 11157 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] 11158 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] 11159 psubq xmm1, xmm0 11160 psubq xmm2, xmm0 11161 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1 11162 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2 11163 add rdi, 8 11164 add rcx, 2 11165 jne .LBB1_512 11166 jmp .LBB1_774 11167 .LBB1_513: 11168 mov esi, r10d 11169 and esi, -4 11170 movq xmm0, rax 11171 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 11172 lea rcx, [rsi - 4] 11173 mov r9, rcx 11174 shr r9, 2 11175 add r9, 1 11176 test rcx, rcx 11177 je .LBB1_781 11178 # %bb.514: 11179 mov rcx, r9 11180 and rcx, -2 11181 neg rcx 11182 xor edi, edi 11183 .LBB1_515: # =>This Inner Loop Header: Depth=1 11184 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 11185 movdqu xmm2, xmmword ptr [rdx + 8*rdi 
+ 16] 11186 paddq xmm1, xmm0 11187 paddq xmm2, xmm0 11188 movdqu xmmword ptr [r8 + 8*rdi], xmm1 11189 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 11190 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] 11191 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] 11192 paddq xmm1, xmm0 11193 paddq xmm2, xmm0 11194 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1 11195 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2 11196 add rdi, 8 11197 add rcx, 2 11198 jne .LBB1_515 11199 jmp .LBB1_782 11200 .LBB1_516: 11201 mov esi, r10d 11202 and esi, -4 11203 movq xmm0, rax 11204 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 11205 lea rcx, [rsi - 4] 11206 mov r9, rcx 11207 shr r9, 2 11208 add r9, 1 11209 test rcx, rcx 11210 je .LBB1_789 11211 # %bb.517: 11212 mov rcx, r9 11213 and rcx, -2 11214 neg rcx 11215 xor edi, edi 11216 .LBB1_518: # =>This Inner Loop Header: Depth=1 11217 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 11218 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 11219 psubq xmm1, xmm0 11220 psubq xmm2, xmm0 11221 movdqu xmmword ptr [r8 + 8*rdi], xmm1 11222 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 11223 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] 11224 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] 11225 psubq xmm1, xmm0 11226 psubq xmm2, xmm0 11227 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1 11228 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2 11229 add rdi, 8 11230 add rcx, 2 11231 jne .LBB1_518 11232 jmp .LBB1_790 11233 .LBB1_519: 11234 mov esi, r10d 11235 and esi, -16 11236 movd xmm0, eax 11237 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 11238 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 11239 lea rcx, [rsi - 16] 11240 mov r9, rcx 11241 shr r9, 4 11242 add r9, 1 11243 test rcx, rcx 11244 je .LBB1_797 11245 # %bb.520: 11246 mov rcx, r9 11247 and rcx, -2 11248 neg rcx 11249 xor edi, edi 11250 .LBB1_521: # =>This Inner Loop Header: Depth=1 11251 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 11252 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 11253 pmullw xmm1, xmm0 11254 pmullw xmm2, xmm0 11255 movdqu 
xmmword ptr [r8 + 2*rdi], xmm1 11256 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 11257 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] 11258 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] 11259 pmullw xmm1, xmm0 11260 pmullw xmm2, xmm0 11261 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 11262 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 11263 add rdi, 32 11264 add rcx, 2 11265 jne .LBB1_521 11266 jmp .LBB1_798 11267 .LBB1_522: 11268 mov esi, r10d 11269 and esi, -16 11270 movd xmm0, eax 11271 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 11272 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 11273 lea rcx, [rsi - 16] 11274 mov r9, rcx 11275 shr r9, 4 11276 add r9, 1 11277 test rcx, rcx 11278 je .LBB1_805 11279 # %bb.523: 11280 mov rcx, r9 11281 and rcx, -2 11282 neg rcx 11283 xor edi, edi 11284 .LBB1_524: # =>This Inner Loop Header: Depth=1 11285 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 11286 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 11287 pmullw xmm1, xmm0 11288 pmullw xmm2, xmm0 11289 movdqu xmmword ptr [r8 + 2*rdi], xmm1 11290 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 11291 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] 11292 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] 11293 pmullw xmm1, xmm0 11294 pmullw xmm2, xmm0 11295 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 11296 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 11297 add rdi, 32 11298 add rcx, 2 11299 jne .LBB1_524 11300 jmp .LBB1_806 11301 .LBB1_525: 11302 mov esi, r10d 11303 and esi, -16 11304 movd xmm0, eax 11305 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 11306 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 11307 lea rcx, [rsi - 16] 11308 mov r9, rcx 11309 shr r9, 4 11310 add r9, 1 11311 test rcx, rcx 11312 je .LBB1_813 11313 # %bb.526: 11314 mov rcx, r9 11315 and rcx, -2 11316 neg rcx 11317 xor edi, edi 11318 .LBB1_527: # =>This Inner Loop Header: Depth=1 11319 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 11320 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 11321 pmullw xmm1, xmm0 11322 pmullw xmm2, xmm0 11323 movdqu 
xmmword ptr [r8 + 2*rdi], xmm1 11324 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 11325 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] 11326 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] 11327 pmullw xmm1, xmm0 11328 pmullw xmm2, xmm0 11329 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 11330 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 11331 add rdi, 32 11332 add rcx, 2 11333 jne .LBB1_527 11334 jmp .LBB1_814 11335 .LBB1_528: 11336 mov esi, r10d 11337 and esi, -16 11338 movd xmm0, eax 11339 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 11340 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 11341 lea rcx, [rsi - 16] 11342 mov r9, rcx 11343 shr r9, 4 11344 add r9, 1 11345 test rcx, rcx 11346 je .LBB1_821 11347 # %bb.529: 11348 mov rcx, r9 11349 and rcx, -2 11350 neg rcx 11351 xor edi, edi 11352 .LBB1_530: # =>This Inner Loop Header: Depth=1 11353 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 11354 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 11355 pmullw xmm1, xmm0 11356 pmullw xmm2, xmm0 11357 movdqu xmmword ptr [r8 + 2*rdi], xmm1 11358 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 11359 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] 11360 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] 11361 pmullw xmm1, xmm0 11362 pmullw xmm2, xmm0 11363 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 11364 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 11365 add rdi, 32 11366 add rcx, 2 11367 jne .LBB1_530 11368 jmp .LBB1_822 11369 .LBB1_531: 11370 mov esi, r10d 11371 and esi, -16 11372 movd xmm0, eax 11373 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 11374 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 11375 lea rcx, [rsi - 16] 11376 mov r9, rcx 11377 shr r9, 4 11378 add r9, 1 11379 test rcx, rcx 11380 je .LBB1_829 11381 # %bb.532: 11382 mov rcx, r9 11383 and rcx, -2 11384 neg rcx 11385 xor edi, edi 11386 .LBB1_533: # =>This Inner Loop Header: Depth=1 11387 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 11388 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 11389 paddw xmm1, xmm0 11390 paddw xmm2, xmm0 11391 movdqu 
xmmword ptr [r8 + 2*rdi], xmm1 11392 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 11393 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] 11394 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] 11395 paddw xmm1, xmm0 11396 paddw xmm2, xmm0 11397 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 11398 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 11399 add rdi, 32 11400 add rcx, 2 11401 jne .LBB1_533 11402 jmp .LBB1_830 11403 .LBB1_534: 11404 mov esi, r10d 11405 and esi, -16 11406 movd xmm0, eax 11407 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 11408 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 11409 lea rcx, [rsi - 16] 11410 mov r9, rcx 11411 shr r9, 4 11412 add r9, 1 11413 test rcx, rcx 11414 je .LBB1_837 11415 # %bb.535: 11416 mov rcx, r9 11417 and rcx, -2 11418 neg rcx 11419 xor edi, edi 11420 .LBB1_536: # =>This Inner Loop Header: Depth=1 11421 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 11422 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 11423 paddw xmm1, xmm0 11424 paddw xmm2, xmm0 11425 movdqu xmmword ptr [r8 + 2*rdi], xmm1 11426 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 11427 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] 11428 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] 11429 paddw xmm1, xmm0 11430 paddw xmm2, xmm0 11431 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 11432 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 11433 add rdi, 32 11434 add rcx, 2 11435 jne .LBB1_536 11436 jmp .LBB1_838 11437 .LBB1_537: 11438 mov esi, r10d 11439 and esi, -16 11440 movd xmm0, eax 11441 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 11442 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 11443 lea rcx, [rsi - 16] 11444 mov r9, rcx 11445 shr r9, 4 11446 add r9, 1 11447 test rcx, rcx 11448 je .LBB1_845 11449 # %bb.538: 11450 mov rcx, r9 11451 and rcx, -2 11452 neg rcx 11453 xor edi, edi 11454 .LBB1_539: # =>This Inner Loop Header: Depth=1 11455 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 11456 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 11457 psubw xmm1, xmm0 11458 psubw xmm2, xmm0 11459 movdqu xmmword 
ptr [r8 + 2*rdi], xmm1 11460 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 11461 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] 11462 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] 11463 psubw xmm1, xmm0 11464 psubw xmm2, xmm0 11465 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 11466 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 11467 add rdi, 32 11468 add rcx, 2 11469 jne .LBB1_539 11470 jmp .LBB1_846 11471 .LBB1_540: 11472 mov esi, r10d 11473 and esi, -16 11474 movd xmm0, eax 11475 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 11476 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 11477 lea rcx, [rsi - 16] 11478 mov r9, rcx 11479 shr r9, 4 11480 add r9, 1 11481 test rcx, rcx 11482 je .LBB1_853 11483 # %bb.541: 11484 mov rcx, r9 11485 and rcx, -2 11486 neg rcx 11487 xor edi, edi 11488 .LBB1_542: # =>This Inner Loop Header: Depth=1 11489 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 11490 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 11491 psubw xmm1, xmm0 11492 psubw xmm2, xmm0 11493 movdqu xmmword ptr [r8 + 2*rdi], xmm1 11494 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 11495 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] 11496 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] 11497 psubw xmm1, xmm0 11498 psubw xmm2, xmm0 11499 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 11500 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 11501 add rdi, 32 11502 add rcx, 2 11503 jne .LBB1_542 11504 jmp .LBB1_854 11505 .LBB1_543: 11506 mov esi, r10d 11507 and esi, -16 11508 movd xmm0, eax 11509 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 11510 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 11511 lea rcx, [rsi - 16] 11512 mov r9, rcx 11513 shr r9, 4 11514 add r9, 1 11515 test rcx, rcx 11516 je .LBB1_861 11517 # %bb.544: 11518 mov rcx, r9 11519 and rcx, -2 11520 neg rcx 11521 xor edi, edi 11522 .LBB1_545: # =>This Inner Loop Header: Depth=1 11523 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 11524 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 11525 paddw xmm1, xmm0 11526 paddw xmm2, xmm0 11527 movdqu xmmword ptr [r8 + 
2*rdi], xmm1 11528 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 11529 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] 11530 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] 11531 paddw xmm1, xmm0 11532 paddw xmm2, xmm0 11533 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 11534 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 11535 add rdi, 32 11536 add rcx, 2 11537 jne .LBB1_545 11538 jmp .LBB1_862 11539 .LBB1_546: 11540 mov esi, r10d 11541 and esi, -16 11542 movd xmm0, eax 11543 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 11544 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 11545 lea rcx, [rsi - 16] 11546 mov r9, rcx 11547 shr r9, 4 11548 add r9, 1 11549 test rcx, rcx 11550 je .LBB1_869 11551 # %bb.547: 11552 mov rcx, r9 11553 and rcx, -2 11554 neg rcx 11555 xor edi, edi 11556 .LBB1_548: # =>This Inner Loop Header: Depth=1 11557 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 11558 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 11559 paddw xmm1, xmm0 11560 paddw xmm2, xmm0 11561 movdqu xmmword ptr [r8 + 2*rdi], xmm1 11562 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 11563 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] 11564 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] 11565 paddw xmm1, xmm0 11566 paddw xmm2, xmm0 11567 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 11568 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 11569 add rdi, 32 11570 add rcx, 2 11571 jne .LBB1_548 11572 jmp .LBB1_870 11573 .LBB1_549: 11574 mov esi, r10d 11575 and esi, -16 11576 movd xmm0, eax 11577 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 11578 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 11579 lea rcx, [rsi - 16] 11580 mov r9, rcx 11581 shr r9, 4 11582 add r9, 1 11583 test rcx, rcx 11584 je .LBB1_877 11585 # %bb.550: 11586 mov rcx, r9 11587 and rcx, -2 11588 neg rcx 11589 xor edi, edi 11590 .LBB1_551: # =>This Inner Loop Header: Depth=1 11591 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 11592 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 11593 psubw xmm1, xmm0 11594 psubw xmm2, xmm0 11595 movdqu xmmword ptr [r8 + 2*rdi], 
xmm1 11596 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 11597 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] 11598 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] 11599 psubw xmm1, xmm0 11600 psubw xmm2, xmm0 11601 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 11602 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 11603 add rdi, 32 11604 add rcx, 2 11605 jne .LBB1_551 11606 jmp .LBB1_878 11607 .LBB1_552: 11608 mov esi, r10d 11609 and esi, -16 11610 movd xmm0, eax 11611 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 11612 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 11613 lea rcx, [rsi - 16] 11614 mov r9, rcx 11615 shr r9, 4 11616 add r9, 1 11617 test rcx, rcx 11618 je .LBB1_885 11619 # %bb.553: 11620 mov rcx, r9 11621 and rcx, -2 11622 neg rcx 11623 xor edi, edi 11624 .LBB1_554: # =>This Inner Loop Header: Depth=1 11625 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 11626 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 11627 psubw xmm1, xmm0 11628 psubw xmm2, xmm0 11629 movdqu xmmword ptr [r8 + 2*rdi], xmm1 11630 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 11631 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] 11632 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] 11633 psubw xmm1, xmm0 11634 psubw xmm2, xmm0 11635 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 11636 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 11637 add rdi, 32 11638 add rcx, 2 11639 jne .LBB1_554 11640 jmp .LBB1_886 11641 .LBB1_555: 11642 mov ecx, eax 11643 and ecx, -8 11644 movaps xmm1, xmm0 11645 shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0] 11646 lea rsi, [rcx - 8] 11647 mov r9, rsi 11648 shr r9, 3 11649 add r9, 1 11650 test rsi, rsi 11651 je .LBB1_893 11652 # %bb.556: 11653 mov rsi, r9 11654 and rsi, -2 11655 neg rsi 11656 xor edi, edi 11657 .LBB1_557: # =>This Inner Loop Header: Depth=1 11658 movups xmm2, xmmword ptr [rdx + 4*rdi] 11659 movups xmm3, xmmword ptr [rdx + 4*rdi + 16] 11660 mulps xmm2, xmm1 11661 mulps xmm3, xmm1 11662 movups xmmword ptr [r8 + 4*rdi], xmm2 11663 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 11664 
movups xmm2, xmmword ptr [rdx + 4*rdi + 32] 11665 movups xmm3, xmmword ptr [rdx + 4*rdi + 48] 11666 mulps xmm2, xmm1 11667 mulps xmm3, xmm1 11668 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 11669 movups xmmword ptr [r8 + 4*rdi + 48], xmm3 11670 add rdi, 16 11671 add rsi, 2 11672 jne .LBB1_557 11673 jmp .LBB1_894 11674 .LBB1_558: 11675 mov ecx, eax 11676 and ecx, -8 11677 movaps xmm1, xmm0 11678 shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0] 11679 lea rsi, [rcx - 8] 11680 mov r9, rsi 11681 shr r9, 3 11682 add r9, 1 11683 test rsi, rsi 11684 je .LBB1_901 11685 # %bb.559: 11686 mov rsi, r9 11687 and rsi, -2 11688 neg rsi 11689 xor edi, edi 11690 .LBB1_560: # =>This Inner Loop Header: Depth=1 11691 movups xmm2, xmmword ptr [rdx + 4*rdi] 11692 movups xmm3, xmmword ptr [rdx + 4*rdi + 16] 11693 mulps xmm2, xmm1 11694 mulps xmm3, xmm1 11695 movups xmmword ptr [r8 + 4*rdi], xmm2 11696 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 11697 movups xmm2, xmmword ptr [rdx + 4*rdi + 32] 11698 movups xmm3, xmmword ptr [rdx + 4*rdi + 48] 11699 mulps xmm2, xmm1 11700 mulps xmm3, xmm1 11701 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 11702 movups xmmword ptr [r8 + 4*rdi + 48], xmm3 11703 add rdi, 16 11704 add rsi, 2 11705 jne .LBB1_560 11706 jmp .LBB1_902 11707 .LBB1_561: 11708 mov esi, r10d 11709 and esi, -4 11710 movq xmm0, rax 11711 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 11712 lea rcx, [rsi - 4] 11713 mov r9, rcx 11714 shr r9, 2 11715 add r9, 1 11716 test rcx, rcx 11717 je .LBB1_909 11718 # %bb.562: 11719 mov rcx, r9 11720 and rcx, -2 11721 neg rcx 11722 xor edi, edi 11723 .LBB1_563: # =>This Inner Loop Header: Depth=1 11724 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 11725 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 11726 paddq xmm1, xmm0 11727 paddq xmm2, xmm0 11728 movdqu xmmword ptr [r8 + 8*rdi], xmm1 11729 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 11730 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] 11731 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] 11732 paddq xmm1, xmm0 11733 
paddq xmm2, xmm0 11734 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1 11735 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2 11736 add rdi, 8 11737 add rcx, 2 11738 jne .LBB1_563 11739 jmp .LBB1_910 11740 .LBB1_564: 11741 mov ecx, eax 11742 and ecx, -8 11743 movaps xmm1, xmm0 11744 shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0] 11745 lea rsi, [rcx - 8] 11746 mov r9, rsi 11747 shr r9, 3 11748 add r9, 1 11749 test rsi, rsi 11750 je .LBB1_917 11751 # %bb.565: 11752 mov rsi, r9 11753 and rsi, -2 11754 neg rsi 11755 xor edi, edi 11756 .LBB1_566: # =>This Inner Loop Header: Depth=1 11757 movups xmm2, xmmword ptr [rdx + 4*rdi] 11758 movups xmm3, xmmword ptr [rdx + 4*rdi + 16] 11759 addps xmm2, xmm1 11760 addps xmm3, xmm1 11761 movups xmmword ptr [r8 + 4*rdi], xmm2 11762 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 11763 movups xmm2, xmmword ptr [rdx + 4*rdi + 32] 11764 movups xmm3, xmmword ptr [rdx + 4*rdi + 48] 11765 addps xmm2, xmm1 11766 addps xmm3, xmm1 11767 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 11768 movups xmmword ptr [r8 + 4*rdi + 48], xmm3 11769 add rdi, 16 11770 add rsi, 2 11771 jne .LBB1_566 11772 jmp .LBB1_918 11773 .LBB1_567: 11774 mov esi, r10d 11775 and esi, -4 11776 movq xmm0, rax 11777 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 11778 lea rcx, [rsi - 4] 11779 mov r9, rcx 11780 shr r9, 2 11781 add r9, 1 11782 test rcx, rcx 11783 je .LBB1_925 11784 # %bb.568: 11785 mov rcx, r9 11786 and rcx, -2 11787 neg rcx 11788 xor edi, edi 11789 .LBB1_569: # =>This Inner Loop Header: Depth=1 11790 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 11791 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 11792 psubq xmm1, xmm0 11793 psubq xmm2, xmm0 11794 movdqu xmmword ptr [r8 + 8*rdi], xmm1 11795 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 11796 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] 11797 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] 11798 psubq xmm1, xmm0 11799 psubq xmm2, xmm0 11800 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1 11801 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2 11802 add 
rdi, 8 11803 add rcx, 2 11804 jne .LBB1_569 11805 jmp .LBB1_926 11806 .LBB1_570: 11807 mov ecx, eax 11808 and ecx, -8 11809 movaps xmm1, xmm0 11810 shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0] 11811 lea rsi, [rcx - 8] 11812 mov r9, rsi 11813 shr r9, 3 11814 add r9, 1 11815 test rsi, rsi 11816 je .LBB1_933 11817 # %bb.571: 11818 mov rsi, r9 11819 and rsi, -2 11820 neg rsi 11821 xor edi, edi 11822 .LBB1_572: # =>This Inner Loop Header: Depth=1 11823 movups xmm2, xmmword ptr [rdx + 4*rdi] 11824 movups xmm3, xmmword ptr [rdx + 4*rdi + 16] 11825 subps xmm2, xmm1 11826 subps xmm3, xmm1 11827 movups xmmword ptr [r8 + 4*rdi], xmm2 11828 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 11829 movups xmm2, xmmword ptr [rdx + 4*rdi + 32] 11830 movups xmm3, xmmword ptr [rdx + 4*rdi + 48] 11831 subps xmm2, xmm1 11832 subps xmm3, xmm1 11833 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 11834 movups xmmword ptr [r8 + 4*rdi + 48], xmm3 11835 add rdi, 16 11836 add rsi, 2 11837 jne .LBB1_572 11838 jmp .LBB1_934 11839 .LBB1_573: 11840 mov esi, r10d 11841 and esi, -4 11842 movq xmm0, rax 11843 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 11844 lea rcx, [rsi - 4] 11845 mov r9, rcx 11846 shr r9, 2 11847 add r9, 1 11848 test rcx, rcx 11849 je .LBB1_941 11850 # %bb.574: 11851 mov rcx, r9 11852 and rcx, -2 11853 neg rcx 11854 xor edi, edi 11855 .LBB1_575: # =>This Inner Loop Header: Depth=1 11856 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 11857 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 11858 paddq xmm1, xmm0 11859 paddq xmm2, xmm0 11860 movdqu xmmword ptr [r8 + 8*rdi], xmm1 11861 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 11862 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] 11863 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] 11864 paddq xmm1, xmm0 11865 paddq xmm2, xmm0 11866 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1 11867 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2 11868 add rdi, 8 11869 add rcx, 2 11870 jne .LBB1_575 11871 jmp .LBB1_942 11872 .LBB1_576: 11873 mov ecx, eax 11874 and ecx, -8 11875 
movaps xmm1, xmm0 11876 shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0] 11877 lea rsi, [rcx - 8] 11878 mov r9, rsi 11879 shr r9, 3 11880 add r9, 1 11881 test rsi, rsi 11882 je .LBB1_949 11883 # %bb.577: 11884 mov rsi, r9 11885 and rsi, -2 11886 neg rsi 11887 xor edi, edi 11888 .LBB1_578: # =>This Inner Loop Header: Depth=1 11889 movups xmm2, xmmword ptr [rdx + 4*rdi] 11890 movups xmm3, xmmword ptr [rdx + 4*rdi + 16] 11891 addps xmm2, xmm1 11892 addps xmm3, xmm1 11893 movups xmmword ptr [r8 + 4*rdi], xmm2 11894 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 11895 movups xmm2, xmmword ptr [rdx + 4*rdi + 32] 11896 movups xmm3, xmmword ptr [rdx + 4*rdi + 48] 11897 addps xmm2, xmm1 11898 addps xmm3, xmm1 11899 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 11900 movups xmmword ptr [r8 + 4*rdi + 48], xmm3 11901 add rdi, 16 11902 add rsi, 2 11903 jne .LBB1_578 11904 jmp .LBB1_950 11905 .LBB1_579: 11906 mov esi, r10d 11907 and esi, -4 11908 movq xmm0, rax 11909 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 11910 lea rcx, [rsi - 4] 11911 mov r9, rcx 11912 shr r9, 2 11913 add r9, 1 11914 test rcx, rcx 11915 je .LBB1_957 11916 # %bb.580: 11917 mov rcx, r9 11918 and rcx, -2 11919 neg rcx 11920 xor edi, edi 11921 .LBB1_581: # =>This Inner Loop Header: Depth=1 11922 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 11923 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 11924 psubq xmm1, xmm0 11925 psubq xmm2, xmm0 11926 movdqu xmmword ptr [r8 + 8*rdi], xmm1 11927 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 11928 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] 11929 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] 11930 psubq xmm1, xmm0 11931 psubq xmm2, xmm0 11932 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1 11933 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2 11934 add rdi, 8 11935 add rcx, 2 11936 jne .LBB1_581 11937 jmp .LBB1_958 11938 .LBB1_582: 11939 mov ecx, eax 11940 and ecx, -8 11941 movaps xmm1, xmm0 11942 shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0] 11943 lea rsi, [rcx - 8] 11944 mov r9, rsi 11945 shr 
r9, 3 11946 add r9, 1 11947 test rsi, rsi 11948 je .LBB1_965 11949 # %bb.583: 11950 mov rsi, r9 11951 and rsi, -2 11952 neg rsi 11953 xor edi, edi 11954 .LBB1_584: # =>This Inner Loop Header: Depth=1 11955 movups xmm2, xmmword ptr [rdx + 4*rdi] 11956 movups xmm3, xmmword ptr [rdx + 4*rdi + 16] 11957 subps xmm2, xmm1 11958 subps xmm3, xmm1 11959 movups xmmword ptr [r8 + 4*rdi], xmm2 11960 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 11961 movups xmm2, xmmword ptr [rdx + 4*rdi + 32] 11962 movups xmm3, xmmword ptr [rdx + 4*rdi + 48] 11963 subps xmm2, xmm1 11964 subps xmm3, xmm1 11965 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 11966 movups xmmword ptr [r8 + 4*rdi + 48], xmm3 11967 add rdi, 16 11968 add rsi, 2 11969 jne .LBB1_584 11970 jmp .LBB1_966 11971 .LBB1_585: 11972 mov edi, r10d 11973 and edi, -32 11974 movzx eax, cl 11975 movd xmm0, eax 11976 pxor xmm1, xmm1 11977 pshufb xmm0, xmm1 11978 lea rax, [rdi - 32] 11979 mov r9, rax 11980 shr r9, 5 11981 add r9, 1 11982 pmovzxbw xmm1, xmm0 # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 11983 test rax, rax 11984 je .LBB1_973 11985 # %bb.586: 11986 mov rsi, r9 11987 and rsi, -2 11988 neg rsi 11989 xor eax, eax 11990 movdqa xmm2, xmm0 11991 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 11992 movdqa xmm3, xmmword ptr [rip + .LCPI1_0] # xmm3 = [255,255,255,255,255,255,255,255] 11993 movdqa xmm4, xmm0 11994 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 11995 .LBB1_587: # =>This Inner Loop Header: Depth=1 11996 movdqu xmm5, xmmword ptr [rdx + rax] 11997 movdqu xmm6, xmmword ptr [rdx + rax + 16] 11998 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 11999 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12000 pmullw xmm5, xmm2 12001 pand xmm5, xmm3 12002 pmullw xmm7, xmm1 12003 pand 
xmm7, xmm3 12004 packuswb xmm7, xmm5 12005 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 12006 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12007 pmullw xmm6, xmm4 12008 pand xmm6, xmm3 12009 pmullw xmm5, xmm1 12010 pand xmm5, xmm3 12011 packuswb xmm5, xmm6 12012 movdqu xmmword ptr [r8 + rax], xmm7 12013 movdqu xmmword ptr [r8 + rax + 16], xmm5 12014 movdqu xmm5, xmmword ptr [rdx + rax + 32] 12015 movdqu xmm6, xmmword ptr [rdx + rax + 48] 12016 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 12017 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12018 pmullw xmm5, xmm2 12019 pand xmm5, xmm3 12020 pmullw xmm7, xmm1 12021 pand xmm7, xmm3 12022 packuswb xmm7, xmm5 12023 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 12024 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12025 pmullw xmm6, xmm4 12026 pand xmm6, xmm3 12027 pmullw xmm5, xmm1 12028 pand xmm5, xmm3 12029 packuswb xmm5, xmm6 12030 movdqu xmmword ptr [r8 + rax + 32], xmm7 12031 movdqu xmmword ptr [r8 + rax + 48], xmm5 12032 add rax, 64 12033 add rsi, 2 12034 jne .LBB1_587 12035 jmp .LBB1_974 12036 .LBB1_588: 12037 mov edi, r10d 12038 and edi, -32 12039 movzx eax, cl 12040 movd xmm0, eax 12041 pxor xmm1, xmm1 12042 pshufb xmm0, xmm1 12043 lea rax, [rdi - 32] 12044 mov r9, rax 12045 shr r9, 5 12046 add r9, 1 12047 pmovzxbw xmm1, xmm0 # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 12048 test rax, rax 12049 je .LBB1_981 12050 # %bb.589: 12051 mov rsi, r9 12052 and rsi, -2 12053 neg rsi 12054 xor eax, eax 12055 movdqa xmm2, xmm0 12056 punpckhbw xmm2, xmm2 # xmm2 = 
xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12057 movdqa xmm3, xmmword ptr [rip + .LCPI1_0] # xmm3 = [255,255,255,255,255,255,255,255] 12058 movdqa xmm4, xmm0 12059 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12060 .LBB1_590: # =>This Inner Loop Header: Depth=1 12061 movdqu xmm5, xmmword ptr [rdx + rax] 12062 movdqu xmm6, xmmword ptr [rdx + rax + 16] 12063 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 12064 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12065 pmullw xmm5, xmm2 12066 pand xmm5, xmm3 12067 pmullw xmm7, xmm1 12068 pand xmm7, xmm3 12069 packuswb xmm7, xmm5 12070 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 12071 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12072 pmullw xmm6, xmm4 12073 pand xmm6, xmm3 12074 pmullw xmm5, xmm1 12075 pand xmm5, xmm3 12076 packuswb xmm5, xmm6 12077 movdqu xmmword ptr [r8 + rax], xmm7 12078 movdqu xmmword ptr [r8 + rax + 16], xmm5 12079 movdqu xmm5, xmmword ptr [rdx + rax + 32] 12080 movdqu xmm6, xmmword ptr [rdx + rax + 48] 12081 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 12082 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12083 pmullw xmm5, xmm2 12084 pand xmm5, xmm3 12085 pmullw xmm7, xmm1 12086 pand xmm7, xmm3 12087 packuswb xmm7, xmm5 12088 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 12089 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12090 pmullw xmm6, xmm4 12091 pand xmm6, xmm3 12092 pmullw xmm5, xmm1 12093 pand xmm5, xmm3 12094 packuswb xmm5, xmm6 12095 movdqu xmmword ptr [r8 + rax + 32], xmm7 
12096 movdqu xmmword ptr [r8 + rax + 48], xmm5 12097 add rax, 64 12098 add rsi, 2 12099 jne .LBB1_590 12100 jmp .LBB1_982 12101 .LBB1_591: 12102 mov esi, r10d 12103 and esi, -32 12104 movzx ecx, al 12105 movd xmm0, ecx 12106 pxor xmm1, xmm1 12107 pshufb xmm0, xmm1 12108 lea rcx, [rsi - 32] 12109 mov r9, rcx 12110 shr r9, 5 12111 add r9, 1 12112 test rcx, rcx 12113 je .LBB1_989 12114 # %bb.592: 12115 mov rcx, r9 12116 and rcx, -2 12117 neg rcx 12118 xor edi, edi 12119 .LBB1_593: # =>This Inner Loop Header: Depth=1 12120 movdqu xmm1, xmmword ptr [rdx + rdi] 12121 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 12122 paddb xmm1, xmm0 12123 paddb xmm2, xmm0 12124 movdqu xmmword ptr [r8 + rdi], xmm1 12125 movdqu xmmword ptr [r8 + rdi + 16], xmm2 12126 movdqu xmm1, xmmword ptr [rdx + rdi + 32] 12127 movdqu xmm2, xmmword ptr [rdx + rdi + 48] 12128 paddb xmm1, xmm0 12129 paddb xmm2, xmm0 12130 movdqu xmmword ptr [r8 + rdi + 32], xmm1 12131 movdqu xmmword ptr [r8 + rdi + 48], xmm2 12132 add rdi, 64 12133 add rcx, 2 12134 jne .LBB1_593 12135 jmp .LBB1_990 12136 .LBB1_594: 12137 mov esi, r10d 12138 and esi, -32 12139 movzx ecx, al 12140 movd xmm0, ecx 12141 pxor xmm1, xmm1 12142 pshufb xmm0, xmm1 12143 lea rcx, [rsi - 32] 12144 mov r9, rcx 12145 shr r9, 5 12146 add r9, 1 12147 test rcx, rcx 12148 je .LBB1_997 12149 # %bb.595: 12150 mov rcx, r9 12151 and rcx, -2 12152 neg rcx 12153 xor edi, edi 12154 .LBB1_596: # =>This Inner Loop Header: Depth=1 12155 movdqu xmm1, xmmword ptr [rdx + rdi] 12156 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 12157 psubb xmm1, xmm0 12158 psubb xmm2, xmm0 12159 movdqu xmmword ptr [r8 + rdi], xmm1 12160 movdqu xmmword ptr [r8 + rdi + 16], xmm2 12161 movdqu xmm1, xmmword ptr [rdx + rdi + 32] 12162 movdqu xmm2, xmmword ptr [rdx + rdi + 48] 12163 psubb xmm1, xmm0 12164 psubb xmm2, xmm0 12165 movdqu xmmword ptr [r8 + rdi + 32], xmm1 12166 movdqu xmmword ptr [r8 + rdi + 48], xmm2 12167 add rdi, 64 12168 add rcx, 2 12169 jne .LBB1_596 12170 jmp .LBB1_998 12171 
# ---------------------------------------------------------------------------
# Vectorised loops over 8-bit and 32-bit integer lanes, followed by the
# matching "odd chunk" tail handlers.  Every load reads from [rdx + index]
# and every store writes to [r8 + index].
#
# Main-loop groups (preheader label / loop label : lane operation):
#   .LBB1_597 / .LBB1_599 : paddb      .LBB1_600 / .LBB1_602 : psubb
#   .LBB1_603 / .LBB1_605 : pmulld     .LBB1_606 / .LBB1_608 : pmulld
#   .LBB1_609 / .LBB1_611 : paddd      .LBB1_612 / .LBB1_614 : psubd
#   .LBB1_615 / .LBB1_617 : paddd      .LBB1_618 / .LBB1_620 : psubd
# Shape shared by every main-loop group (step = 32 for bytes, 8 for dwords):
#   rsi  = r10d with the low bits cleared (and esi, -step).
#   xmm0 = al replicated into all 16 bytes (pxor + pshufb with a zero mask),
#          or eax replicated into all 4 dwords (pshufd 0).
#   r9   = ((rsi - step) >> log2(step)) + 1.
#   If rsi - step is zero the je skips the loop (target not in this span).
#   Otherwise rcx = -(r9 & -2) is counted up to zero in steps of 2 while rdi
#   advances by two steps, so one iteration processes four xmm vectors.
#   After the loop each group jumps to a label that is not in this span.
#
# Tail groups (first label zeroes rdi, second label is the common entry):
#   .LBB1_621 / .LBB1_622 : pmulld     .LBB1_629 / .LBB1_630 : pmulld
#   .LBB1_637 / .LBB1_638 : paddd      .LBB1_645 / .LBB1_646 : psubd
#   .LBB1_653 / .LBB1_654 : paddd      .LBB1_661 / .LBB1_662 : psubd
#   When bit 0 of r9 is set, two more xmm vectors at dword index rdi are
#   combined with xmm0 and stored.  Then rsi is compared with r10: equal
#   jumps to .LBB1_1069, otherwise to .LBB1_625 / _633 / _641 / _649 / _657 /
#   _665 respectively (none of these targets is in this span).
#   NOTE(review): r9, rsi, r10 and xmm0 are live-in for the tail groups;
#   presumably chunk count, vectorised length, total length and broadcast
#   operand - confirm at the jump sources.
# ---------------------------------------------------------------------------
.LBB1_597: 12172 mov esi, r10d 12173 and esi, -32 12174 movzx ecx, al 12175 movd xmm0, ecx 12176 pxor xmm1, xmm1 12177 pshufb xmm0, xmm1 12178 lea rcx, [rsi - 32] 12179 mov r9, rcx 12180 shr r9, 5 12181 add r9, 1 12182 test rcx, rcx 12183 je .LBB1_1005 12184 # %bb.598: 12185 mov rcx, r9 12186 and rcx, -2 12187 neg rcx 12188 xor edi, edi 12189 .LBB1_599: # =>This Inner Loop Header: Depth=1 12190 movdqu xmm1, xmmword ptr [rdx + rdi] 12191 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 12192 paddb xmm1, xmm0 12193 paddb xmm2, xmm0 12194 movdqu xmmword ptr [r8 + rdi], xmm1 12195 movdqu xmmword ptr [r8 + rdi + 16], xmm2 12196 movdqu xmm1, xmmword ptr [rdx + rdi + 32] 12197 movdqu xmm2, xmmword ptr [rdx + rdi + 48] 12198 paddb xmm1, xmm0 12199 paddb xmm2, xmm0 12200 movdqu xmmword ptr [r8 + rdi + 32], xmm1 12201 movdqu xmmword ptr [r8 + rdi + 48], xmm2 12202 add rdi, 64 12203 add rcx, 2 12204 jne .LBB1_599 12205 jmp .LBB1_1006 12206 .LBB1_600: 12207 mov esi, r10d 12208 and esi, -32 12209 movzx ecx, al 12210 movd xmm0, ecx 12211 pxor xmm1, xmm1 12212 pshufb xmm0, xmm1 12213 lea rcx, [rsi - 32] 12214 mov r9, rcx 12215 shr r9, 5 12216 add r9, 1 12217 test rcx, rcx 12218 je .LBB1_1013 12219 # %bb.601: 12220 mov rcx, r9 12221 and rcx, -2 12222 neg rcx 12223 xor edi, edi 12224 .LBB1_602: # =>This Inner Loop Header: Depth=1 12225 movdqu xmm1, xmmword ptr [rdx + rdi] 12226 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 12227 psubb xmm1, xmm0 12228 psubb xmm2, xmm0 12229 movdqu xmmword ptr [r8 + rdi], xmm1 12230 movdqu xmmword ptr [r8 + rdi + 16], xmm2 12231 movdqu xmm1, xmmword ptr [rdx + rdi + 32] 12232 movdqu xmm2, xmmword ptr [rdx + rdi + 48] 12233 psubb xmm1, xmm0 12234 psubb xmm2, xmm0 12235 movdqu xmmword ptr [r8 + rdi + 32], xmm1 12236 movdqu xmmword ptr [r8 + rdi + 48], xmm2 12237 add rdi, 64 12238 add rcx, 2 12239 jne .LBB1_602 12240 jmp .LBB1_1014 12241 .LBB1_603: 12242 mov esi, r10d 12243 and esi, -8 12244 movd xmm0, eax 12245 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 12246 
lea rcx, [rsi - 8] 12247 mov r9, rcx 12248 shr r9, 3 12249 add r9, 1 12250 test rcx, rcx 12251 je .LBB1_1021 12252 # %bb.604: 12253 mov rcx, r9 12254 and rcx, -2 12255 neg rcx 12256 xor edi, edi 12257 .LBB1_605: # =>This Inner Loop Header: Depth=1 12258 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 12259 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 12260 pmulld xmm1, xmm0 12261 pmulld xmm2, xmm0 12262 movdqu xmmword ptr [r8 + 4*rdi], xmm1 12263 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 12264 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] 12265 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] 12266 pmulld xmm1, xmm0 12267 pmulld xmm2, xmm0 12268 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 12269 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 12270 add rdi, 16 12271 add rcx, 2 12272 jne .LBB1_605 12273 jmp .LBB1_1022 12274 .LBB1_606: 12275 mov esi, r10d 12276 and esi, -8 12277 movd xmm0, eax 12278 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 12279 lea rcx, [rsi - 8] 12280 mov r9, rcx 12281 shr r9, 3 12282 add r9, 1 12283 test rcx, rcx 12284 je .LBB1_1029 12285 # %bb.607: 12286 mov rcx, r9 12287 and rcx, -2 12288 neg rcx 12289 xor edi, edi 12290 .LBB1_608: # =>This Inner Loop Header: Depth=1 12291 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 12292 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 12293 pmulld xmm1, xmm0 12294 pmulld xmm2, xmm0 12295 movdqu xmmword ptr [r8 + 4*rdi], xmm1 12296 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 12297 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] 12298 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] 12299 pmulld xmm1, xmm0 12300 pmulld xmm2, xmm0 12301 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 12302 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 12303 add rdi, 16 12304 add rcx, 2 12305 jne .LBB1_608 12306 jmp .LBB1_1030 12307 .LBB1_609: 12308 mov esi, r10d 12309 and esi, -8 12310 movd xmm0, eax 12311 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 12312 lea rcx, [rsi - 8] 12313 mov r9, rcx 12314 shr r9, 3 12315 add r9, 1 12316 test rcx, rcx 12317 je .LBB1_1037 12318 # 
%bb.610: 12319 mov rcx, r9 12320 and rcx, -2 12321 neg rcx 12322 xor edi, edi 12323 .LBB1_611: # =>This Inner Loop Header: Depth=1 12324 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 12325 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 12326 paddd xmm1, xmm0 12327 paddd xmm2, xmm0 12328 movdqu xmmword ptr [r8 + 4*rdi], xmm1 12329 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 12330 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] 12331 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] 12332 paddd xmm1, xmm0 12333 paddd xmm2, xmm0 12334 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 12335 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 12336 add rdi, 16 12337 add rcx, 2 12338 jne .LBB1_611 12339 jmp .LBB1_1038 12340 .LBB1_612: 12341 mov esi, r10d 12342 and esi, -8 12343 movd xmm0, eax 12344 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 12345 lea rcx, [rsi - 8] 12346 mov r9, rcx 12347 shr r9, 3 12348 add r9, 1 12349 test rcx, rcx 12350 je .LBB1_1045 12351 # %bb.613: 12352 mov rcx, r9 12353 and rcx, -2 12354 neg rcx 12355 xor edi, edi 12356 .LBB1_614: # =>This Inner Loop Header: Depth=1 12357 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 12358 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 12359 psubd xmm1, xmm0 12360 psubd xmm2, xmm0 12361 movdqu xmmword ptr [r8 + 4*rdi], xmm1 12362 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 12363 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] 12364 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] 12365 psubd xmm1, xmm0 12366 psubd xmm2, xmm0 12367 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 12368 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 12369 add rdi, 16 12370 add rcx, 2 12371 jne .LBB1_614 12372 jmp .LBB1_1046 12373 .LBB1_615: 12374 mov esi, r10d 12375 and esi, -8 12376 movd xmm0, eax 12377 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 12378 lea rcx, [rsi - 8] 12379 mov r9, rcx 12380 shr r9, 3 12381 add r9, 1 12382 test rcx, rcx 12383 je .LBB1_1053 12384 # %bb.616: 12385 mov rcx, r9 12386 and rcx, -2 12387 neg rcx 12388 xor edi, edi 12389 .LBB1_617: # =>This Inner Loop Header: Depth=1 
12390 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 12391 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 12392 paddd xmm1, xmm0 12393 paddd xmm2, xmm0 12394 movdqu xmmword ptr [r8 + 4*rdi], xmm1 12395 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 12396 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] 12397 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] 12398 paddd xmm1, xmm0 12399 paddd xmm2, xmm0 12400 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 12401 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 12402 add rdi, 16 12403 add rcx, 2 12404 jne .LBB1_617 12405 jmp .LBB1_1054 12406 .LBB1_618: 12407 mov esi, r10d 12408 and esi, -8 12409 movd xmm0, eax 12410 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 12411 lea rcx, [rsi - 8] 12412 mov r9, rcx 12413 shr r9, 3 12414 add r9, 1 12415 test rcx, rcx 12416 je .LBB1_1061 12417 # %bb.619: 12418 mov rcx, r9 12419 and rcx, -2 12420 neg rcx 12421 xor edi, edi 12422 .LBB1_620: # =>This Inner Loop Header: Depth=1 12423 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 12424 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 12425 psubd xmm1, xmm0 12426 psubd xmm2, xmm0 12427 movdqu xmmword ptr [r8 + 4*rdi], xmm1 12428 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 12429 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] 12430 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] 12431 psubd xmm1, xmm0 12432 psubd xmm2, xmm0 12433 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 12434 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 12435 add rdi, 16 12436 add rcx, 2 12437 jne .LBB1_620 12438 jmp .LBB1_1062 12439 .LBB1_621: 12440 xor edi, edi 12441 .LBB1_622: 12442 test r9b, 1 12443 je .LBB1_624 12444 # %bb.623: 12445 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 12446 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 12447 pmulld xmm1, xmm0 12448 pmulld xmm2, xmm0 12449 movdqu xmmword ptr [r8 + 4*rdi], xmm1 12450 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 12451 .LBB1_624: 12452 cmp rsi, r10 12453 je .LBB1_1069 12454 jmp .LBB1_625 12455 .LBB1_629: 12456 xor edi, edi 12457 .LBB1_630: 12458 test r9b, 1 12459 je .LBB1_632 
12460 # %bb.631: 12461 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 12462 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 12463 pmulld xmm1, xmm0 12464 pmulld xmm2, xmm0 12465 movdqu xmmword ptr [r8 + 4*rdi], xmm1 12466 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 12467 .LBB1_632: 12468 cmp rsi, r10 12469 je .LBB1_1069 12470 jmp .LBB1_633 12471 .LBB1_637: 12472 xor edi, edi 12473 .LBB1_638: 12474 test r9b, 1 12475 je .LBB1_640 12476 # %bb.639: 12477 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 12478 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 12479 paddd xmm1, xmm0 12480 paddd xmm2, xmm0 12481 movdqu xmmword ptr [r8 + 4*rdi], xmm1 12482 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 12483 .LBB1_640: 12484 cmp rsi, r10 12485 je .LBB1_1069 12486 jmp .LBB1_641 12487 .LBB1_645: 12488 xor edi, edi 12489 .LBB1_646: 12490 test r9b, 1 12491 je .LBB1_648 12492 # %bb.647: 12493 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 12494 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 12495 psubd xmm1, xmm0 12496 psubd xmm2, xmm0 12497 movdqu xmmword ptr [r8 + 4*rdi], xmm1 12498 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 12499 .LBB1_648: 12500 cmp rsi, r10 12501 je .LBB1_1069 12502 jmp .LBB1_649 12503 .LBB1_653: 12504 xor edi, edi 12505 .LBB1_654: 12506 test r9b, 1 12507 je .LBB1_656 12508 # %bb.655: 12509 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 12510 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 12511 paddd xmm1, xmm0 12512 paddd xmm2, xmm0 12513 movdqu xmmword ptr [r8 + 4*rdi], xmm1 12514 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 12515 .LBB1_656: 12516 cmp rsi, r10 12517 je .LBB1_1069 12518 jmp .LBB1_657 12519 .LBB1_661: 12520 xor edi, edi 12521 .LBB1_662: 12522 test r9b, 1 12523 je .LBB1_664 12524 # %bb.663: 12525 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 12526 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 12527 psubd xmm1, xmm0 12528 psubd xmm2, xmm0 12529 movdqu xmmword ptr [r8 + 4*rdi], xmm1 12530 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 12531 .LBB1_664: 12532 cmp rsi, r10 12533 je .LBB1_1069 12534 jmp .LBB1_665 
# ---------------------------------------------------------------------------
# "Odd chunk" tail handlers for double-precision, byte-multiply and byte
# add/sub paths.  In every group the first label zeroes the index register
# and falls into the second label, which tests bit 0 of r9; when it is set,
# two more xmm vectors are loaded from [rdx], processed and stored to [r8].
# The closing label then compares two registers and jumps to .LBB1_1069 when
# they are equal, otherwise to a per-group label (neither is in this span).
#
# Double precision (index rdi scaled by 8, operand vector xmm1, closing
# compare rcx vs rax):
#   .LBB1_669 / .LBB1_670 : mulpd      .LBB1_677 / .LBB1_678 : mulpd
#   .LBB1_685 / .LBB1_686 : addpd      .LBB1_693 / .LBB1_694 : subpd
#   .LBB1_701 / .LBB1_702 : addpd      .LBB1_709 / .LBB1_710 : subpd
#
# Byte multiply (byte index rax, closing compare rdi vs r10):
#   .LBB1_717 / .LBB1_718   and   .LBB1_725 / .LBB1_726
#   The product is formed in 16-bit lanes: the low 8 bytes of each input
#   vector are zero-extended with pmovzxbw and multiplied by xmm1, the high
#   8 bytes are widened with punpckhbw and multiplied by punpckhbw(xmm0).
#   pand with .LCPI1_0 (255 in every word) keeps only the low byte of each
#   product and packuswb merges the two halves back into 16 bytes.
#   xmm0 is overwritten by "punpckhbw xmm0, xmm0" inside the handler.
#   NOTE(review): xmm0 and xmm1 are live-in; presumably the byte broadcast
#   of the operand and its pmovzxbw-widened low half - confirm at the jump
#   sources.
#
# Byte add/sub (byte index rdi, operand vector xmm0, closing compare
# rsi vs r10):
#   .LBB1_733 / .LBB1_734 : paddb      .LBB1_741 / .LBB1_742 : psubb
# ---------------------------------------------------------------------------
12535 .LBB1_669: 12536 xor edi, edi 12537 .LBB1_670: 12538 test r9b, 1 12539 je .LBB1_672 12540 # %bb.671: 12541 movupd xmm2, xmmword ptr [rdx + 8*rdi] 12542 movupd xmm3, xmmword ptr [rdx + 8*rdi + 16] 12543 mulpd xmm2, xmm1 12544 mulpd xmm3, xmm1 12545 movupd xmmword ptr [r8 + 8*rdi], xmm2 12546 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 12547 .LBB1_672: 12548 cmp rcx, rax 12549 je .LBB1_1069 12550 jmp .LBB1_673 12551 .LBB1_677: 12552 xor edi, edi 12553 .LBB1_678: 12554 test r9b, 1 12555 je .LBB1_680 12556 # %bb.679: 12557 movupd xmm2, xmmword ptr [rdx + 8*rdi] 12558 movupd xmm3, xmmword ptr [rdx + 8*rdi + 16] 12559 mulpd xmm2, xmm1 12560 mulpd xmm3, xmm1 12561 movupd xmmword ptr [r8 + 8*rdi], xmm2 12562 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 12563 .LBB1_680: 12564 cmp rcx, rax 12565 je .LBB1_1069 12566 jmp .LBB1_681 12567 .LBB1_685: 12568 xor edi, edi 12569 .LBB1_686: 12570 test r9b, 1 12571 je .LBB1_688 12572 # %bb.687: 12573 movupd xmm2, xmmword ptr [rdx + 8*rdi] 12574 movupd xmm3, xmmword ptr [rdx + 8*rdi + 16] 12575 addpd xmm2, xmm1 12576 addpd xmm3, xmm1 12577 movupd xmmword ptr [r8 + 8*rdi], xmm2 12578 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 12579 .LBB1_688: 12580 cmp rcx, rax 12581 je .LBB1_1069 12582 jmp .LBB1_689 12583 .LBB1_693: 12584 xor edi, edi 12585 .LBB1_694: 12586 test r9b, 1 12587 je .LBB1_696 12588 # %bb.695: 12589 movupd xmm2, xmmword ptr [rdx + 8*rdi] 12590 movupd xmm3, xmmword ptr [rdx + 8*rdi + 16] 12591 subpd xmm2, xmm1 12592 subpd xmm3, xmm1 12593 movupd xmmword ptr [r8 + 8*rdi], xmm2 12594 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 12595 .LBB1_696: 12596 cmp rcx, rax 12597 je .LBB1_1069 12598 jmp .LBB1_697 12599 .LBB1_701: 12600 xor edi, edi 12601 .LBB1_702: 12602 test r9b, 1 12603 je .LBB1_704 12604 # %bb.703: 12605 movupd xmm2, xmmword ptr [rdx + 8*rdi] 12606 movupd xmm3, xmmword ptr [rdx + 8*rdi + 16] 12607 addpd xmm2, xmm1 12608 addpd xmm3, xmm1 12609 movupd xmmword ptr [r8 + 8*rdi], xmm2 12610 movupd xmmword ptr [r8 + 8*rdi + 
16], xmm3 12611 .LBB1_704: 12612 cmp rcx, rax 12613 je .LBB1_1069 12614 jmp .LBB1_705 12615 .LBB1_709: 12616 xor edi, edi 12617 .LBB1_710: 12618 test r9b, 1 12619 je .LBB1_712 12620 # %bb.711: 12621 movupd xmm2, xmmword ptr [rdx + 8*rdi] 12622 movupd xmm3, xmmword ptr [rdx + 8*rdi + 16] 12623 subpd xmm2, xmm1 12624 subpd xmm3, xmm1 12625 movupd xmmword ptr [r8 + 8*rdi], xmm2 12626 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 12627 .LBB1_712: 12628 cmp rcx, rax 12629 je .LBB1_1069 12630 jmp .LBB1_713 12631 .LBB1_717: 12632 xor eax, eax 12633 .LBB1_718: 12634 test r9b, 1 12635 je .LBB1_720 12636 # %bb.719: 12637 movdqu xmm2, xmmword ptr [rdx + rax] 12638 movdqu xmm3, xmmword ptr [rdx + rax + 16] 12639 movdqa xmm4, xmm0 12640 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12641 pmovzxbw xmm5, xmm2 # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 12642 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12643 pmullw xmm2, xmm4 12644 movdqa xmm4, xmmword ptr [rip + .LCPI1_0] # xmm4 = [255,255,255,255,255,255,255,255] 12645 pand xmm2, xmm4 12646 pmullw xmm5, xmm1 12647 pand xmm5, xmm4 12648 packuswb xmm5, xmm2 12649 punpckhbw xmm0, xmm0 # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12650 pmovzxbw xmm2, xmm3 # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 12651 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12652 pmullw xmm3, xmm0 12653 pand xmm3, xmm4 12654 pmullw xmm2, xmm1 12655 pand xmm2, xmm4 12656 packuswb xmm2, xmm3 12657 movdqu xmmword ptr [r8 + rax], xmm5 12658 movdqu xmmword ptr [r8 + rax + 16], xmm2 12659 .LBB1_720: 12660 cmp rdi, r10 12661 je .LBB1_1069 12662 jmp .LBB1_721 12663 .LBB1_725: 12664 xor eax, eax 12665 .LBB1_726: 12666 test r9b, 1 12667 je .LBB1_728 12668 # %bb.727: 12669 movdqu xmm2, xmmword ptr [rdx + rax] 12670 
movdqu xmm3, xmmword ptr [rdx + rax + 16] 12671 movdqa xmm4, xmm0 12672 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12673 pmovzxbw xmm5, xmm2 # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 12674 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12675 pmullw xmm2, xmm4 12676 movdqa xmm4, xmmword ptr [rip + .LCPI1_0] # xmm4 = [255,255,255,255,255,255,255,255] 12677 pand xmm2, xmm4 12678 pmullw xmm5, xmm1 12679 pand xmm5, xmm4 12680 packuswb xmm5, xmm2 12681 punpckhbw xmm0, xmm0 # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12682 pmovzxbw xmm2, xmm3 # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 12683 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 12684 pmullw xmm3, xmm0 12685 pand xmm3, xmm4 12686 pmullw xmm2, xmm1 12687 pand xmm2, xmm4 12688 packuswb xmm2, xmm3 12689 movdqu xmmword ptr [r8 + rax], xmm5 12690 movdqu xmmword ptr [r8 + rax + 16], xmm2 12691 .LBB1_728: 12692 cmp rdi, r10 12693 je .LBB1_1069 12694 jmp .LBB1_729 12695 .LBB1_733: 12696 xor edi, edi 12697 .LBB1_734: 12698 test r9b, 1 12699 je .LBB1_736 12700 # %bb.735: 12701 movdqu xmm1, xmmword ptr [rdx + rdi] 12702 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 12703 paddb xmm1, xmm0 12704 paddb xmm2, xmm0 12705 movdqu xmmword ptr [r8 + rdi], xmm1 12706 movdqu xmmword ptr [r8 + rdi + 16], xmm2 12707 .LBB1_736: 12708 cmp rsi, r10 12709 je .LBB1_1069 12710 jmp .LBB1_737 12711 .LBB1_741: 12712 xor edi, edi 12713 .LBB1_742: 12714 test r9b, 1 12715 je .LBB1_744 12716 # %bb.743: 12717 movdqu xmm1, xmmword ptr [rdx + rdi] 12718 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 12719 psubb xmm1, xmm0 12720 psubb xmm2, xmm0 12721 movdqu xmmword ptr [r8 + rdi], xmm1 12722 movdqu xmmword ptr [r8 + rdi + 16], xmm2 12723 .LBB1_744: 12724 cmp rsi, r10 12725 je .LBB1_1069 12726 jmp .LBB1_745 
12727 .LBB1_749: 12728 xor edi, edi 12729 .LBB1_750: 12730 test r9b, 1 12731 je .LBB1_752 12732 # %bb.751: 12733 movdqu xmm1, xmmword ptr [rdx + rdi] 12734 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 12735 paddb xmm1, xmm0 12736 paddb xmm2, xmm0 12737 movdqu xmmword ptr [r8 + rdi], xmm1 12738 movdqu xmmword ptr [r8 + rdi + 16], xmm2 12739 .LBB1_752: 12740 cmp rsi, r10 12741 je .LBB1_1069 12742 jmp .LBB1_753 12743 .LBB1_757: 12744 xor edi, edi 12745 .LBB1_758: 12746 test r9b, 1 12747 je .LBB1_760 12748 # %bb.759: 12749 movdqu xmm1, xmmword ptr [rdx + rdi] 12750 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 12751 psubb xmm1, xmm0 12752 psubb xmm2, xmm0 12753 movdqu xmmword ptr [r8 + rdi], xmm1 12754 movdqu xmmword ptr [r8 + rdi + 16], xmm2 12755 .LBB1_760: 12756 cmp rsi, r10 12757 je .LBB1_1069 12758 jmp .LBB1_761 12759 .LBB1_765: 12760 xor edi, edi 12761 .LBB1_766: 12762 test r9b, 1 12763 je .LBB1_768 12764 # %bb.767: 12765 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 12766 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 12767 paddq xmm1, xmm0 12768 paddq xmm2, xmm0 12769 movdqu xmmword ptr [r8 + 8*rdi], xmm1 12770 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 12771 .LBB1_768: 12772 cmp rsi, r10 12773 je .LBB1_1069 12774 jmp .LBB1_769 12775 .LBB1_773: 12776 xor edi, edi 12777 .LBB1_774: 12778 test r9b, 1 12779 je .LBB1_776 12780 # %bb.775: 12781 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 12782 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 12783 psubq xmm1, xmm0 12784 psubq xmm2, xmm0 12785 movdqu xmmword ptr [r8 + 8*rdi], xmm1 12786 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 12787 .LBB1_776: 12788 cmp rsi, r10 12789 je .LBB1_1069 12790 jmp .LBB1_777 12791 .LBB1_781: 12792 xor edi, edi 12793 .LBB1_782: 12794 test r9b, 1 12795 je .LBB1_784 12796 # %bb.783: 12797 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 12798 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 12799 paddq xmm1, xmm0 12800 paddq xmm2, xmm0 12801 movdqu xmmword ptr [r8 + 8*rdi], xmm1 12802 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 12803 
.LBB1_784: 12804 cmp rsi, r10 12805 je .LBB1_1069 12806 jmp .LBB1_785 12807 .LBB1_789: 12808 xor edi, edi 12809 .LBB1_790: 12810 test r9b, 1 12811 je .LBB1_792 12812 # %bb.791: 12813 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 12814 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 12815 psubq xmm1, xmm0 12816 psubq xmm2, xmm0 12817 movdqu xmmword ptr [r8 + 8*rdi], xmm1 12818 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 12819 .LBB1_792: 12820 cmp rsi, r10 12821 je .LBB1_1069 12822 jmp .LBB1_793 12823 .LBB1_797: 12824 xor edi, edi 12825 .LBB1_798: 12826 test r9b, 1 12827 je .LBB1_800 12828 # %bb.799: 12829 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 12830 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 12831 pmullw xmm1, xmm0 12832 pmullw xmm2, xmm0 12833 movdqu xmmword ptr [r8 + 2*rdi], xmm1 12834 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 12835 .LBB1_800: 12836 cmp rsi, r10 12837 je .LBB1_1069 12838 jmp .LBB1_801 12839 .LBB1_805: 12840 xor edi, edi 12841 .LBB1_806: 12842 test r9b, 1 12843 je .LBB1_808 12844 # %bb.807: 12845 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 12846 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 12847 pmullw xmm1, xmm0 12848 pmullw xmm2, xmm0 12849 movdqu xmmword ptr [r8 + 2*rdi], xmm1 12850 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 12851 .LBB1_808: 12852 cmp rsi, r10 12853 je .LBB1_1069 12854 jmp .LBB1_809 12855 .LBB1_813: 12856 xor edi, edi 12857 .LBB1_814: 12858 test r9b, 1 12859 je .LBB1_816 12860 # %bb.815: 12861 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 12862 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 12863 pmullw xmm1, xmm0 12864 pmullw xmm2, xmm0 12865 movdqu xmmword ptr [r8 + 2*rdi], xmm1 12866 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 12867 .LBB1_816: 12868 cmp rsi, r10 12869 je .LBB1_1069 12870 jmp .LBB1_817 12871 .LBB1_821: 12872 xor edi, edi 12873 .LBB1_822: 12874 test r9b, 1 12875 je .LBB1_824 12876 # %bb.823: 12877 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 12878 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 12879 pmullw xmm1, xmm0 12880 pmullw xmm2, xmm0 12881 
movdqu xmmword ptr [r8 + 2*rdi], xmm1 12882 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 12883 .LBB1_824: 12884 cmp rsi, r10 12885 je .LBB1_1069 12886 jmp .LBB1_825 12887 .LBB1_829: 12888 xor edi, edi 12889 .LBB1_830: 12890 test r9b, 1 12891 je .LBB1_832 12892 # %bb.831: 12893 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 12894 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 12895 paddw xmm1, xmm0 12896 paddw xmm2, xmm0 12897 movdqu xmmword ptr [r8 + 2*rdi], xmm1 12898 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 12899 .LBB1_832: 12900 cmp rsi, r10 12901 je .LBB1_1069 12902 jmp .LBB1_833 12903 .LBB1_837: 12904 xor edi, edi 12905 .LBB1_838: 12906 test r9b, 1 12907 je .LBB1_840 12908 # %bb.839: 12909 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 12910 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 12911 paddw xmm1, xmm0 12912 paddw xmm2, xmm0 12913 movdqu xmmword ptr [r8 + 2*rdi], xmm1 12914 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 12915 .LBB1_840: 12916 cmp rsi, r10 12917 je .LBB1_1069 12918 jmp .LBB1_841 12919 .LBB1_845: 12920 xor edi, edi 12921 .LBB1_846: 12922 test r9b, 1 12923 je .LBB1_848 12924 # %bb.847: 12925 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 12926 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 12927 psubw xmm1, xmm0 12928 psubw xmm2, xmm0 12929 movdqu xmmword ptr [r8 + 2*rdi], xmm1 12930 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 12931 .LBB1_848: 12932 cmp rsi, r10 12933 je .LBB1_1069 12934 jmp .LBB1_849 12935 .LBB1_853: 12936 xor edi, edi 12937 .LBB1_854: 12938 test r9b, 1 12939 je .LBB1_856 12940 # %bb.855: 12941 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 12942 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 12943 psubw xmm1, xmm0 12944 psubw xmm2, xmm0 12945 movdqu xmmword ptr [r8 + 2*rdi], xmm1 12946 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 12947 .LBB1_856: 12948 cmp rsi, r10 12949 je .LBB1_1069 12950 jmp .LBB1_857 12951 .LBB1_861: 12952 xor edi, edi 12953 .LBB1_862: 12954 test r9b, 1 12955 je .LBB1_864 12956 # %bb.863: 12957 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 12958 movdqu 
xmm2, xmmword ptr [rdx + 2*rdi + 16] 12959 paddw xmm1, xmm0 12960 paddw xmm2, xmm0 12961 movdqu xmmword ptr [r8 + 2*rdi], xmm1 12962 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 12963 .LBB1_864: 12964 cmp rsi, r10 12965 je .LBB1_1069 12966 jmp .LBB1_865 12967 .LBB1_869: 12968 xor edi, edi 12969 .LBB1_870: 12970 test r9b, 1 12971 je .LBB1_872 12972 # %bb.871: 12973 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 12974 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 12975 paddw xmm1, xmm0 12976 paddw xmm2, xmm0 12977 movdqu xmmword ptr [r8 + 2*rdi], xmm1 12978 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 12979 .LBB1_872: 12980 cmp rsi, r10 12981 je .LBB1_1069 12982 jmp .LBB1_873 12983 .LBB1_877: 12984 xor edi, edi 12985 .LBB1_878: 12986 test r9b, 1 12987 je .LBB1_880 12988 # %bb.879: 12989 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 12990 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 12991 psubw xmm1, xmm0 12992 psubw xmm2, xmm0 12993 movdqu xmmword ptr [r8 + 2*rdi], xmm1 12994 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 12995 .LBB1_880: 12996 cmp rsi, r10 12997 je .LBB1_1069 12998 jmp .LBB1_881 12999 .LBB1_885: 13000 xor edi, edi 13001 .LBB1_886: 13002 test r9b, 1 13003 je .LBB1_888 13004 # %bb.887: 13005 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 13006 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 13007 psubw xmm1, xmm0 13008 psubw xmm2, xmm0 13009 movdqu xmmword ptr [r8 + 2*rdi], xmm1 13010 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 13011 .LBB1_888: 13012 cmp rsi, r10 13013 je .LBB1_1069 13014 jmp .LBB1_889 13015 .LBB1_893: 13016 xor edi, edi 13017 .LBB1_894: 13018 test r9b, 1 13019 je .LBB1_896 13020 # %bb.895: 13021 movups xmm2, xmmword ptr [rdx + 4*rdi] 13022 movups xmm3, xmmword ptr [rdx + 4*rdi + 16] 13023 mulps xmm2, xmm1 13024 mulps xmm3, xmm1 13025 movups xmmword ptr [r8 + 4*rdi], xmm2 13026 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 13027 .LBB1_896: 13028 cmp rcx, rax 13029 je .LBB1_1069 13030 jmp .LBB1_897 13031 .LBB1_901: 13032 xor edi, edi 13033 .LBB1_902: 13034 test r9b, 1 13035 je 
.LBB1_904 13036 # %bb.903: 13037 movups xmm2, xmmword ptr [rdx + 4*rdi] 13038 movups xmm3, xmmword ptr [rdx + 4*rdi + 16] 13039 mulps xmm2, xmm1 13040 mulps xmm3, xmm1 13041 movups xmmword ptr [r8 + 4*rdi], xmm2 13042 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 13043 .LBB1_904: 13044 cmp rcx, rax 13045 je .LBB1_1069 13046 jmp .LBB1_905 13047 .LBB1_909: 13048 xor edi, edi 13049 .LBB1_910: 13050 test r9b, 1 13051 je .LBB1_912 13052 # %bb.911: 13053 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 13054 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 13055 paddq xmm1, xmm0 13056 paddq xmm2, xmm0 13057 movdqu xmmword ptr [r8 + 8*rdi], xmm1 13058 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 13059 .LBB1_912: 13060 cmp rsi, r10 13061 je .LBB1_1069 13062 jmp .LBB1_913 13063 .LBB1_917: 13064 xor edi, edi 13065 .LBB1_918: 13066 test r9b, 1 13067 je .LBB1_920 13068 # %bb.919: 13069 movups xmm2, xmmword ptr [rdx + 4*rdi] 13070 movups xmm3, xmmword ptr [rdx + 4*rdi + 16] 13071 addps xmm2, xmm1 13072 addps xmm3, xmm1 13073 movups xmmword ptr [r8 + 4*rdi], xmm2 13074 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 13075 .LBB1_920: 13076 cmp rcx, rax 13077 je .LBB1_1069 13078 jmp .LBB1_921 13079 .LBB1_925: 13080 xor edi, edi 13081 .LBB1_926: 13082 test r9b, 1 13083 je .LBB1_928 13084 # %bb.927: 13085 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 13086 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 13087 psubq xmm1, xmm0 13088 psubq xmm2, xmm0 13089 movdqu xmmword ptr [r8 + 8*rdi], xmm1 13090 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 13091 .LBB1_928: 13092 cmp rsi, r10 13093 je .LBB1_1069 13094 jmp .LBB1_929 13095 .LBB1_933: 13096 xor edi, edi 13097 .LBB1_934: 13098 test r9b, 1 13099 je .LBB1_936 13100 # %bb.935: 13101 movups xmm2, xmmword ptr [rdx + 4*rdi] 13102 movups xmm3, xmmword ptr [rdx + 4*rdi + 16] 13103 subps xmm2, xmm1 13104 subps xmm3, xmm1 13105 movups xmmword ptr [r8 + 4*rdi], xmm2 13106 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 13107 .LBB1_936: 13108 cmp rcx, rax 13109 je .LBB1_1069 13110 jmp 
.LBB1_937 13111 .LBB1_941: 13112 xor edi, edi 13113 .LBB1_942: 13114 test r9b, 1 13115 je .LBB1_944 13116 # %bb.943: 13117 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 13118 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 13119 paddq xmm1, xmm0 13120 paddq xmm2, xmm0 13121 movdqu xmmword ptr [r8 + 8*rdi], xmm1 13122 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 13123 .LBB1_944: 13124 cmp rsi, r10 13125 je .LBB1_1069 13126 jmp .LBB1_945 13127 .LBB1_949: 13128 xor edi, edi 13129 .LBB1_950: 13130 test r9b, 1 13131 je .LBB1_952 13132 # %bb.951: 13133 movups xmm2, xmmword ptr [rdx + 4*rdi] 13134 movups xmm3, xmmword ptr [rdx + 4*rdi + 16] 13135 addps xmm2, xmm1 13136 addps xmm3, xmm1 13137 movups xmmword ptr [r8 + 4*rdi], xmm2 13138 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 13139 .LBB1_952: 13140 cmp rcx, rax 13141 je .LBB1_1069 13142 jmp .LBB1_953 13143 .LBB1_957: 13144 xor edi, edi 13145 .LBB1_958: 13146 test r9b, 1 13147 je .LBB1_960 13148 # %bb.959: 13149 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 13150 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 13151 psubq xmm1, xmm0 13152 psubq xmm2, xmm0 13153 movdqu xmmword ptr [r8 + 8*rdi], xmm1 13154 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 13155 .LBB1_960: 13156 cmp rsi, r10 13157 je .LBB1_1069 13158 jmp .LBB1_961 13159 .LBB1_965: 13160 xor edi, edi 13161 .LBB1_966: 13162 test r9b, 1 13163 je .LBB1_968 13164 # %bb.967: 13165 movups xmm2, xmmword ptr [rdx + 4*rdi] 13166 movups xmm3, xmmword ptr [rdx + 4*rdi + 16] 13167 subps xmm2, xmm1 13168 subps xmm3, xmm1 13169 movups xmmword ptr [r8 + 4*rdi], xmm2 13170 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 13171 .LBB1_968: 13172 cmp rcx, rax 13173 je .LBB1_1069 13174 jmp .LBB1_969 13175 .LBB1_973: 13176 xor eax, eax 13177 .LBB1_974: 13178 test r9b, 1 13179 je .LBB1_976 13180 # %bb.975: 13181 movdqu xmm2, xmmword ptr [rdx + rax] 13182 movdqu xmm3, xmmword ptr [rdx + rax + 16] 13183 movdqa xmm4, xmm0 13184 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 13185 
pmovzxbw xmm5, xmm2 # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 13186 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 13187 pmullw xmm2, xmm4 13188 movdqa xmm4, xmmword ptr [rip + .LCPI1_0] # xmm4 = [255,255,255,255,255,255,255,255] 13189 pand xmm2, xmm4 13190 pmullw xmm5, xmm1 13191 pand xmm5, xmm4 13192 packuswb xmm5, xmm2 13193 punpckhbw xmm0, xmm0 # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 13194 pmovzxbw xmm2, xmm3 # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 13195 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 13196 pmullw xmm3, xmm0 13197 pand xmm3, xmm4 13198 pmullw xmm2, xmm1 13199 pand xmm2, xmm4 13200 packuswb xmm2, xmm3 13201 movdqu xmmword ptr [r8 + rax], xmm5 13202 movdqu xmmword ptr [r8 + rax + 16], xmm2 13203 .LBB1_976: 13204 cmp rdi, r10 13205 je .LBB1_1069 13206 jmp .LBB1_977 13207 .LBB1_981: 13208 xor eax, eax 13209 .LBB1_982: 13210 test r9b, 1 13211 je .LBB1_984 13212 # %bb.983: 13213 movdqu xmm2, xmmword ptr [rdx + rax] 13214 movdqu xmm3, xmmword ptr [rdx + rax + 16] 13215 movdqa xmm4, xmm0 13216 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 13217 pmovzxbw xmm5, xmm2 # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 13218 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 13219 pmullw xmm2, xmm4 13220 movdqa xmm4, xmmword ptr [rip + .LCPI1_0] # xmm4 = [255,255,255,255,255,255,255,255] 13221 pand xmm2, xmm4 13222 pmullw xmm5, xmm1 13223 pand xmm5, xmm4 13224 packuswb xmm5, xmm2 13225 punpckhbw xmm0, xmm0 # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 13226 pmovzxbw xmm2, xmm3 # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 13227 
punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 13228 pmullw xmm3, xmm0 13229 pand xmm3, xmm4 13230 pmullw xmm2, xmm1 13231 pand xmm2, xmm4 13232 packuswb xmm2, xmm3 13233 movdqu xmmword ptr [r8 + rax], xmm5 13234 movdqu xmmword ptr [r8 + rax + 16], xmm2 13235 .LBB1_984: 13236 cmp rdi, r10 13237 je .LBB1_1069 13238 jmp .LBB1_985 13239 .LBB1_989: 13240 xor edi, edi 13241 .LBB1_990: 13242 test r9b, 1 13243 je .LBB1_992 13244 # %bb.991: 13245 movdqu xmm1, xmmword ptr [rdx + rdi] 13246 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 13247 paddb xmm1, xmm0 13248 paddb xmm2, xmm0 13249 movdqu xmmword ptr [r8 + rdi], xmm1 13250 movdqu xmmword ptr [r8 + rdi + 16], xmm2 13251 .LBB1_992: 13252 cmp rsi, r10 13253 je .LBB1_1069 13254 jmp .LBB1_993 13255 .LBB1_997: 13256 xor edi, edi 13257 .LBB1_998: 13258 test r9b, 1 13259 je .LBB1_1000 13260 # %bb.999: 13261 movdqu xmm1, xmmword ptr [rdx + rdi] 13262 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 13263 psubb xmm1, xmm0 13264 psubb xmm2, xmm0 13265 movdqu xmmword ptr [r8 + rdi], xmm1 13266 movdqu xmmword ptr [r8 + rdi + 16], xmm2 13267 .LBB1_1000: 13268 cmp rsi, r10 13269 je .LBB1_1069 13270 jmp .LBB1_1001 13271 .LBB1_1005: 13272 xor edi, edi 13273 .LBB1_1006: 13274 test r9b, 1 13275 je .LBB1_1008 13276 # %bb.1007: 13277 movdqu xmm1, xmmword ptr [rdx + rdi] 13278 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 13279 paddb xmm1, xmm0 13280 paddb xmm2, xmm0 13281 movdqu xmmword ptr [r8 + rdi], xmm1 13282 movdqu xmmword ptr [r8 + rdi + 16], xmm2 13283 .LBB1_1008: 13284 cmp rsi, r10 13285 je .LBB1_1069 13286 jmp .LBB1_1009 13287 .LBB1_1013: 13288 xor edi, edi 13289 .LBB1_1014: 13290 test r9b, 1 13291 je .LBB1_1016 13292 # %bb.1015: 13293 movdqu xmm1, xmmword ptr [rdx + rdi] 13294 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 13295 psubb xmm1, xmm0 13296 psubb xmm2, xmm0 13297 movdqu xmmword ptr [r8 + rdi], xmm1 13298 movdqu xmmword ptr [r8 + rdi + 16], xmm2 13299 .LBB1_1016: 13300 cmp rsi, r10 13301 je .LBB1_1069 13302 
jmp .LBB1_1017 13303 .LBB1_1021: 13304 xor edi, edi 13305 .LBB1_1022: 13306 test r9b, 1 13307 je .LBB1_1024 13308 # %bb.1023: 13309 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 13310 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 13311 pmulld xmm1, xmm0 13312 pmulld xmm2, xmm0 13313 movdqu xmmword ptr [r8 + 4*rdi], xmm1 13314 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 13315 .LBB1_1024: 13316 cmp rsi, r10 13317 je .LBB1_1069 13318 jmp .LBB1_1025 13319 .LBB1_1029: 13320 xor edi, edi 13321 .LBB1_1030: 13322 test r9b, 1 13323 je .LBB1_1032 13324 # %bb.1031: 13325 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 13326 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 13327 pmulld xmm1, xmm0 13328 pmulld xmm2, xmm0 13329 movdqu xmmword ptr [r8 + 4*rdi], xmm1 13330 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 13331 .LBB1_1032: 13332 cmp rsi, r10 13333 je .LBB1_1069 13334 jmp .LBB1_1033 13335 .LBB1_1037: 13336 xor edi, edi 13337 .LBB1_1038: 13338 test r9b, 1 13339 je .LBB1_1040 13340 # %bb.1039: 13341 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 13342 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 13343 paddd xmm1, xmm0 13344 paddd xmm2, xmm0 13345 movdqu xmmword ptr [r8 + 4*rdi], xmm1 13346 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 13347 .LBB1_1040: 13348 cmp rsi, r10 13349 je .LBB1_1069 13350 jmp .LBB1_1041 13351 .LBB1_1045: 13352 xor edi, edi 13353 .LBB1_1046: 13354 test r9b, 1 13355 je .LBB1_1048 13356 # %bb.1047: 13357 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 13358 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 13359 psubd xmm1, xmm0 13360 psubd xmm2, xmm0 13361 movdqu xmmword ptr [r8 + 4*rdi], xmm1 13362 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 13363 .LBB1_1048: 13364 cmp rsi, r10 13365 je .LBB1_1069 13366 jmp .LBB1_1049 13367 .LBB1_1053: 13368 xor edi, edi 13369 .LBB1_1054: 13370 test r9b, 1 13371 je .LBB1_1056 13372 # %bb.1055: 13373 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 13374 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 13375 paddd xmm1, xmm0 13376 paddd xmm2, xmm0 13377 movdqu xmmword ptr [r8 + 
4*rdi], xmm1 13378 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 13379 .LBB1_1056: 13380 cmp rsi, r10 13381 je .LBB1_1069 13382 jmp .LBB1_1057 13383 .LBB1_1061: 13384 xor edi, edi 13385 .LBB1_1062: 13386 test r9b, 1 13387 je .LBB1_1064 13388 # %bb.1063: 13389 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 13390 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 13391 psubd xmm1, xmm0 13392 psubd xmm2, xmm0 13393 movdqu xmmword ptr [r8 + 4*rdi], xmm1 13394 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 13395 .LBB1_1064: 13396 cmp rsi, r10 13397 je .LBB1_1069 13398 jmp .LBB1_1065 13399 .Lfunc_end1: 13400 .size arithmetic_arr_scalar_sse4, .Lfunc_end1-arithmetic_arr_scalar_sse4 13401 # -- End function 13402 .section .rodata.cst16,"aM",@progbits,16 13403 .p2align 4 # -- Begin function arithmetic_scalar_arr_sse4 13404 .LCPI2_0: 13405 .short 255 # 0xff 13406 .short 255 # 0xff 13407 .short 255 # 0xff 13408 .short 255 # 0xff 13409 .short 255 # 0xff 13410 .short 255 # 0xff 13411 .short 255 # 0xff 13412 .short 255 # 0xff 13413 .text 13414 .globl arithmetic_scalar_arr_sse4 13415 .p2align 4, 0x90 13416 .type arithmetic_scalar_arr_sse4,@function 13417 arithmetic_scalar_arr_sse4: # @arithmetic_scalar_arr_sse4 13418 # %bb.0: 13419 push rbp 13420 mov rbp, rsp 13421 and rsp, -8 13422 cmp sil, 20 13423 jg .LBB2_12 13424 # %bb.1: 13425 test sil, sil 13426 je .LBB2_23 13427 # %bb.2: 13428 cmp sil, 1 13429 je .LBB2_31 13430 # %bb.3: 13431 cmp sil, 2 13432 jne .LBB2_1069 13433 # %bb.4: 13434 cmp edi, 6 13435 jg .LBB2_55 13436 # %bb.5: 13437 cmp edi, 3 13438 jle .LBB2_97 13439 # %bb.6: 13440 cmp edi, 4 13441 je .LBB2_157 13442 # %bb.7: 13443 cmp edi, 5 13444 je .LBB2_160 13445 # %bb.8: 13446 cmp edi, 6 13447 jne .LBB2_1069 13448 # %bb.9: 13449 test r9d, r9d 13450 jle .LBB2_1069 13451 # %bb.10: 13452 mov eax, dword ptr [rdx] 13453 mov r10d, r9d 13454 cmp r9d, 8 13455 jb .LBB2_11 13456 # %bb.265: 13457 lea rdx, [rcx + 4*r10] 13458 cmp rdx, r8 13459 jbe .LBB2_453 13460 # %bb.266: 13461 lea rdx, [r8 + 4*r10] 13462 
cmp rdx, rcx 13463 jbe .LBB2_453 13464 .LBB2_11: 13465 xor esi, esi 13466 .LBB2_625: 13467 mov r9, rsi 13468 not r9 13469 add r9, r10 13470 mov rdi, r10 13471 and rdi, 3 13472 je .LBB2_627 13473 .LBB2_626: # =>This Inner Loop Header: Depth=1 13474 mov edx, dword ptr [rcx + 4*rsi] 13475 imul edx, eax 13476 mov dword ptr [r8 + 4*rsi], edx 13477 add rsi, 1 13478 add rdi, -1 13479 jne .LBB2_626 13480 .LBB2_627: 13481 cmp r9, 3 13482 jb .LBB2_1069 13483 .LBB2_628: # =>This Inner Loop Header: Depth=1 13484 mov edx, dword ptr [rcx + 4*rsi] 13485 imul edx, eax 13486 mov dword ptr [r8 + 4*rsi], edx 13487 mov edx, dword ptr [rcx + 4*rsi + 4] 13488 imul edx, eax 13489 mov dword ptr [r8 + 4*rsi + 4], edx 13490 mov edx, dword ptr [rcx + 4*rsi + 8] 13491 imul edx, eax 13492 mov dword ptr [r8 + 4*rsi + 8], edx 13493 mov edx, dword ptr [rcx + 4*rsi + 12] 13494 imul edx, eax 13495 mov dword ptr [r8 + 4*rsi + 12], edx 13496 add rsi, 4 13497 cmp r10, rsi 13498 jne .LBB2_628 13499 jmp .LBB2_1069 13500 .LBB2_12: 13501 cmp sil, 21 13502 je .LBB2_39 13503 # %bb.13: 13504 cmp sil, 22 13505 je .LBB2_47 13506 # %bb.14: 13507 cmp sil, 23 13508 jne .LBB2_1069 13509 # %bb.15: 13510 cmp edi, 6 13511 jg .LBB2_62 13512 # %bb.16: 13513 cmp edi, 3 13514 jle .LBB2_102 13515 # %bb.17: 13516 cmp edi, 4 13517 je .LBB2_163 13518 # %bb.18: 13519 cmp edi, 5 13520 je .LBB2_166 13521 # %bb.19: 13522 cmp edi, 6 13523 jne .LBB2_1069 13524 # %bb.20: 13525 test r9d, r9d 13526 jle .LBB2_1069 13527 # %bb.21: 13528 mov eax, dword ptr [rdx] 13529 mov r10d, r9d 13530 cmp r9d, 8 13531 jb .LBB2_22 13532 # %bb.268: 13533 lea rdx, [rcx + 4*r10] 13534 cmp rdx, r8 13535 jbe .LBB2_456 13536 # %bb.269: 13537 lea rdx, [r8 + 4*r10] 13538 cmp rdx, rcx 13539 jbe .LBB2_456 13540 .LBB2_22: 13541 xor esi, esi 13542 .LBB2_633: 13543 mov r9, rsi 13544 not r9 13545 add r9, r10 13546 mov rdi, r10 13547 and rdi, 3 13548 je .LBB2_635 13549 .LBB2_634: # =>This Inner Loop Header: Depth=1 13550 mov edx, dword ptr [rcx + 4*rsi] 13551 imul 
edx, eax 13552 mov dword ptr [r8 + 4*rsi], edx 13553 add rsi, 1 13554 add rdi, -1 13555 jne .LBB2_634 13556 .LBB2_635: 13557 cmp r9, 3 13558 jb .LBB2_1069 13559 .LBB2_636: # =>This Inner Loop Header: Depth=1 13560 mov edx, dword ptr [rcx + 4*rsi] 13561 imul edx, eax 13562 mov dword ptr [r8 + 4*rsi], edx 13563 mov edx, dword ptr [rcx + 4*rsi + 4] 13564 imul edx, eax 13565 mov dword ptr [r8 + 4*rsi + 4], edx 13566 mov edx, dword ptr [rcx + 4*rsi + 8] 13567 imul edx, eax 13568 mov dword ptr [r8 + 4*rsi + 8], edx 13569 mov edx, dword ptr [rcx + 4*rsi + 12] 13570 imul edx, eax 13571 mov dword ptr [r8 + 4*rsi + 12], edx 13572 add rsi, 4 13573 cmp r10, rsi 13574 jne .LBB2_636 13575 jmp .LBB2_1069 13576 .LBB2_23: 13577 cmp edi, 6 13578 jg .LBB2_69 13579 # %bb.24: 13580 cmp edi, 3 13581 jle .LBB2_107 13582 # %bb.25: 13583 cmp edi, 4 13584 je .LBB2_169 13585 # %bb.26: 13586 cmp edi, 5 13587 je .LBB2_172 13588 # %bb.27: 13589 cmp edi, 6 13590 jne .LBB2_1069 13591 # %bb.28: 13592 test r9d, r9d 13593 jle .LBB2_1069 13594 # %bb.29: 13595 mov eax, dword ptr [rdx] 13596 mov r10d, r9d 13597 cmp r9d, 8 13598 jb .LBB2_30 13599 # %bb.271: 13600 lea rdx, [rcx + 4*r10] 13601 cmp rdx, r8 13602 jbe .LBB2_459 13603 # %bb.272: 13604 lea rdx, [r8 + 4*r10] 13605 cmp rdx, rcx 13606 jbe .LBB2_459 13607 .LBB2_30: 13608 xor esi, esi 13609 .LBB2_641: 13610 mov r9, rsi 13611 not r9 13612 add r9, r10 13613 mov rdi, r10 13614 and rdi, 3 13615 je .LBB2_643 13616 .LBB2_642: # =>This Inner Loop Header: Depth=1 13617 mov edx, dword ptr [rcx + 4*rsi] 13618 add edx, eax 13619 mov dword ptr [r8 + 4*rsi], edx 13620 add rsi, 1 13621 add rdi, -1 13622 jne .LBB2_642 13623 .LBB2_643: 13624 cmp r9, 3 13625 jb .LBB2_1069 13626 .LBB2_644: # =>This Inner Loop Header: Depth=1 13627 mov edx, dword ptr [rcx + 4*rsi] 13628 add edx, eax 13629 mov dword ptr [r8 + 4*rsi], edx 13630 mov edx, dword ptr [rcx + 4*rsi + 4] 13631 add edx, eax 13632 mov dword ptr [r8 + 4*rsi + 4], edx 13633 mov edx, dword ptr [rcx + 4*rsi + 8] 
13634 add edx, eax 13635 mov dword ptr [r8 + 4*rsi + 8], edx 13636 mov edx, dword ptr [rcx + 4*rsi + 12] 13637 add edx, eax 13638 mov dword ptr [r8 + 4*rsi + 12], edx 13639 add rsi, 4 13640 cmp r10, rsi 13641 jne .LBB2_644 13642 jmp .LBB2_1069 13643 .LBB2_31: 13644 cmp edi, 6 13645 jg .LBB2_76 13646 # %bb.32: 13647 cmp edi, 3 13648 jle .LBB2_112 13649 # %bb.33: 13650 cmp edi, 4 13651 je .LBB2_175 13652 # %bb.34: 13653 cmp edi, 5 13654 je .LBB2_178 13655 # %bb.35: 13656 cmp edi, 6 13657 jne .LBB2_1069 13658 # %bb.36: 13659 test r9d, r9d 13660 jle .LBB2_1069 13661 # %bb.37: 13662 mov r11d, dword ptr [rdx] 13663 mov r10d, r9d 13664 cmp r9d, 8 13665 jb .LBB2_38 13666 # %bb.274: 13667 lea rdx, [rcx + 4*r10] 13668 cmp rdx, r8 13669 jbe .LBB2_462 13670 # %bb.275: 13671 lea rdx, [r8 + 4*r10] 13672 cmp rdx, rcx 13673 jbe .LBB2_462 13674 .LBB2_38: 13675 xor esi, esi 13676 .LBB2_649: 13677 mov rdx, rsi 13678 not rdx 13679 add rdx, r10 13680 mov rdi, r10 13681 and rdi, 3 13682 je .LBB2_651 13683 .LBB2_650: # =>This Inner Loop Header: Depth=1 13684 mov eax, r11d 13685 sub eax, dword ptr [rcx + 4*rsi] 13686 mov dword ptr [r8 + 4*rsi], eax 13687 add rsi, 1 13688 add rdi, -1 13689 jne .LBB2_650 13690 .LBB2_651: 13691 cmp rdx, 3 13692 jb .LBB2_1069 13693 .LBB2_652: # =>This Inner Loop Header: Depth=1 13694 mov eax, r11d 13695 sub eax, dword ptr [rcx + 4*rsi] 13696 mov dword ptr [r8 + 4*rsi], eax 13697 mov eax, r11d 13698 sub eax, dword ptr [rcx + 4*rsi + 4] 13699 mov dword ptr [r8 + 4*rsi + 4], eax 13700 mov eax, r11d 13701 sub eax, dword ptr [rcx + 4*rsi + 8] 13702 mov dword ptr [r8 + 4*rsi + 8], eax 13703 mov eax, r11d 13704 sub eax, dword ptr [rcx + 4*rsi + 12] 13705 mov dword ptr [r8 + 4*rsi + 12], eax 13706 add rsi, 4 13707 cmp r10, rsi 13708 jne .LBB2_652 13709 jmp .LBB2_1069 13710 .LBB2_39: 13711 cmp edi, 6 13712 jg .LBB2_83 13713 # %bb.40: 13714 cmp edi, 3 13715 jle .LBB2_117 13716 # %bb.41: 13717 cmp edi, 4 13718 je .LBB2_181 13719 # %bb.42: 13720 cmp edi, 5 13721 je 
.LBB2_184 13722 # %bb.43: 13723 cmp edi, 6 13724 jne .LBB2_1069 13725 # %bb.44: 13726 test r9d, r9d 13727 jle .LBB2_1069 13728 # %bb.45: 13729 mov eax, dword ptr [rdx] 13730 mov r10d, r9d 13731 cmp r9d, 8 13732 jb .LBB2_46 13733 # %bb.277: 13734 lea rdx, [rcx + 4*r10] 13735 cmp rdx, r8 13736 jbe .LBB2_465 13737 # %bb.278: 13738 lea rdx, [r8 + 4*r10] 13739 cmp rdx, rcx 13740 jbe .LBB2_465 13741 .LBB2_46: 13742 xor esi, esi 13743 .LBB2_657: 13744 mov r9, rsi 13745 not r9 13746 add r9, r10 13747 mov rdi, r10 13748 and rdi, 3 13749 je .LBB2_659 13750 .LBB2_658: # =>This Inner Loop Header: Depth=1 13751 mov edx, dword ptr [rcx + 4*rsi] 13752 add edx, eax 13753 mov dword ptr [r8 + 4*rsi], edx 13754 add rsi, 1 13755 add rdi, -1 13756 jne .LBB2_658 13757 .LBB2_659: 13758 cmp r9, 3 13759 jb .LBB2_1069 13760 .LBB2_660: # =>This Inner Loop Header: Depth=1 13761 mov edx, dword ptr [rcx + 4*rsi] 13762 add edx, eax 13763 mov dword ptr [r8 + 4*rsi], edx 13764 mov edx, dword ptr [rcx + 4*rsi + 4] 13765 add edx, eax 13766 mov dword ptr [r8 + 4*rsi + 4], edx 13767 mov edx, dword ptr [rcx + 4*rsi + 8] 13768 add edx, eax 13769 mov dword ptr [r8 + 4*rsi + 8], edx 13770 mov edx, dword ptr [rcx + 4*rsi + 12] 13771 add edx, eax 13772 mov dword ptr [r8 + 4*rsi + 12], edx 13773 add rsi, 4 13774 cmp r10, rsi 13775 jne .LBB2_660 13776 jmp .LBB2_1069 13777 .LBB2_47: 13778 cmp edi, 6 13779 jg .LBB2_90 13780 # %bb.48: 13781 cmp edi, 3 13782 jle .LBB2_122 13783 # %bb.49: 13784 cmp edi, 4 13785 je .LBB2_187 13786 # %bb.50: 13787 cmp edi, 5 13788 je .LBB2_190 13789 # %bb.51: 13790 cmp edi, 6 13791 jne .LBB2_1069 13792 # %bb.52: 13793 test r9d, r9d 13794 jle .LBB2_1069 13795 # %bb.53: 13796 mov r11d, dword ptr [rdx] 13797 mov r10d, r9d 13798 cmp r9d, 8 13799 jb .LBB2_54 13800 # %bb.280: 13801 lea rdx, [rcx + 4*r10] 13802 cmp rdx, r8 13803 jbe .LBB2_468 13804 # %bb.281: 13805 lea rdx, [r8 + 4*r10] 13806 cmp rdx, rcx 13807 jbe .LBB2_468 13808 .LBB2_54: 13809 xor esi, esi 13810 .LBB2_665: 13811 mov 
rdx, rsi 13812 not rdx 13813 add rdx, r10 13814 mov rdi, r10 13815 and rdi, 3 13816 je .LBB2_667 13817 .LBB2_666: # =>This Inner Loop Header: Depth=1 13818 mov eax, r11d 13819 sub eax, dword ptr [rcx + 4*rsi] 13820 mov dword ptr [r8 + 4*rsi], eax 13821 add rsi, 1 13822 add rdi, -1 13823 jne .LBB2_666 13824 .LBB2_667: 13825 cmp rdx, 3 13826 jb .LBB2_1069 13827 .LBB2_668: # =>This Inner Loop Header: Depth=1 13828 mov eax, r11d 13829 sub eax, dword ptr [rcx + 4*rsi] 13830 mov dword ptr [r8 + 4*rsi], eax 13831 mov eax, r11d 13832 sub eax, dword ptr [rcx + 4*rsi + 4] 13833 mov dword ptr [r8 + 4*rsi + 4], eax 13834 mov eax, r11d 13835 sub eax, dword ptr [rcx + 4*rsi + 8] 13836 mov dword ptr [r8 + 4*rsi + 8], eax 13837 mov eax, r11d 13838 sub eax, dword ptr [rcx + 4*rsi + 12] 13839 mov dword ptr [r8 + 4*rsi + 12], eax 13840 add rsi, 4 13841 cmp r10, rsi 13842 jne .LBB2_668 13843 jmp .LBB2_1069 13844 .LBB2_55: 13845 cmp edi, 8 13846 jle .LBB2_127 13847 # %bb.56: 13848 cmp edi, 9 13849 je .LBB2_193 13850 # %bb.57: 13851 cmp edi, 11 13852 je .LBB2_196 13853 # %bb.58: 13854 cmp edi, 12 13855 jne .LBB2_1069 13856 # %bb.59: 13857 test r9d, r9d 13858 jle .LBB2_1069 13859 # %bb.60: 13860 movsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero 13861 mov eax, r9d 13862 cmp r9d, 4 13863 jb .LBB2_61 13864 # %bb.283: 13865 lea rdx, [rcx + 8*rax] 13866 cmp rdx, r8 13867 jbe .LBB2_471 13868 # %bb.284: 13869 lea rdx, [r8 + 8*rax] 13870 cmp rdx, rcx 13871 jbe .LBB2_471 13872 .LBB2_61: 13873 xor edx, edx 13874 .LBB2_673: 13875 mov rsi, rdx 13876 not rsi 13877 add rsi, rax 13878 mov rdi, rax 13879 and rdi, 3 13880 je .LBB2_675 13881 .LBB2_674: # =>This Inner Loop Header: Depth=1 13882 movsd xmm1, qword ptr [rcx + 8*rdx] # xmm1 = mem[0],zero 13883 mulsd xmm1, xmm0 13884 movsd qword ptr [r8 + 8*rdx], xmm1 13885 add rdx, 1 13886 add rdi, -1 13887 jne .LBB2_674 13888 .LBB2_675: 13889 cmp rsi, 3 13890 jb .LBB2_1069 13891 .LBB2_676: # =>This Inner Loop Header: Depth=1 13892 movsd xmm1, qword ptr [rcx + 
8*rdx] # xmm1 = mem[0],zero 13893 mulsd xmm1, xmm0 13894 movsd qword ptr [r8 + 8*rdx], xmm1 13895 movsd xmm1, qword ptr [rcx + 8*rdx + 8] # xmm1 = mem[0],zero 13896 mulsd xmm1, xmm0 13897 movsd qword ptr [r8 + 8*rdx + 8], xmm1 13898 movsd xmm1, qword ptr [rcx + 8*rdx + 16] # xmm1 = mem[0],zero 13899 mulsd xmm1, xmm0 13900 movsd qword ptr [r8 + 8*rdx + 16], xmm1 13901 movsd xmm1, qword ptr [rcx + 8*rdx + 24] # xmm1 = mem[0],zero 13902 mulsd xmm1, xmm0 13903 movsd qword ptr [r8 + 8*rdx + 24], xmm1 13904 add rdx, 4 13905 cmp rax, rdx 13906 jne .LBB2_676 13907 jmp .LBB2_1069 13908 .LBB2_62: 13909 cmp edi, 8 13910 jle .LBB2_132 13911 # %bb.63: 13912 cmp edi, 9 13913 je .LBB2_199 13914 # %bb.64: 13915 cmp edi, 11 13916 je .LBB2_202 13917 # %bb.65: 13918 cmp edi, 12 13919 jne .LBB2_1069 13920 # %bb.66: 13921 test r9d, r9d 13922 jle .LBB2_1069 13923 # %bb.67: 13924 movsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero 13925 mov eax, r9d 13926 cmp r9d, 4 13927 jb .LBB2_68 13928 # %bb.286: 13929 lea rdx, [rcx + 8*rax] 13930 cmp rdx, r8 13931 jbe .LBB2_474 13932 # %bb.287: 13933 lea rdx, [r8 + 8*rax] 13934 cmp rdx, rcx 13935 jbe .LBB2_474 13936 .LBB2_68: 13937 xor edx, edx 13938 .LBB2_681: 13939 mov rsi, rdx 13940 not rsi 13941 add rsi, rax 13942 mov rdi, rax 13943 and rdi, 3 13944 je .LBB2_683 13945 .LBB2_682: # =>This Inner Loop Header: Depth=1 13946 movsd xmm1, qword ptr [rcx + 8*rdx] # xmm1 = mem[0],zero 13947 mulsd xmm1, xmm0 13948 movsd qword ptr [r8 + 8*rdx], xmm1 13949 add rdx, 1 13950 add rdi, -1 13951 jne .LBB2_682 13952 .LBB2_683: 13953 cmp rsi, 3 13954 jb .LBB2_1069 13955 .LBB2_684: # =>This Inner Loop Header: Depth=1 13956 movsd xmm1, qword ptr [rcx + 8*rdx] # xmm1 = mem[0],zero 13957 mulsd xmm1, xmm0 13958 movsd qword ptr [r8 + 8*rdx], xmm1 13959 movsd xmm1, qword ptr [rcx + 8*rdx + 8] # xmm1 = mem[0],zero 13960 mulsd xmm1, xmm0 13961 movsd qword ptr [r8 + 8*rdx + 8], xmm1 13962 movsd xmm1, qword ptr [rcx + 8*rdx + 16] # xmm1 = mem[0],zero 13963 mulsd xmm1, xmm0 
13964 movsd qword ptr [r8 + 8*rdx + 16], xmm1 13965 movsd xmm1, qword ptr [rcx + 8*rdx + 24] # xmm1 = mem[0],zero 13966 mulsd xmm1, xmm0 13967 movsd qword ptr [r8 + 8*rdx + 24], xmm1 13968 add rdx, 4 13969 cmp rax, rdx 13970 jne .LBB2_684 13971 jmp .LBB2_1069 13972 .LBB2_69: 13973 cmp edi, 8 13974 jle .LBB2_137 13975 # %bb.70: 13976 cmp edi, 9 13977 je .LBB2_205 13978 # %bb.71: 13979 cmp edi, 11 13980 je .LBB2_208 13981 # %bb.72: 13982 cmp edi, 12 13983 jne .LBB2_1069 13984 # %bb.73: 13985 test r9d, r9d 13986 jle .LBB2_1069 13987 # %bb.74: 13988 movsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero 13989 mov eax, r9d 13990 cmp r9d, 4 13991 jb .LBB2_75 13992 # %bb.289: 13993 lea rdx, [rcx + 8*rax] 13994 cmp rdx, r8 13995 jbe .LBB2_477 13996 # %bb.290: 13997 lea rdx, [r8 + 8*rax] 13998 cmp rdx, rcx 13999 jbe .LBB2_477 14000 .LBB2_75: 14001 xor edx, edx 14002 .LBB2_689: 14003 mov rsi, rdx 14004 not rsi 14005 add rsi, rax 14006 mov rdi, rax 14007 and rdi, 3 14008 je .LBB2_691 14009 .LBB2_690: # =>This Inner Loop Header: Depth=1 14010 movsd xmm1, qword ptr [rcx + 8*rdx] # xmm1 = mem[0],zero 14011 addsd xmm1, xmm0 14012 movsd qword ptr [r8 + 8*rdx], xmm1 14013 add rdx, 1 14014 add rdi, -1 14015 jne .LBB2_690 14016 .LBB2_691: 14017 cmp rsi, 3 14018 jb .LBB2_1069 14019 .LBB2_692: # =>This Inner Loop Header: Depth=1 14020 movsd xmm1, qword ptr [rcx + 8*rdx] # xmm1 = mem[0],zero 14021 addsd xmm1, xmm0 14022 movsd qword ptr [r8 + 8*rdx], xmm1 14023 movsd xmm1, qword ptr [rcx + 8*rdx + 8] # xmm1 = mem[0],zero 14024 addsd xmm1, xmm0 14025 movsd qword ptr [r8 + 8*rdx + 8], xmm1 14026 movsd xmm1, qword ptr [rcx + 8*rdx + 16] # xmm1 = mem[0],zero 14027 addsd xmm1, xmm0 14028 movsd qword ptr [r8 + 8*rdx + 16], xmm1 14029 movsd xmm1, qword ptr [rcx + 8*rdx + 24] # xmm1 = mem[0],zero 14030 addsd xmm1, xmm0 14031 movsd qword ptr [r8 + 8*rdx + 24], xmm1 14032 add rdx, 4 14033 cmp rax, rdx 14034 jne .LBB2_692 14035 jmp .LBB2_1069 14036 .LBB2_76: 14037 cmp edi, 8 14038 jle .LBB2_142 14039 # 
%bb.77: 14040 cmp edi, 9 14041 je .LBB2_211 14042 # %bb.78: 14043 cmp edi, 11 14044 je .LBB2_214 14045 # %bb.79: 14046 cmp edi, 12 14047 jne .LBB2_1069 14048 # %bb.80: 14049 test r9d, r9d 14050 jle .LBB2_1069 14051 # %bb.81: 14052 movsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero 14053 mov eax, r9d 14054 cmp r9d, 4 14055 jb .LBB2_82 14056 # %bb.292: 14057 lea rdx, [rcx + 8*rax] 14058 cmp rdx, r8 14059 jbe .LBB2_480 14060 # %bb.293: 14061 lea rdx, [r8 + 8*rax] 14062 cmp rdx, rcx 14063 jbe .LBB2_480 14064 .LBB2_82: 14065 xor edx, edx 14066 .LBB2_697: 14067 mov rsi, rdx 14068 not rsi 14069 add rsi, rax 14070 mov rdi, rax 14071 and rdi, 3 14072 je .LBB2_699 14073 .LBB2_698: # =>This Inner Loop Header: Depth=1 14074 movapd xmm1, xmm0 14075 subsd xmm1, qword ptr [rcx + 8*rdx] 14076 movsd qword ptr [r8 + 8*rdx], xmm1 14077 add rdx, 1 14078 add rdi, -1 14079 jne .LBB2_698 14080 .LBB2_699: 14081 cmp rsi, 3 14082 jb .LBB2_1069 14083 .LBB2_700: # =>This Inner Loop Header: Depth=1 14084 movapd xmm1, xmm0 14085 subsd xmm1, qword ptr [rcx + 8*rdx] 14086 movsd qword ptr [r8 + 8*rdx], xmm1 14087 movapd xmm1, xmm0 14088 subsd xmm1, qword ptr [rcx + 8*rdx + 8] 14089 movsd qword ptr [r8 + 8*rdx + 8], xmm1 14090 movapd xmm1, xmm0 14091 subsd xmm1, qword ptr [rcx + 8*rdx + 16] 14092 movsd qword ptr [r8 + 8*rdx + 16], xmm1 14093 movapd xmm1, xmm0 14094 subsd xmm1, qword ptr [rcx + 8*rdx + 24] 14095 movsd qword ptr [r8 + 8*rdx + 24], xmm1 14096 add rdx, 4 14097 cmp rax, rdx 14098 jne .LBB2_700 14099 jmp .LBB2_1069 14100 .LBB2_83: 14101 cmp edi, 8 14102 jle .LBB2_147 14103 # %bb.84: 14104 cmp edi, 9 14105 je .LBB2_217 14106 # %bb.85: 14107 cmp edi, 11 14108 je .LBB2_220 14109 # %bb.86: 14110 cmp edi, 12 14111 jne .LBB2_1069 14112 # %bb.87: 14113 test r9d, r9d 14114 jle .LBB2_1069 14115 # %bb.88: 14116 movsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero 14117 mov eax, r9d 14118 cmp r9d, 4 14119 jb .LBB2_89 14120 # %bb.295: 14121 lea rdx, [rcx + 8*rax] 14122 cmp rdx, r8 14123 jbe .LBB2_483 
14124 # %bb.296: 14125 lea rdx, [r8 + 8*rax] 14126 cmp rdx, rcx 14127 jbe .LBB2_483 14128 .LBB2_89: 14129 xor edx, edx 14130 .LBB2_705: 14131 mov rsi, rdx 14132 not rsi 14133 add rsi, rax 14134 mov rdi, rax 14135 and rdi, 3 14136 je .LBB2_707 14137 .LBB2_706: # =>This Inner Loop Header: Depth=1 14138 movsd xmm1, qword ptr [rcx + 8*rdx] # xmm1 = mem[0],zero 14139 addsd xmm1, xmm0 14140 movsd qword ptr [r8 + 8*rdx], xmm1 14141 add rdx, 1 14142 add rdi, -1 14143 jne .LBB2_706 14144 .LBB2_707: 14145 cmp rsi, 3 14146 jb .LBB2_1069 14147 .LBB2_708: # =>This Inner Loop Header: Depth=1 14148 movsd xmm1, qword ptr [rcx + 8*rdx] # xmm1 = mem[0],zero 14149 addsd xmm1, xmm0 14150 movsd qword ptr [r8 + 8*rdx], xmm1 14151 movsd xmm1, qword ptr [rcx + 8*rdx + 8] # xmm1 = mem[0],zero 14152 addsd xmm1, xmm0 14153 movsd qword ptr [r8 + 8*rdx + 8], xmm1 14154 movsd xmm1, qword ptr [rcx + 8*rdx + 16] # xmm1 = mem[0],zero 14155 addsd xmm1, xmm0 14156 movsd qword ptr [r8 + 8*rdx + 16], xmm1 14157 movsd xmm1, qword ptr [rcx + 8*rdx + 24] # xmm1 = mem[0],zero 14158 addsd xmm1, xmm0 14159 movsd qword ptr [r8 + 8*rdx + 24], xmm1 14160 add rdx, 4 14161 cmp rax, rdx 14162 jne .LBB2_708 14163 jmp .LBB2_1069 14164 .LBB2_90: 14165 cmp edi, 8 14166 jle .LBB2_152 14167 # %bb.91: 14168 cmp edi, 9 14169 je .LBB2_223 14170 # %bb.92: 14171 cmp edi, 11 14172 je .LBB2_226 14173 # %bb.93: 14174 cmp edi, 12 14175 jne .LBB2_1069 14176 # %bb.94: 14177 test r9d, r9d 14178 jle .LBB2_1069 14179 # %bb.95: 14180 movsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero 14181 mov eax, r9d 14182 cmp r9d, 4 14183 jb .LBB2_96 14184 # %bb.298: 14185 lea rdx, [rcx + 8*rax] 14186 cmp rdx, r8 14187 jbe .LBB2_486 14188 # %bb.299: 14189 lea rdx, [r8 + 8*rax] 14190 cmp rdx, rcx 14191 jbe .LBB2_486 14192 .LBB2_96: 14193 xor edx, edx 14194 .LBB2_713: 14195 mov rsi, rdx 14196 not rsi 14197 add rsi, rax 14198 mov rdi, rax 14199 and rdi, 3 14200 je .LBB2_715 14201 .LBB2_714: # =>This Inner Loop Header: Depth=1 14202 movapd xmm1, xmm0 
14203 subsd xmm1, qword ptr [rcx + 8*rdx] 14204 movsd qword ptr [r8 + 8*rdx], xmm1 14205 add rdx, 1 14206 add rdi, -1 14207 jne .LBB2_714 14208 .LBB2_715: 14209 cmp rsi, 3 14210 jb .LBB2_1069 14211 .LBB2_716: # =>This Inner Loop Header: Depth=1 14212 movapd xmm1, xmm0 14213 subsd xmm1, qword ptr [rcx + 8*rdx] 14214 movsd qword ptr [r8 + 8*rdx], xmm1 14215 movapd xmm1, xmm0 14216 subsd xmm1, qword ptr [rcx + 8*rdx + 8] 14217 movsd qword ptr [r8 + 8*rdx + 8], xmm1 14218 movapd xmm1, xmm0 14219 subsd xmm1, qword ptr [rcx + 8*rdx + 16] 14220 movsd qword ptr [r8 + 8*rdx + 16], xmm1 14221 movapd xmm1, xmm0 14222 subsd xmm1, qword ptr [rcx + 8*rdx + 24] 14223 movsd qword ptr [r8 + 8*rdx + 24], xmm1 14224 add rdx, 4 14225 cmp rax, rdx 14226 jne .LBB2_716 14227 jmp .LBB2_1069 14228 .LBB2_97: 14229 cmp edi, 2 14230 je .LBB2_229 14231 # %bb.98: 14232 cmp edi, 3 14233 jne .LBB2_1069 14234 # %bb.99: 14235 test r9d, r9d 14236 jle .LBB2_1069 14237 # %bb.100: 14238 mov dl, byte ptr [rdx] 14239 mov r10d, r9d 14240 cmp r9d, 32 14241 jb .LBB2_101 14242 # %bb.301: 14243 lea rax, [rcx + r10] 14244 cmp rax, r8 14245 jbe .LBB2_489 14246 # %bb.302: 14247 lea rax, [r8 + r10] 14248 cmp rax, rcx 14249 jbe .LBB2_489 14250 .LBB2_101: 14251 xor edi, edi 14252 .LBB2_721: 14253 mov r9, rdi 14254 not r9 14255 add r9, r10 14256 mov rsi, r10 14257 and rsi, 3 14258 je .LBB2_723 14259 .LBB2_722: # =>This Inner Loop Header: Depth=1 14260 movzx eax, byte ptr [rcx + rdi] 14261 mul dl 14262 mov byte ptr [r8 + rdi], al 14263 add rdi, 1 14264 add rsi, -1 14265 jne .LBB2_722 14266 .LBB2_723: 14267 cmp r9, 3 14268 jb .LBB2_1069 14269 .LBB2_724: # =>This Inner Loop Header: Depth=1 14270 movzx eax, byte ptr [rcx + rdi] 14271 mul dl 14272 mov byte ptr [r8 + rdi], al 14273 movzx eax, byte ptr [rcx + rdi + 1] 14274 mul dl 14275 mov byte ptr [r8 + rdi + 1], al 14276 movzx eax, byte ptr [rcx + rdi + 2] 14277 mul dl 14278 mov byte ptr [r8 + rdi + 2], al 14279 movzx eax, byte ptr [rcx + rdi + 3] 14280 mul dl 14281 mov 
byte ptr [r8 + rdi + 3], al 14282 add rdi, 4 14283 cmp r10, rdi 14284 jne .LBB2_724 14285 jmp .LBB2_1069 14286 .LBB2_102: 14287 cmp edi, 2 14288 je .LBB2_232 14289 # %bb.103: 14290 cmp edi, 3 14291 jne .LBB2_1069 14292 # %bb.104: 14293 test r9d, r9d 14294 jle .LBB2_1069 14295 # %bb.105: 14296 mov dl, byte ptr [rdx] 14297 mov r10d, r9d 14298 cmp r9d, 32 14299 jb .LBB2_106 14300 # %bb.304: 14301 lea rax, [rcx + r10] 14302 cmp rax, r8 14303 jbe .LBB2_492 14304 # %bb.305: 14305 lea rax, [r8 + r10] 14306 cmp rax, rcx 14307 jbe .LBB2_492 14308 .LBB2_106: 14309 xor edi, edi 14310 .LBB2_729: 14311 mov r9, rdi 14312 not r9 14313 add r9, r10 14314 mov rsi, r10 14315 and rsi, 3 14316 je .LBB2_731 14317 .LBB2_730: # =>This Inner Loop Header: Depth=1 14318 movzx eax, byte ptr [rcx + rdi] 14319 mul dl 14320 mov byte ptr [r8 + rdi], al 14321 add rdi, 1 14322 add rsi, -1 14323 jne .LBB2_730 14324 .LBB2_731: 14325 cmp r9, 3 14326 jb .LBB2_1069 14327 .LBB2_732: # =>This Inner Loop Header: Depth=1 14328 movzx eax, byte ptr [rcx + rdi] 14329 mul dl 14330 mov byte ptr [r8 + rdi], al 14331 movzx eax, byte ptr [rcx + rdi + 1] 14332 mul dl 14333 mov byte ptr [r8 + rdi + 1], al 14334 movzx eax, byte ptr [rcx + rdi + 2] 14335 mul dl 14336 mov byte ptr [r8 + rdi + 2], al 14337 movzx eax, byte ptr [rcx + rdi + 3] 14338 mul dl 14339 mov byte ptr [r8 + rdi + 3], al 14340 add rdi, 4 14341 cmp r10, rdi 14342 jne .LBB2_732 14343 jmp .LBB2_1069 14344 .LBB2_107: 14345 cmp edi, 2 14346 je .LBB2_235 14347 # %bb.108: 14348 cmp edi, 3 14349 jne .LBB2_1069 14350 # %bb.109: 14351 test r9d, r9d 14352 jle .LBB2_1069 14353 # %bb.110: 14354 mov al, byte ptr [rdx] 14355 mov r10d, r9d 14356 cmp r9d, 32 14357 jb .LBB2_111 14358 # %bb.307: 14359 lea rdx, [rcx + r10] 14360 cmp rdx, r8 14361 jbe .LBB2_495 14362 # %bb.308: 14363 lea rdx, [r8 + r10] 14364 cmp rdx, rcx 14365 jbe .LBB2_495 14366 .LBB2_111: 14367 xor esi, esi 14368 .LBB2_737: 14369 mov r9, rsi 14370 not r9 14371 add r9, r10 14372 mov rdi, r10 14373 and 
rdi, 3 14374 je .LBB2_739 14375 .LBB2_738: # =>This Inner Loop Header: Depth=1 14376 movzx edx, byte ptr [rcx + rsi] 14377 add dl, al 14378 mov byte ptr [r8 + rsi], dl 14379 add rsi, 1 14380 add rdi, -1 14381 jne .LBB2_738 14382 .LBB2_739: 14383 cmp r9, 3 14384 jb .LBB2_1069 14385 .LBB2_740: # =>This Inner Loop Header: Depth=1 14386 movzx edx, byte ptr [rcx + rsi] 14387 add dl, al 14388 mov byte ptr [r8 + rsi], dl 14389 movzx edx, byte ptr [rcx + rsi + 1] 14390 add dl, al 14391 mov byte ptr [r8 + rsi + 1], dl 14392 movzx edx, byte ptr [rcx + rsi + 2] 14393 add dl, al 14394 mov byte ptr [r8 + rsi + 2], dl 14395 movzx edx, byte ptr [rcx + rsi + 3] 14396 add dl, al 14397 mov byte ptr [r8 + rsi + 3], dl 14398 add rsi, 4 14399 cmp r10, rsi 14400 jne .LBB2_740 14401 jmp .LBB2_1069 14402 .LBB2_112: 14403 cmp edi, 2 14404 je .LBB2_238 14405 # %bb.113: 14406 cmp edi, 3 14407 jne .LBB2_1069 14408 # %bb.114: 14409 test r9d, r9d 14410 jle .LBB2_1069 14411 # %bb.115: 14412 mov r11b, byte ptr [rdx] 14413 mov r10d, r9d 14414 cmp r9d, 32 14415 jb .LBB2_116 14416 # %bb.310: 14417 lea rdx, [rcx + r10] 14418 cmp rdx, r8 14419 jbe .LBB2_498 14420 # %bb.311: 14421 lea rdx, [r8 + r10] 14422 cmp rdx, rcx 14423 jbe .LBB2_498 14424 .LBB2_116: 14425 xor esi, esi 14426 .LBB2_745: 14427 mov rdx, rsi 14428 not rdx 14429 add rdx, r10 14430 mov rdi, r10 14431 and rdi, 3 14432 je .LBB2_747 14433 .LBB2_746: # =>This Inner Loop Header: Depth=1 14434 mov eax, r11d 14435 sub al, byte ptr [rcx + rsi] 14436 mov byte ptr [r8 + rsi], al 14437 add rsi, 1 14438 add rdi, -1 14439 jne .LBB2_746 14440 .LBB2_747: 14441 cmp rdx, 3 14442 jb .LBB2_1069 14443 .LBB2_748: # =>This Inner Loop Header: Depth=1 14444 mov eax, r11d 14445 sub al, byte ptr [rcx + rsi] 14446 mov byte ptr [r8 + rsi], al 14447 mov eax, r11d 14448 sub al, byte ptr [rcx + rsi + 1] 14449 mov byte ptr [r8 + rsi + 1], al 14450 mov eax, r11d 14451 sub al, byte ptr [rcx + rsi + 2] 14452 mov byte ptr [r8 + rsi + 2], al 14453 mov eax, r11d 14454 sub 
al, byte ptr [rcx + rsi + 3] 14455 mov byte ptr [r8 + rsi + 3], al 14456 add rsi, 4 14457 cmp r10, rsi 14458 jne .LBB2_748 14459 jmp .LBB2_1069 14460 .LBB2_117: 14461 cmp edi, 2 14462 je .LBB2_241 14463 # %bb.118: 14464 cmp edi, 3 14465 jne .LBB2_1069 14466 # %bb.119: 14467 test r9d, r9d 14468 jle .LBB2_1069 14469 # %bb.120: 14470 mov al, byte ptr [rdx] 14471 mov r10d, r9d 14472 cmp r9d, 32 14473 jb .LBB2_121 14474 # %bb.313: 14475 lea rdx, [rcx + r10] 14476 cmp rdx, r8 14477 jbe .LBB2_501 14478 # %bb.314: 14479 lea rdx, [r8 + r10] 14480 cmp rdx, rcx 14481 jbe .LBB2_501 14482 .LBB2_121: 14483 xor esi, esi 14484 .LBB2_753: 14485 mov r9, rsi 14486 not r9 14487 add r9, r10 14488 mov rdi, r10 14489 and rdi, 3 14490 je .LBB2_755 14491 .LBB2_754: # =>This Inner Loop Header: Depth=1 14492 movzx edx, byte ptr [rcx + rsi] 14493 add dl, al 14494 mov byte ptr [r8 + rsi], dl 14495 add rsi, 1 14496 add rdi, -1 14497 jne .LBB2_754 14498 .LBB2_755: 14499 cmp r9, 3 14500 jb .LBB2_1069 14501 .LBB2_756: # =>This Inner Loop Header: Depth=1 14502 movzx edx, byte ptr [rcx + rsi] 14503 add dl, al 14504 mov byte ptr [r8 + rsi], dl 14505 movzx edx, byte ptr [rcx + rsi + 1] 14506 add dl, al 14507 mov byte ptr [r8 + rsi + 1], dl 14508 movzx edx, byte ptr [rcx + rsi + 2] 14509 add dl, al 14510 mov byte ptr [r8 + rsi + 2], dl 14511 movzx edx, byte ptr [rcx + rsi + 3] 14512 add dl, al 14513 mov byte ptr [r8 + rsi + 3], dl 14514 add rsi, 4 14515 cmp r10, rsi 14516 jne .LBB2_756 14517 jmp .LBB2_1069 14518 .LBB2_122: 14519 cmp edi, 2 14520 je .LBB2_244 14521 # %bb.123: 14522 cmp edi, 3 14523 jne .LBB2_1069 14524 # %bb.124: 14525 test r9d, r9d 14526 jle .LBB2_1069 14527 # %bb.125: 14528 mov r11b, byte ptr [rdx] 14529 mov r10d, r9d 14530 cmp r9d, 32 14531 jb .LBB2_126 14532 # %bb.316: 14533 lea rdx, [rcx + r10] 14534 cmp rdx, r8 14535 jbe .LBB2_504 14536 # %bb.317: 14537 lea rdx, [r8 + r10] 14538 cmp rdx, rcx 14539 jbe .LBB2_504 14540 .LBB2_126: 14541 xor esi, esi 14542 .LBB2_761: 14543 mov rdx, 
rsi 14544 not rdx 14545 add rdx, r10 14546 mov rdi, r10 14547 and rdi, 3 14548 je .LBB2_763 14549 .LBB2_762: # =>This Inner Loop Header: Depth=1 14550 mov eax, r11d 14551 sub al, byte ptr [rcx + rsi] 14552 mov byte ptr [r8 + rsi], al 14553 add rsi, 1 14554 add rdi, -1 14555 jne .LBB2_762 14556 .LBB2_763: 14557 cmp rdx, 3 14558 jb .LBB2_1069 14559 .LBB2_764: # =>This Inner Loop Header: Depth=1 14560 mov eax, r11d 14561 sub al, byte ptr [rcx + rsi] 14562 mov byte ptr [r8 + rsi], al 14563 mov eax, r11d 14564 sub al, byte ptr [rcx + rsi + 1] 14565 mov byte ptr [r8 + rsi + 1], al 14566 mov eax, r11d 14567 sub al, byte ptr [rcx + rsi + 2] 14568 mov byte ptr [r8 + rsi + 2], al 14569 mov eax, r11d 14570 sub al, byte ptr [rcx + rsi + 3] 14571 mov byte ptr [r8 + rsi + 3], al 14572 add rsi, 4 14573 cmp r10, rsi 14574 jne .LBB2_764 14575 jmp .LBB2_1069 14576 .LBB2_127: 14577 cmp edi, 7 14578 je .LBB2_247 14579 # %bb.128: 14580 cmp edi, 8 14581 jne .LBB2_1069 14582 # %bb.129: 14583 test r9d, r9d 14584 jle .LBB2_1069 14585 # %bb.130: 14586 mov rax, qword ptr [rdx] 14587 mov esi, r9d 14588 lea rdi, [rsi - 1] 14589 mov r9d, esi 14590 and r9d, 3 14591 cmp rdi, 3 14592 jae .LBB2_319 14593 # %bb.131: 14594 xor edi, edi 14595 jmp .LBB2_321 14596 .LBB2_132: 14597 cmp edi, 7 14598 je .LBB2_250 14599 # %bb.133: 14600 cmp edi, 8 14601 jne .LBB2_1069 14602 # %bb.134: 14603 test r9d, r9d 14604 jle .LBB2_1069 14605 # %bb.135: 14606 mov rax, qword ptr [rdx] 14607 mov esi, r9d 14608 lea rdi, [rsi - 1] 14609 mov r9d, esi 14610 and r9d, 3 14611 cmp rdi, 3 14612 jae .LBB2_324 14613 # %bb.136: 14614 xor edi, edi 14615 jmp .LBB2_326 14616 .LBB2_137: 14617 cmp edi, 7 14618 je .LBB2_253 14619 # %bb.138: 14620 cmp edi, 8 14621 jne .LBB2_1069 14622 # %bb.139: 14623 test r9d, r9d 14624 jle .LBB2_1069 14625 # %bb.140: 14626 mov rax, qword ptr [rdx] 14627 mov r10d, r9d 14628 cmp r9d, 4 14629 jb .LBB2_141 14630 # %bb.329: 14631 lea rdx, [rcx + 8*r10] 14632 cmp rdx, r8 14633 jbe .LBB2_507 14634 # %bb.330: 
14635 lea rdx, [r8 + 8*r10] 14636 cmp rdx, rcx 14637 jbe .LBB2_507 14638 .LBB2_141: 14639 xor esi, esi 14640 .LBB2_769: 14641 mov r9, rsi 14642 not r9 14643 add r9, r10 14644 mov rdi, r10 14645 and rdi, 3 14646 je .LBB2_771 14647 .LBB2_770: # =>This Inner Loop Header: Depth=1 14648 mov rdx, qword ptr [rcx + 8*rsi] 14649 add rdx, rax 14650 mov qword ptr [r8 + 8*rsi], rdx 14651 add rsi, 1 14652 add rdi, -1 14653 jne .LBB2_770 14654 .LBB2_771: 14655 cmp r9, 3 14656 jb .LBB2_1069 14657 .LBB2_772: # =>This Inner Loop Header: Depth=1 14658 mov rdx, qword ptr [rcx + 8*rsi] 14659 add rdx, rax 14660 mov qword ptr [r8 + 8*rsi], rdx 14661 mov rdx, qword ptr [rcx + 8*rsi + 8] 14662 add rdx, rax 14663 mov qword ptr [r8 + 8*rsi + 8], rdx 14664 mov rdx, qword ptr [rcx + 8*rsi + 16] 14665 add rdx, rax 14666 mov qword ptr [r8 + 8*rsi + 16], rdx 14667 mov rdx, qword ptr [rcx + 8*rsi + 24] 14668 add rdx, rax 14669 mov qword ptr [r8 + 8*rsi + 24], rdx 14670 add rsi, 4 14671 cmp r10, rsi 14672 jne .LBB2_772 14673 jmp .LBB2_1069 14674 .LBB2_142: 14675 cmp edi, 7 14676 je .LBB2_256 14677 # %bb.143: 14678 cmp edi, 8 14679 jne .LBB2_1069 14680 # %bb.144: 14681 test r9d, r9d 14682 jle .LBB2_1069 14683 # %bb.145: 14684 mov r11, qword ptr [rdx] 14685 mov r10d, r9d 14686 cmp r9d, 4 14687 jb .LBB2_146 14688 # %bb.332: 14689 lea rdx, [rcx + 8*r10] 14690 cmp rdx, r8 14691 jbe .LBB2_510 14692 # %bb.333: 14693 lea rdx, [r8 + 8*r10] 14694 cmp rdx, rcx 14695 jbe .LBB2_510 14696 .LBB2_146: 14697 xor esi, esi 14698 .LBB2_777: 14699 mov rdx, rsi 14700 not rdx 14701 add rdx, r10 14702 mov rdi, r10 14703 and rdi, 3 14704 je .LBB2_779 14705 .LBB2_778: # =>This Inner Loop Header: Depth=1 14706 mov rax, r11 14707 sub rax, qword ptr [rcx + 8*rsi] 14708 mov qword ptr [r8 + 8*rsi], rax 14709 add rsi, 1 14710 add rdi, -1 14711 jne .LBB2_778 14712 .LBB2_779: 14713 cmp rdx, 3 14714 jb .LBB2_1069 14715 .LBB2_780: # =>This Inner Loop Header: Depth=1 14716 mov rax, r11 14717 sub rax, qword ptr [rcx + 8*rsi] 14718 mov 
qword ptr [r8 + 8*rsi], rax 14719 mov rax, r11 14720 sub rax, qword ptr [rcx + 8*rsi + 8] 14721 mov qword ptr [r8 + 8*rsi + 8], rax 14722 mov rax, r11 14723 sub rax, qword ptr [rcx + 8*rsi + 16] 14724 mov qword ptr [r8 + 8*rsi + 16], rax 14725 mov rax, r11 14726 sub rax, qword ptr [rcx + 8*rsi + 24] 14727 mov qword ptr [r8 + 8*rsi + 24], rax 14728 add rsi, 4 14729 cmp r10, rsi 14730 jne .LBB2_780 14731 jmp .LBB2_1069 14732 .LBB2_147: 14733 cmp edi, 7 14734 je .LBB2_259 14735 # %bb.148: 14736 cmp edi, 8 14737 jne .LBB2_1069 14738 # %bb.149: 14739 test r9d, r9d 14740 jle .LBB2_1069 14741 # %bb.150: 14742 mov rax, qword ptr [rdx] 14743 mov r10d, r9d 14744 cmp r9d, 4 14745 jb .LBB2_151 14746 # %bb.335: 14747 lea rdx, [rcx + 8*r10] 14748 cmp rdx, r8 14749 jbe .LBB2_513 14750 # %bb.336: 14751 lea rdx, [r8 + 8*r10] 14752 cmp rdx, rcx 14753 jbe .LBB2_513 14754 .LBB2_151: 14755 xor esi, esi 14756 .LBB2_785: 14757 mov r9, rsi 14758 not r9 14759 add r9, r10 14760 mov rdi, r10 14761 and rdi, 3 14762 je .LBB2_787 14763 .LBB2_786: # =>This Inner Loop Header: Depth=1 14764 mov rdx, qword ptr [rcx + 8*rsi] 14765 add rdx, rax 14766 mov qword ptr [r8 + 8*rsi], rdx 14767 add rsi, 1 14768 add rdi, -1 14769 jne .LBB2_786 14770 .LBB2_787: 14771 cmp r9, 3 14772 jb .LBB2_1069 14773 .LBB2_788: # =>This Inner Loop Header: Depth=1 14774 mov rdx, qword ptr [rcx + 8*rsi] 14775 add rdx, rax 14776 mov qword ptr [r8 + 8*rsi], rdx 14777 mov rdx, qword ptr [rcx + 8*rsi + 8] 14778 add rdx, rax 14779 mov qword ptr [r8 + 8*rsi + 8], rdx 14780 mov rdx, qword ptr [rcx + 8*rsi + 16] 14781 add rdx, rax 14782 mov qword ptr [r8 + 8*rsi + 16], rdx 14783 mov rdx, qword ptr [rcx + 8*rsi + 24] 14784 add rdx, rax 14785 mov qword ptr [r8 + 8*rsi + 24], rdx 14786 add rsi, 4 14787 cmp r10, rsi 14788 jne .LBB2_788 14789 jmp .LBB2_1069 14790 .LBB2_152: 14791 cmp edi, 7 14792 je .LBB2_262 14793 # %bb.153: 14794 cmp edi, 8 14795 jne .LBB2_1069 14796 # %bb.154: 14797 test r9d, r9d 14798 jle .LBB2_1069 14799 # %bb.155: 
14800 mov r11, qword ptr [rdx] 14801 mov r10d, r9d 14802 cmp r9d, 4 14803 jb .LBB2_156 14804 # %bb.338: 14805 lea rdx, [rcx + 8*r10] 14806 cmp rdx, r8 14807 jbe .LBB2_516 14808 # %bb.339: 14809 lea rdx, [r8 + 8*r10] 14810 cmp rdx, rcx 14811 jbe .LBB2_516 14812 .LBB2_156: 14813 xor esi, esi 14814 .LBB2_793: 14815 mov rdx, rsi 14816 not rdx 14817 add rdx, r10 14818 mov rdi, r10 14819 and rdi, 3 14820 je .LBB2_795 14821 .LBB2_794: # =>This Inner Loop Header: Depth=1 14822 mov rax, r11 14823 sub rax, qword ptr [rcx + 8*rsi] 14824 mov qword ptr [r8 + 8*rsi], rax 14825 add rsi, 1 14826 add rdi, -1 14827 jne .LBB2_794 14828 .LBB2_795: 14829 cmp rdx, 3 14830 jb .LBB2_1069 14831 .LBB2_796: # =>This Inner Loop Header: Depth=1 14832 mov rax, r11 14833 sub rax, qword ptr [rcx + 8*rsi] 14834 mov qword ptr [r8 + 8*rsi], rax 14835 mov rax, r11 14836 sub rax, qword ptr [rcx + 8*rsi + 8] 14837 mov qword ptr [r8 + 8*rsi + 8], rax 14838 mov rax, r11 14839 sub rax, qword ptr [rcx + 8*rsi + 16] 14840 mov qword ptr [r8 + 8*rsi + 16], rax 14841 mov rax, r11 14842 sub rax, qword ptr [rcx + 8*rsi + 24] 14843 mov qword ptr [r8 + 8*rsi + 24], rax 14844 add rsi, 4 14845 cmp r10, rsi 14846 jne .LBB2_796 14847 jmp .LBB2_1069 14848 .LBB2_157: 14849 test r9d, r9d 14850 jle .LBB2_1069 14851 # %bb.158: 14852 movzx eax, word ptr [rdx] 14853 mov r10d, r9d 14854 cmp r9d, 16 14855 jb .LBB2_159 14856 # %bb.341: 14857 lea rdx, [rcx + 2*r10] 14858 cmp rdx, r8 14859 jbe .LBB2_519 14860 # %bb.342: 14861 lea rdx, [r8 + 2*r10] 14862 cmp rdx, rcx 14863 jbe .LBB2_519 14864 .LBB2_159: 14865 xor esi, esi 14866 .LBB2_801: 14867 mov r9, rsi 14868 not r9 14869 add r9, r10 14870 mov rdi, r10 14871 and rdi, 3 14872 je .LBB2_803 14873 .LBB2_802: # =>This Inner Loop Header: Depth=1 14874 movzx edx, word ptr [rcx + 2*rsi] 14875 imul dx, ax 14876 mov word ptr [r8 + 2*rsi], dx 14877 add rsi, 1 14878 add rdi, -1 14879 jne .LBB2_802 14880 .LBB2_803: 14881 cmp r9, 3 14882 jb .LBB2_1069 14883 .LBB2_804: # =>This Inner Loop 
Header: Depth=1 14884 movzx edx, word ptr [rcx + 2*rsi] 14885 imul dx, ax 14886 mov word ptr [r8 + 2*rsi], dx 14887 movzx edx, word ptr [rcx + 2*rsi + 2] 14888 imul dx, ax 14889 mov word ptr [r8 + 2*rsi + 2], dx 14890 movzx edx, word ptr [rcx + 2*rsi + 4] 14891 imul dx, ax 14892 mov word ptr [r8 + 2*rsi + 4], dx 14893 movzx edx, word ptr [rcx + 2*rsi + 6] 14894 imul dx, ax 14895 mov word ptr [r8 + 2*rsi + 6], dx 14896 add rsi, 4 14897 cmp r10, rsi 14898 jne .LBB2_804 14899 jmp .LBB2_1069 14900 .LBB2_160: 14901 test r9d, r9d 14902 jle .LBB2_1069 14903 # %bb.161: 14904 movzx eax, word ptr [rdx] 14905 mov r10d, r9d 14906 cmp r9d, 16 14907 jb .LBB2_162 14908 # %bb.344: 14909 lea rdx, [rcx + 2*r10] 14910 cmp rdx, r8 14911 jbe .LBB2_522 14912 # %bb.345: 14913 lea rdx, [r8 + 2*r10] 14914 cmp rdx, rcx 14915 jbe .LBB2_522 14916 .LBB2_162: 14917 xor esi, esi 14918 .LBB2_809: 14919 mov r9, rsi 14920 not r9 14921 add r9, r10 14922 mov rdi, r10 14923 and rdi, 3 14924 je .LBB2_811 14925 .LBB2_810: # =>This Inner Loop Header: Depth=1 14926 movzx edx, word ptr [rcx + 2*rsi] 14927 imul dx, ax 14928 mov word ptr [r8 + 2*rsi], dx 14929 add rsi, 1 14930 add rdi, -1 14931 jne .LBB2_810 14932 .LBB2_811: 14933 cmp r9, 3 14934 jb .LBB2_1069 14935 .LBB2_812: # =>This Inner Loop Header: Depth=1 14936 movzx edx, word ptr [rcx + 2*rsi] 14937 imul dx, ax 14938 mov word ptr [r8 + 2*rsi], dx 14939 movzx edx, word ptr [rcx + 2*rsi + 2] 14940 imul dx, ax 14941 mov word ptr [r8 + 2*rsi + 2], dx 14942 movzx edx, word ptr [rcx + 2*rsi + 4] 14943 imul dx, ax 14944 mov word ptr [r8 + 2*rsi + 4], dx 14945 movzx edx, word ptr [rcx + 2*rsi + 6] 14946 imul dx, ax 14947 mov word ptr [r8 + 2*rsi + 6], dx 14948 add rsi, 4 14949 cmp r10, rsi 14950 jne .LBB2_812 14951 jmp .LBB2_1069 14952 .LBB2_163: 14953 test r9d, r9d 14954 jle .LBB2_1069 14955 # %bb.164: 14956 movzx eax, word ptr [rdx] 14957 mov r10d, r9d 14958 cmp r9d, 16 14959 jb .LBB2_165 14960 # %bb.347: 14961 lea rdx, [rcx + 2*r10] 14962 cmp rdx, r8 
14963 jbe .LBB2_525 14964 # %bb.348: 14965 lea rdx, [r8 + 2*r10] 14966 cmp rdx, rcx 14967 jbe .LBB2_525 14968 .LBB2_165: 14969 xor esi, esi 14970 .LBB2_817: 14971 mov r9, rsi 14972 not r9 14973 add r9, r10 14974 mov rdi, r10 14975 and rdi, 3 14976 je .LBB2_819 14977 .LBB2_818: # =>This Inner Loop Header: Depth=1 14978 movzx edx, word ptr [rcx + 2*rsi] 14979 imul dx, ax 14980 mov word ptr [r8 + 2*rsi], dx 14981 add rsi, 1 14982 add rdi, -1 14983 jne .LBB2_818 14984 .LBB2_819: 14985 cmp r9, 3 14986 jb .LBB2_1069 14987 .LBB2_820: # =>This Inner Loop Header: Depth=1 14988 movzx edx, word ptr [rcx + 2*rsi] 14989 imul dx, ax 14990 mov word ptr [r8 + 2*rsi], dx 14991 movzx edx, word ptr [rcx + 2*rsi + 2] 14992 imul dx, ax 14993 mov word ptr [r8 + 2*rsi + 2], dx 14994 movzx edx, word ptr [rcx + 2*rsi + 4] 14995 imul dx, ax 14996 mov word ptr [r8 + 2*rsi + 4], dx 14997 movzx edx, word ptr [rcx + 2*rsi + 6] 14998 imul dx, ax 14999 mov word ptr [r8 + 2*rsi + 6], dx 15000 add rsi, 4 15001 cmp r10, rsi 15002 jne .LBB2_820 15003 jmp .LBB2_1069 15004 .LBB2_166: 15005 test r9d, r9d 15006 jle .LBB2_1069 15007 # %bb.167: 15008 movzx eax, word ptr [rdx] 15009 mov r10d, r9d 15010 cmp r9d, 16 15011 jb .LBB2_168 15012 # %bb.350: 15013 lea rdx, [rcx + 2*r10] 15014 cmp rdx, r8 15015 jbe .LBB2_528 15016 # %bb.351: 15017 lea rdx, [r8 + 2*r10] 15018 cmp rdx, rcx 15019 jbe .LBB2_528 15020 .LBB2_168: 15021 xor esi, esi 15022 .LBB2_825: 15023 mov r9, rsi 15024 not r9 15025 add r9, r10 15026 mov rdi, r10 15027 and rdi, 3 15028 je .LBB2_827 15029 .LBB2_826: # =>This Inner Loop Header: Depth=1 15030 movzx edx, word ptr [rcx + 2*rsi] 15031 imul dx, ax 15032 mov word ptr [r8 + 2*rsi], dx 15033 add rsi, 1 15034 add rdi, -1 15035 jne .LBB2_826 15036 .LBB2_827: 15037 cmp r9, 3 15038 jb .LBB2_1069 15039 .LBB2_828: # =>This Inner Loop Header: Depth=1 15040 movzx edx, word ptr [rcx + 2*rsi] 15041 imul dx, ax 15042 mov word ptr [r8 + 2*rsi], dx 15043 movzx edx, word ptr [rcx + 2*rsi + 2] 15044 imul dx, ax 
15045 mov word ptr [r8 + 2*rsi + 2], dx 15046 movzx edx, word ptr [rcx + 2*rsi + 4] 15047 imul dx, ax 15048 mov word ptr [r8 + 2*rsi + 4], dx 15049 movzx edx, word ptr [rcx + 2*rsi + 6] 15050 imul dx, ax 15051 mov word ptr [r8 + 2*rsi + 6], dx 15052 add rsi, 4 15053 cmp r10, rsi 15054 jne .LBB2_828 15055 jmp .LBB2_1069 15056 .LBB2_169: 15057 test r9d, r9d 15058 jle .LBB2_1069 15059 # %bb.170: 15060 movzx eax, word ptr [rdx] 15061 mov r10d, r9d 15062 cmp r9d, 16 15063 jb .LBB2_171 15064 # %bb.353: 15065 lea rdx, [rcx + 2*r10] 15066 cmp rdx, r8 15067 jbe .LBB2_531 15068 # %bb.354: 15069 lea rdx, [r8 + 2*r10] 15070 cmp rdx, rcx 15071 jbe .LBB2_531 15072 .LBB2_171: 15073 xor esi, esi 15074 .LBB2_833: 15075 mov r9, rsi 15076 not r9 15077 add r9, r10 15078 mov rdi, r10 15079 and rdi, 3 15080 je .LBB2_835 15081 .LBB2_834: # =>This Inner Loop Header: Depth=1 15082 movzx edx, word ptr [rcx + 2*rsi] 15083 add dx, ax 15084 mov word ptr [r8 + 2*rsi], dx 15085 add rsi, 1 15086 add rdi, -1 15087 jne .LBB2_834 15088 .LBB2_835: 15089 cmp r9, 3 15090 jb .LBB2_1069 15091 .LBB2_836: # =>This Inner Loop Header: Depth=1 15092 movzx edx, word ptr [rcx + 2*rsi] 15093 add dx, ax 15094 mov word ptr [r8 + 2*rsi], dx 15095 movzx edx, word ptr [rcx + 2*rsi + 2] 15096 add dx, ax 15097 mov word ptr [r8 + 2*rsi + 2], dx 15098 movzx edx, word ptr [rcx + 2*rsi + 4] 15099 add dx, ax 15100 mov word ptr [r8 + 2*rsi + 4], dx 15101 movzx edx, word ptr [rcx + 2*rsi + 6] 15102 add dx, ax 15103 mov word ptr [r8 + 2*rsi + 6], dx 15104 add rsi, 4 15105 cmp r10, rsi 15106 jne .LBB2_836 15107 jmp .LBB2_1069 15108 .LBB2_172: 15109 test r9d, r9d 15110 jle .LBB2_1069 15111 # %bb.173: 15112 movzx eax, word ptr [rdx] 15113 mov r10d, r9d 15114 cmp r9d, 16 15115 jb .LBB2_174 15116 # %bb.356: 15117 lea rdx, [rcx + 2*r10] 15118 cmp rdx, r8 15119 jbe .LBB2_534 15120 # %bb.357: 15121 lea rdx, [r8 + 2*r10] 15122 cmp rdx, rcx 15123 jbe .LBB2_534 15124 .LBB2_174: 15125 xor esi, esi 15126 .LBB2_841: 15127 mov r9, rsi 15128 
not r9 15129 add r9, r10 15130 mov rdi, r10 15131 and rdi, 3 15132 je .LBB2_843 15133 .LBB2_842: # =>This Inner Loop Header: Depth=1 15134 movzx edx, word ptr [rcx + 2*rsi] 15135 add dx, ax 15136 mov word ptr [r8 + 2*rsi], dx 15137 add rsi, 1 15138 add rdi, -1 15139 jne .LBB2_842 15140 .LBB2_843: 15141 cmp r9, 3 15142 jb .LBB2_1069 15143 .LBB2_844: # =>This Inner Loop Header: Depth=1 15144 movzx edx, word ptr [rcx + 2*rsi] 15145 add dx, ax 15146 mov word ptr [r8 + 2*rsi], dx 15147 movzx edx, word ptr [rcx + 2*rsi + 2] 15148 add dx, ax 15149 mov word ptr [r8 + 2*rsi + 2], dx 15150 movzx edx, word ptr [rcx + 2*rsi + 4] 15151 add dx, ax 15152 mov word ptr [r8 + 2*rsi + 4], dx 15153 movzx edx, word ptr [rcx + 2*rsi + 6] 15154 add dx, ax 15155 mov word ptr [r8 + 2*rsi + 6], dx 15156 add rsi, 4 15157 cmp r10, rsi 15158 jne .LBB2_844 15159 jmp .LBB2_1069 15160 .LBB2_175: 15161 test r9d, r9d 15162 jle .LBB2_1069 15163 # %bb.176: 15164 movzx eax, word ptr [rdx] 15165 mov r10d, r9d 15166 cmp r9d, 16 15167 jb .LBB2_177 15168 # %bb.359: 15169 lea rdx, [rcx + 2*r10] 15170 cmp rdx, r8 15171 jbe .LBB2_537 15172 # %bb.360: 15173 lea rdx, [r8 + 2*r10] 15174 cmp rdx, rcx 15175 jbe .LBB2_537 15176 .LBB2_177: 15177 xor esi, esi 15178 .LBB2_849: 15179 mov r9, rsi 15180 not r9 15181 add r9, r10 15182 mov rdi, r10 15183 and rdi, 3 15184 je .LBB2_851 15185 .LBB2_850: # =>This Inner Loop Header: Depth=1 15186 mov edx, eax 15187 sub dx, word ptr [rcx + 2*rsi] 15188 mov word ptr [r8 + 2*rsi], dx 15189 add rsi, 1 15190 add rdi, -1 15191 jne .LBB2_850 15192 .LBB2_851: 15193 cmp r9, 3 15194 jb .LBB2_1069 15195 .LBB2_852: # =>This Inner Loop Header: Depth=1 15196 mov edx, eax 15197 sub dx, word ptr [rcx + 2*rsi] 15198 mov word ptr [r8 + 2*rsi], dx 15199 mov edx, eax 15200 sub dx, word ptr [rcx + 2*rsi + 2] 15201 mov word ptr [r8 + 2*rsi + 2], dx 15202 mov edx, eax 15203 sub dx, word ptr [rcx + 2*rsi + 4] 15204 mov word ptr [r8 + 2*rsi + 4], dx 15205 mov edx, eax 15206 sub dx, word ptr [rcx + 
2*rsi + 6] 15207 mov word ptr [r8 + 2*rsi + 6], dx 15208 add rsi, 4 15209 cmp r10, rsi 15210 jne .LBB2_852 15211 jmp .LBB2_1069 15212 .LBB2_178: 15213 test r9d, r9d 15214 jle .LBB2_1069 15215 # %bb.179: 15216 movzx eax, word ptr [rdx] 15217 mov r10d, r9d 15218 cmp r9d, 16 15219 jb .LBB2_180 15220 # %bb.362: 15221 lea rdx, [rcx + 2*r10] 15222 cmp rdx, r8 15223 jbe .LBB2_540 15224 # %bb.363: 15225 lea rdx, [r8 + 2*r10] 15226 cmp rdx, rcx 15227 jbe .LBB2_540 15228 .LBB2_180: 15229 xor esi, esi 15230 .LBB2_857: 15231 mov r9, rsi 15232 not r9 15233 add r9, r10 15234 mov rdi, r10 15235 and rdi, 3 15236 je .LBB2_859 15237 .LBB2_858: # =>This Inner Loop Header: Depth=1 15238 mov edx, eax 15239 sub dx, word ptr [rcx + 2*rsi] 15240 mov word ptr [r8 + 2*rsi], dx 15241 add rsi, 1 15242 add rdi, -1 15243 jne .LBB2_858 15244 .LBB2_859: 15245 cmp r9, 3 15246 jb .LBB2_1069 15247 .LBB2_860: # =>This Inner Loop Header: Depth=1 15248 mov edx, eax 15249 sub dx, word ptr [rcx + 2*rsi] 15250 mov word ptr [r8 + 2*rsi], dx 15251 mov edx, eax 15252 sub dx, word ptr [rcx + 2*rsi + 2] 15253 mov word ptr [r8 + 2*rsi + 2], dx 15254 mov edx, eax 15255 sub dx, word ptr [rcx + 2*rsi + 4] 15256 mov word ptr [r8 + 2*rsi + 4], dx 15257 mov edx, eax 15258 sub dx, word ptr [rcx + 2*rsi + 6] 15259 mov word ptr [r8 + 2*rsi + 6], dx 15260 add rsi, 4 15261 cmp r10, rsi 15262 jne .LBB2_860 15263 jmp .LBB2_1069 15264 .LBB2_181: 15265 test r9d, r9d 15266 jle .LBB2_1069 15267 # %bb.182: 15268 movzx eax, word ptr [rdx] 15269 mov r10d, r9d 15270 cmp r9d, 16 15271 jb .LBB2_183 15272 # %bb.365: 15273 lea rdx, [rcx + 2*r10] 15274 cmp rdx, r8 15275 jbe .LBB2_543 15276 # %bb.366: 15277 lea rdx, [r8 + 2*r10] 15278 cmp rdx, rcx 15279 jbe .LBB2_543 15280 .LBB2_183: 15281 xor esi, esi 15282 .LBB2_865: 15283 mov r9, rsi 15284 not r9 15285 add r9, r10 15286 mov rdi, r10 15287 and rdi, 3 15288 je .LBB2_867 15289 .LBB2_866: # =>This Inner Loop Header: Depth=1 15290 movzx edx, word ptr [rcx + 2*rsi] 15291 add dx, ax 15292 
mov word ptr [r8 + 2*rsi], dx 15293 add rsi, 1 15294 add rdi, -1 15295 jne .LBB2_866 15296 .LBB2_867: 15297 cmp r9, 3 15298 jb .LBB2_1069 15299 .LBB2_868: # =>This Inner Loop Header: Depth=1 15300 movzx edx, word ptr [rcx + 2*rsi] 15301 add dx, ax 15302 mov word ptr [r8 + 2*rsi], dx 15303 movzx edx, word ptr [rcx + 2*rsi + 2] 15304 add dx, ax 15305 mov word ptr [r8 + 2*rsi + 2], dx 15306 movzx edx, word ptr [rcx + 2*rsi + 4] 15307 add dx, ax 15308 mov word ptr [r8 + 2*rsi + 4], dx 15309 movzx edx, word ptr [rcx + 2*rsi + 6] 15310 add dx, ax 15311 mov word ptr [r8 + 2*rsi + 6], dx 15312 add rsi, 4 15313 cmp r10, rsi 15314 jne .LBB2_868 15315 jmp .LBB2_1069 15316 .LBB2_184: 15317 test r9d, r9d 15318 jle .LBB2_1069 15319 # %bb.185: 15320 movzx eax, word ptr [rdx] 15321 mov r10d, r9d 15322 cmp r9d, 16 15323 jb .LBB2_186 15324 # %bb.368: 15325 lea rdx, [rcx + 2*r10] 15326 cmp rdx, r8 15327 jbe .LBB2_546 15328 # %bb.369: 15329 lea rdx, [r8 + 2*r10] 15330 cmp rdx, rcx 15331 jbe .LBB2_546 15332 .LBB2_186: 15333 xor esi, esi 15334 .LBB2_873: 15335 mov r9, rsi 15336 not r9 15337 add r9, r10 15338 mov rdi, r10 15339 and rdi, 3 15340 je .LBB2_875 15341 .LBB2_874: # =>This Inner Loop Header: Depth=1 15342 movzx edx, word ptr [rcx + 2*rsi] 15343 add dx, ax 15344 mov word ptr [r8 + 2*rsi], dx 15345 add rsi, 1 15346 add rdi, -1 15347 jne .LBB2_874 15348 .LBB2_875: 15349 cmp r9, 3 15350 jb .LBB2_1069 15351 .LBB2_876: # =>This Inner Loop Header: Depth=1 15352 movzx edx, word ptr [rcx + 2*rsi] 15353 add dx, ax 15354 mov word ptr [r8 + 2*rsi], dx 15355 movzx edx, word ptr [rcx + 2*rsi + 2] 15356 add dx, ax 15357 mov word ptr [r8 + 2*rsi + 2], dx 15358 movzx edx, word ptr [rcx + 2*rsi + 4] 15359 add dx, ax 15360 mov word ptr [r8 + 2*rsi + 4], dx 15361 movzx edx, word ptr [rcx + 2*rsi + 6] 15362 add dx, ax 15363 mov word ptr [r8 + 2*rsi + 6], dx 15364 add rsi, 4 15365 cmp r10, rsi 15366 jne .LBB2_876 15367 jmp .LBB2_1069 15368 .LBB2_187: 15369 test r9d, r9d 15370 jle .LBB2_1069 15371 # 
%bb.188: 15372 movzx eax, word ptr [rdx] 15373 mov r10d, r9d 15374 cmp r9d, 16 15375 jb .LBB2_189 15376 # %bb.371: 15377 lea rdx, [rcx + 2*r10] 15378 cmp rdx, r8 15379 jbe .LBB2_549 15380 # %bb.372: 15381 lea rdx, [r8 + 2*r10] 15382 cmp rdx, rcx 15383 jbe .LBB2_549 15384 .LBB2_189: 15385 xor esi, esi 15386 .LBB2_881: 15387 mov r9, rsi 15388 not r9 15389 add r9, r10 15390 mov rdi, r10 15391 and rdi, 3 15392 je .LBB2_883 15393 .LBB2_882: # =>This Inner Loop Header: Depth=1 15394 mov edx, eax 15395 sub dx, word ptr [rcx + 2*rsi] 15396 mov word ptr [r8 + 2*rsi], dx 15397 add rsi, 1 15398 add rdi, -1 15399 jne .LBB2_882 15400 .LBB2_883: 15401 cmp r9, 3 15402 jb .LBB2_1069 15403 .LBB2_884: # =>This Inner Loop Header: Depth=1 15404 mov edx, eax 15405 sub dx, word ptr [rcx + 2*rsi] 15406 mov word ptr [r8 + 2*rsi], dx 15407 mov edx, eax 15408 sub dx, word ptr [rcx + 2*rsi + 2] 15409 mov word ptr [r8 + 2*rsi + 2], dx 15410 mov edx, eax 15411 sub dx, word ptr [rcx + 2*rsi + 4] 15412 mov word ptr [r8 + 2*rsi + 4], dx 15413 mov edx, eax 15414 sub dx, word ptr [rcx + 2*rsi + 6] 15415 mov word ptr [r8 + 2*rsi + 6], dx 15416 add rsi, 4 15417 cmp r10, rsi 15418 jne .LBB2_884 15419 jmp .LBB2_1069 15420 .LBB2_190: 15421 test r9d, r9d 15422 jle .LBB2_1069 15423 # %bb.191: 15424 movzx eax, word ptr [rdx] 15425 mov r10d, r9d 15426 cmp r9d, 16 15427 jb .LBB2_192 15428 # %bb.374: 15429 lea rdx, [rcx + 2*r10] 15430 cmp rdx, r8 15431 jbe .LBB2_552 15432 # %bb.375: 15433 lea rdx, [r8 + 2*r10] 15434 cmp rdx, rcx 15435 jbe .LBB2_552 15436 .LBB2_192: 15437 xor esi, esi 15438 .LBB2_889: 15439 mov r9, rsi 15440 not r9 15441 add r9, r10 15442 mov rdi, r10 15443 and rdi, 3 15444 je .LBB2_891 15445 .LBB2_890: # =>This Inner Loop Header: Depth=1 15446 mov edx, eax 15447 sub dx, word ptr [rcx + 2*rsi] 15448 mov word ptr [r8 + 2*rsi], dx 15449 add rsi, 1 15450 add rdi, -1 15451 jne .LBB2_890 15452 .LBB2_891: 15453 cmp r9, 3 15454 jb .LBB2_1069 15455 .LBB2_892: # =>This Inner Loop Header: Depth=1 15456 
mov edx, eax 15457 sub dx, word ptr [rcx + 2*rsi] 15458 mov word ptr [r8 + 2*rsi], dx 15459 mov edx, eax 15460 sub dx, word ptr [rcx + 2*rsi + 2] 15461 mov word ptr [r8 + 2*rsi + 2], dx 15462 mov edx, eax 15463 sub dx, word ptr [rcx + 2*rsi + 4] 15464 mov word ptr [r8 + 2*rsi + 4], dx 15465 mov edx, eax 15466 sub dx, word ptr [rcx + 2*rsi + 6] 15467 mov word ptr [r8 + 2*rsi + 6], dx 15468 add rsi, 4 15469 cmp r10, rsi 15470 jne .LBB2_892 15471 jmp .LBB2_1069 15472 .LBB2_193: 15473 test r9d, r9d 15474 jle .LBB2_1069 15475 # %bb.194: 15476 mov rax, qword ptr [rdx] 15477 mov esi, r9d 15478 lea rdi, [rsi - 1] 15479 mov r9d, esi 15480 and r9d, 3 15481 cmp rdi, 3 15482 jae .LBB2_377 15483 # %bb.195: 15484 xor edi, edi 15485 jmp .LBB2_379 15486 .LBB2_196: 15487 test r9d, r9d 15488 jle .LBB2_1069 15489 # %bb.197: 15490 movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero 15491 mov eax, r9d 15492 cmp r9d, 8 15493 jb .LBB2_198 15494 # %bb.382: 15495 lea rdx, [rcx + 4*rax] 15496 cmp rdx, r8 15497 jbe .LBB2_555 15498 # %bb.383: 15499 lea rdx, [r8 + 4*rax] 15500 cmp rdx, rcx 15501 jbe .LBB2_555 15502 .LBB2_198: 15503 xor edx, edx 15504 .LBB2_897: 15505 mov rsi, rdx 15506 not rsi 15507 add rsi, rax 15508 mov rdi, rax 15509 and rdi, 3 15510 je .LBB2_899 15511 .LBB2_898: # =>This Inner Loop Header: Depth=1 15512 movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero 15513 mulss xmm1, xmm0 15514 movss dword ptr [r8 + 4*rdx], xmm1 15515 add rdx, 1 15516 add rdi, -1 15517 jne .LBB2_898 15518 .LBB2_899: 15519 cmp rsi, 3 15520 jb .LBB2_1069 15521 .LBB2_900: # =>This Inner Loop Header: Depth=1 15522 movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero 15523 mulss xmm1, xmm0 15524 movss dword ptr [r8 + 4*rdx], xmm1 15525 movss xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero 15526 mulss xmm1, xmm0 15527 movss dword ptr [r8 + 4*rdx + 4], xmm1 15528 movss xmm1, dword ptr [rcx + 4*rdx + 8] # xmm1 = mem[0],zero,zero,zero 15529 mulss xmm1, xmm0 
15530 movss dword ptr [r8 + 4*rdx + 8], xmm1 15531 movss xmm1, dword ptr [rcx + 4*rdx + 12] # xmm1 = mem[0],zero,zero,zero 15532 mulss xmm1, xmm0 15533 movss dword ptr [r8 + 4*rdx + 12], xmm1 15534 add rdx, 4 15535 cmp rax, rdx 15536 jne .LBB2_900 15537 jmp .LBB2_1069 15538 .LBB2_199: 15539 test r9d, r9d 15540 jle .LBB2_1069 15541 # %bb.200: 15542 mov rax, qword ptr [rdx] 15543 mov esi, r9d 15544 lea rdi, [rsi - 1] 15545 mov r9d, esi 15546 and r9d, 3 15547 cmp rdi, 3 15548 jae .LBB2_385 15549 # %bb.201: 15550 xor edi, edi 15551 jmp .LBB2_387 15552 .LBB2_202: 15553 test r9d, r9d 15554 jle .LBB2_1069 15555 # %bb.203: 15556 movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero 15557 mov eax, r9d 15558 cmp r9d, 8 15559 jb .LBB2_204 15560 # %bb.390: 15561 lea rdx, [rcx + 4*rax] 15562 cmp rdx, r8 15563 jbe .LBB2_558 15564 # %bb.391: 15565 lea rdx, [r8 + 4*rax] 15566 cmp rdx, rcx 15567 jbe .LBB2_558 15568 .LBB2_204: 15569 xor edx, edx 15570 .LBB2_905: 15571 mov rsi, rdx 15572 not rsi 15573 add rsi, rax 15574 mov rdi, rax 15575 and rdi, 3 15576 je .LBB2_907 15577 .LBB2_906: # =>This Inner Loop Header: Depth=1 15578 movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero 15579 mulss xmm1, xmm0 15580 movss dword ptr [r8 + 4*rdx], xmm1 15581 add rdx, 1 15582 add rdi, -1 15583 jne .LBB2_906 15584 .LBB2_907: 15585 cmp rsi, 3 15586 jb .LBB2_1069 15587 .LBB2_908: # =>This Inner Loop Header: Depth=1 15588 movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero 15589 mulss xmm1, xmm0 15590 movss dword ptr [r8 + 4*rdx], xmm1 15591 movss xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero 15592 mulss xmm1, xmm0 15593 movss dword ptr [r8 + 4*rdx + 4], xmm1 15594 movss xmm1, dword ptr [rcx + 4*rdx + 8] # xmm1 = mem[0],zero,zero,zero 15595 mulss xmm1, xmm0 15596 movss dword ptr [r8 + 4*rdx + 8], xmm1 15597 movss xmm1, dword ptr [rcx + 4*rdx + 12] # xmm1 = mem[0],zero,zero,zero 15598 mulss xmm1, xmm0 15599 movss dword ptr [r8 + 4*rdx + 12], xmm1 
15600 add rdx, 4 15601 cmp rax, rdx 15602 jne .LBB2_908 15603 jmp .LBB2_1069 15604 .LBB2_205: 15605 test r9d, r9d 15606 jle .LBB2_1069 15607 # %bb.206: 15608 mov rax, qword ptr [rdx] 15609 mov r10d, r9d 15610 cmp r9d, 4 15611 jb .LBB2_207 15612 # %bb.393: 15613 lea rdx, [rcx + 8*r10] 15614 cmp rdx, r8 15615 jbe .LBB2_561 15616 # %bb.394: 15617 lea rdx, [r8 + 8*r10] 15618 cmp rdx, rcx 15619 jbe .LBB2_561 15620 .LBB2_207: 15621 xor esi, esi 15622 .LBB2_913: 15623 mov r9, rsi 15624 not r9 15625 add r9, r10 15626 mov rdi, r10 15627 and rdi, 3 15628 je .LBB2_915 15629 .LBB2_914: # =>This Inner Loop Header: Depth=1 15630 mov rdx, qword ptr [rcx + 8*rsi] 15631 add rdx, rax 15632 mov qword ptr [r8 + 8*rsi], rdx 15633 add rsi, 1 15634 add rdi, -1 15635 jne .LBB2_914 15636 .LBB2_915: 15637 cmp r9, 3 15638 jb .LBB2_1069 15639 .LBB2_916: # =>This Inner Loop Header: Depth=1 15640 mov rdx, qword ptr [rcx + 8*rsi] 15641 add rdx, rax 15642 mov qword ptr [r8 + 8*rsi], rdx 15643 mov rdx, qword ptr [rcx + 8*rsi + 8] 15644 add rdx, rax 15645 mov qword ptr [r8 + 8*rsi + 8], rdx 15646 mov rdx, qword ptr [rcx + 8*rsi + 16] 15647 add rdx, rax 15648 mov qword ptr [r8 + 8*rsi + 16], rdx 15649 mov rdx, qword ptr [rcx + 8*rsi + 24] 15650 add rdx, rax 15651 mov qword ptr [r8 + 8*rsi + 24], rdx 15652 add rsi, 4 15653 cmp r10, rsi 15654 jne .LBB2_916 15655 jmp .LBB2_1069 15656 .LBB2_208: 15657 test r9d, r9d 15658 jle .LBB2_1069 15659 # %bb.209: 15660 movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero 15661 mov eax, r9d 15662 cmp r9d, 8 15663 jb .LBB2_210 15664 # %bb.396: 15665 lea rdx, [rcx + 4*rax] 15666 cmp rdx, r8 15667 jbe .LBB2_564 15668 # %bb.397: 15669 lea rdx, [r8 + 4*rax] 15670 cmp rdx, rcx 15671 jbe .LBB2_564 15672 .LBB2_210: 15673 xor edx, edx 15674 .LBB2_921: 15675 mov rsi, rdx 15676 not rsi 15677 add rsi, rax 15678 mov rdi, rax 15679 and rdi, 3 15680 je .LBB2_923 15681 .LBB2_922: # =>This Inner Loop Header: Depth=1 15682 movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = 
mem[0],zero,zero,zero 15683 addss xmm1, xmm0 15684 movss dword ptr [r8 + 4*rdx], xmm1 15685 add rdx, 1 15686 add rdi, -1 15687 jne .LBB2_922 15688 .LBB2_923: 15689 cmp rsi, 3 15690 jb .LBB2_1069 15691 .LBB2_924: # =>This Inner Loop Header: Depth=1 15692 movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero 15693 addss xmm1, xmm0 15694 movss dword ptr [r8 + 4*rdx], xmm1 15695 movss xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero 15696 addss xmm1, xmm0 15697 movss dword ptr [r8 + 4*rdx + 4], xmm1 15698 movss xmm1, dword ptr [rcx + 4*rdx + 8] # xmm1 = mem[0],zero,zero,zero 15699 addss xmm1, xmm0 15700 movss dword ptr [r8 + 4*rdx + 8], xmm1 15701 movss xmm1, dword ptr [rcx + 4*rdx + 12] # xmm1 = mem[0],zero,zero,zero 15702 addss xmm1, xmm0 15703 movss dword ptr [r8 + 4*rdx + 12], xmm1 15704 add rdx, 4 15705 cmp rax, rdx 15706 jne .LBB2_924 15707 jmp .LBB2_1069 15708 .LBB2_211: 15709 test r9d, r9d 15710 jle .LBB2_1069 15711 # %bb.212: 15712 mov r11, qword ptr [rdx] 15713 mov r10d, r9d 15714 cmp r9d, 4 15715 jb .LBB2_213 15716 # %bb.399: 15717 lea rdx, [rcx + 8*r10] 15718 cmp rdx, r8 15719 jbe .LBB2_567 15720 # %bb.400: 15721 lea rdx, [r8 + 8*r10] 15722 cmp rdx, rcx 15723 jbe .LBB2_567 15724 .LBB2_213: 15725 xor esi, esi 15726 .LBB2_929: 15727 mov rdx, rsi 15728 not rdx 15729 add rdx, r10 15730 mov rdi, r10 15731 and rdi, 3 15732 je .LBB2_931 15733 .LBB2_930: # =>This Inner Loop Header: Depth=1 15734 mov rax, r11 15735 sub rax, qword ptr [rcx + 8*rsi] 15736 mov qword ptr [r8 + 8*rsi], rax 15737 add rsi, 1 15738 add rdi, -1 15739 jne .LBB2_930 15740 .LBB2_931: 15741 cmp rdx, 3 15742 jb .LBB2_1069 15743 .LBB2_932: # =>This Inner Loop Header: Depth=1 15744 mov rax, r11 15745 sub rax, qword ptr [rcx + 8*rsi] 15746 mov qword ptr [r8 + 8*rsi], rax 15747 mov rax, r11 15748 sub rax, qword ptr [rcx + 8*rsi + 8] 15749 mov qword ptr [r8 + 8*rsi + 8], rax 15750 mov rax, r11 15751 sub rax, qword ptr [rcx + 8*rsi + 16] 15752 mov qword ptr [r8 + 8*rsi + 
16], rax 15753 mov rax, r11 15754 sub rax, qword ptr [rcx + 8*rsi + 24] 15755 mov qword ptr [r8 + 8*rsi + 24], rax 15756 add rsi, 4 15757 cmp r10, rsi 15758 jne .LBB2_932 15759 jmp .LBB2_1069 15760 .LBB2_214: 15761 test r9d, r9d 15762 jle .LBB2_1069 15763 # %bb.215: 15764 movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero 15765 mov eax, r9d 15766 cmp r9d, 8 15767 jb .LBB2_216 15768 # %bb.402: 15769 lea rdx, [rcx + 4*rax] 15770 cmp rdx, r8 15771 jbe .LBB2_570 15772 # %bb.403: 15773 lea rdx, [r8 + 4*rax] 15774 cmp rdx, rcx 15775 jbe .LBB2_570 15776 .LBB2_216: 15777 xor edx, edx 15778 .LBB2_937: 15779 mov rsi, rdx 15780 not rsi 15781 add rsi, rax 15782 mov rdi, rax 15783 and rdi, 3 15784 je .LBB2_939 15785 .LBB2_938: # =>This Inner Loop Header: Depth=1 15786 movaps xmm1, xmm0 15787 subss xmm1, dword ptr [rcx + 4*rdx] 15788 movss dword ptr [r8 + 4*rdx], xmm1 15789 add rdx, 1 15790 add rdi, -1 15791 jne .LBB2_938 15792 .LBB2_939: 15793 cmp rsi, 3 15794 jb .LBB2_1069 15795 .LBB2_940: # =>This Inner Loop Header: Depth=1 15796 movaps xmm1, xmm0 15797 subss xmm1, dword ptr [rcx + 4*rdx] 15798 movss dword ptr [r8 + 4*rdx], xmm1 15799 movaps xmm1, xmm0 15800 subss xmm1, dword ptr [rcx + 4*rdx + 4] 15801 movss dword ptr [r8 + 4*rdx + 4], xmm1 15802 movaps xmm1, xmm0 15803 subss xmm1, dword ptr [rcx + 4*rdx + 8] 15804 movss dword ptr [r8 + 4*rdx + 8], xmm1 15805 movaps xmm1, xmm0 15806 subss xmm1, dword ptr [rcx + 4*rdx + 12] 15807 movss dword ptr [r8 + 4*rdx + 12], xmm1 15808 add rdx, 4 15809 cmp rax, rdx 15810 jne .LBB2_940 15811 jmp .LBB2_1069 15812 .LBB2_217: 15813 test r9d, r9d 15814 jle .LBB2_1069 15815 # %bb.218: 15816 mov rax, qword ptr [rdx] 15817 mov r10d, r9d 15818 cmp r9d, 4 15819 jb .LBB2_219 15820 # %bb.405: 15821 lea rdx, [rcx + 8*r10] 15822 cmp rdx, r8 15823 jbe .LBB2_573 15824 # %bb.406: 15825 lea rdx, [r8 + 8*r10] 15826 cmp rdx, rcx 15827 jbe .LBB2_573 15828 .LBB2_219: 15829 xor esi, esi 15830 .LBB2_945: 15831 mov r9, rsi 15832 not r9 15833 add r9, r10 
15834 mov rdi, r10 15835 and rdi, 3 15836 je .LBB2_947 15837 .LBB2_946: # =>This Inner Loop Header: Depth=1 15838 mov rdx, qword ptr [rcx + 8*rsi] 15839 add rdx, rax 15840 mov qword ptr [r8 + 8*rsi], rdx 15841 add rsi, 1 15842 add rdi, -1 15843 jne .LBB2_946 15844 .LBB2_947: 15845 cmp r9, 3 15846 jb .LBB2_1069 15847 .LBB2_948: # =>This Inner Loop Header: Depth=1 15848 mov rdx, qword ptr [rcx + 8*rsi] 15849 add rdx, rax 15850 mov qword ptr [r8 + 8*rsi], rdx 15851 mov rdx, qword ptr [rcx + 8*rsi + 8] 15852 add rdx, rax 15853 mov qword ptr [r8 + 8*rsi + 8], rdx 15854 mov rdx, qword ptr [rcx + 8*rsi + 16] 15855 add rdx, rax 15856 mov qword ptr [r8 + 8*rsi + 16], rdx 15857 mov rdx, qword ptr [rcx + 8*rsi + 24] 15858 add rdx, rax 15859 mov qword ptr [r8 + 8*rsi + 24], rdx 15860 add rsi, 4 15861 cmp r10, rsi 15862 jne .LBB2_948 15863 jmp .LBB2_1069 15864 .LBB2_220: 15865 test r9d, r9d 15866 jle .LBB2_1069 15867 # %bb.221: 15868 movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero 15869 mov eax, r9d 15870 cmp r9d, 8 15871 jb .LBB2_222 15872 # %bb.408: 15873 lea rdx, [rcx + 4*rax] 15874 cmp rdx, r8 15875 jbe .LBB2_576 15876 # %bb.409: 15877 lea rdx, [r8 + 4*rax] 15878 cmp rdx, rcx 15879 jbe .LBB2_576 15880 .LBB2_222: 15881 xor edx, edx 15882 .LBB2_953: 15883 mov rsi, rdx 15884 not rsi 15885 add rsi, rax 15886 mov rdi, rax 15887 and rdi, 3 15888 je .LBB2_955 15889 .LBB2_954: # =>This Inner Loop Header: Depth=1 15890 movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero 15891 addss xmm1, xmm0 15892 movss dword ptr [r8 + 4*rdx], xmm1 15893 add rdx, 1 15894 add rdi, -1 15895 jne .LBB2_954 15896 .LBB2_955: 15897 cmp rsi, 3 15898 jb .LBB2_1069 15899 .LBB2_956: # =>This Inner Loop Header: Depth=1 15900 movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero 15901 addss xmm1, xmm0 15902 movss dword ptr [r8 + 4*rdx], xmm1 15903 movss xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero 15904 addss xmm1, xmm0 15905 movss dword ptr [r8 + 4*rdx 
+ 4], xmm1 15906 movss xmm1, dword ptr [rcx + 4*rdx + 8] # xmm1 = mem[0],zero,zero,zero 15907 addss xmm1, xmm0 15908 movss dword ptr [r8 + 4*rdx + 8], xmm1 15909 movss xmm1, dword ptr [rcx + 4*rdx + 12] # xmm1 = mem[0],zero,zero,zero 15910 addss xmm1, xmm0 15911 movss dword ptr [r8 + 4*rdx + 12], xmm1 15912 add rdx, 4 15913 cmp rax, rdx 15914 jne .LBB2_956 15915 jmp .LBB2_1069 15916 .LBB2_223: 15917 test r9d, r9d 15918 jle .LBB2_1069 15919 # %bb.224: 15920 mov r11, qword ptr [rdx] 15921 mov r10d, r9d 15922 cmp r9d, 4 15923 jb .LBB2_225 15924 # %bb.411: 15925 lea rdx, [rcx + 8*r10] 15926 cmp rdx, r8 15927 jbe .LBB2_579 15928 # %bb.412: 15929 lea rdx, [r8 + 8*r10] 15930 cmp rdx, rcx 15931 jbe .LBB2_579 15932 .LBB2_225: 15933 xor esi, esi 15934 .LBB2_961: 15935 mov rdx, rsi 15936 not rdx 15937 add rdx, r10 15938 mov rdi, r10 15939 and rdi, 3 15940 je .LBB2_963 15941 .LBB2_962: # =>This Inner Loop Header: Depth=1 15942 mov rax, r11 15943 sub rax, qword ptr [rcx + 8*rsi] 15944 mov qword ptr [r8 + 8*rsi], rax 15945 add rsi, 1 15946 add rdi, -1 15947 jne .LBB2_962 15948 .LBB2_963: 15949 cmp rdx, 3 15950 jb .LBB2_1069 15951 .LBB2_964: # =>This Inner Loop Header: Depth=1 15952 mov rax, r11 15953 sub rax, qword ptr [rcx + 8*rsi] 15954 mov qword ptr [r8 + 8*rsi], rax 15955 mov rax, r11 15956 sub rax, qword ptr [rcx + 8*rsi + 8] 15957 mov qword ptr [r8 + 8*rsi + 8], rax 15958 mov rax, r11 15959 sub rax, qword ptr [rcx + 8*rsi + 16] 15960 mov qword ptr [r8 + 8*rsi + 16], rax 15961 mov rax, r11 15962 sub rax, qword ptr [rcx + 8*rsi + 24] 15963 mov qword ptr [r8 + 8*rsi + 24], rax 15964 add rsi, 4 15965 cmp r10, rsi 15966 jne .LBB2_964 15967 jmp .LBB2_1069 15968 .LBB2_226: 15969 test r9d, r9d 15970 jle .LBB2_1069 15971 # %bb.227: 15972 movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero 15973 mov eax, r9d 15974 cmp r9d, 8 15975 jb .LBB2_228 15976 # %bb.414: 15977 lea rdx, [rcx + 4*rax] 15978 cmp rdx, r8 15979 jbe .LBB2_582 15980 # %bb.415: 15981 lea rdx, [r8 + 4*rax] 15982 
cmp rdx, rcx 15983 jbe .LBB2_582 15984 .LBB2_228: 15985 xor edx, edx 15986 .LBB2_969: 15987 mov rsi, rdx 15988 not rsi 15989 add rsi, rax 15990 mov rdi, rax 15991 and rdi, 3 15992 je .LBB2_971 15993 .LBB2_970: # =>This Inner Loop Header: Depth=1 15994 movaps xmm1, xmm0 15995 subss xmm1, dword ptr [rcx + 4*rdx] 15996 movss dword ptr [r8 + 4*rdx], xmm1 15997 add rdx, 1 15998 add rdi, -1 15999 jne .LBB2_970 16000 .LBB2_971: 16001 cmp rsi, 3 16002 jb .LBB2_1069 16003 .LBB2_972: # =>This Inner Loop Header: Depth=1 16004 movaps xmm1, xmm0 16005 subss xmm1, dword ptr [rcx + 4*rdx] 16006 movss dword ptr [r8 + 4*rdx], xmm1 16007 movaps xmm1, xmm0 16008 subss xmm1, dword ptr [rcx + 4*rdx + 4] 16009 movss dword ptr [r8 + 4*rdx + 4], xmm1 16010 movaps xmm1, xmm0 16011 subss xmm1, dword ptr [rcx + 4*rdx + 8] 16012 movss dword ptr [r8 + 4*rdx + 8], xmm1 16013 movaps xmm1, xmm0 16014 subss xmm1, dword ptr [rcx + 4*rdx + 12] 16015 movss dword ptr [r8 + 4*rdx + 12], xmm1 16016 add rdx, 4 16017 cmp rax, rdx 16018 jne .LBB2_972 16019 jmp .LBB2_1069 16020 .LBB2_229: 16021 test r9d, r9d 16022 jle .LBB2_1069 16023 # %bb.230: 16024 mov dl, byte ptr [rdx] 16025 mov r10d, r9d 16026 cmp r9d, 32 16027 jb .LBB2_231 16028 # %bb.417: 16029 lea rax, [rcx + r10] 16030 cmp rax, r8 16031 jbe .LBB2_585 16032 # %bb.418: 16033 lea rax, [r8 + r10] 16034 cmp rax, rcx 16035 jbe .LBB2_585 16036 .LBB2_231: 16037 xor edi, edi 16038 .LBB2_977: 16039 mov r9, rdi 16040 not r9 16041 add r9, r10 16042 mov rsi, r10 16043 and rsi, 3 16044 je .LBB2_979 16045 .LBB2_978: # =>This Inner Loop Header: Depth=1 16046 movzx eax, byte ptr [rcx + rdi] 16047 mul dl 16048 mov byte ptr [r8 + rdi], al 16049 add rdi, 1 16050 add rsi, -1 16051 jne .LBB2_978 16052 .LBB2_979: 16053 cmp r9, 3 16054 jb .LBB2_1069 16055 .LBB2_980: # =>This Inner Loop Header: Depth=1 16056 movzx eax, byte ptr [rcx + rdi] 16057 mul dl 16058 mov byte ptr [r8 + rdi], al 16059 movzx eax, byte ptr [rcx + rdi + 1] 16060 mul dl 16061 mov byte ptr [r8 + rdi + 
1], al 16062 movzx eax, byte ptr [rcx + rdi + 2] 16063 mul dl 16064 mov byte ptr [r8 + rdi + 2], al 16065 movzx eax, byte ptr [rcx + rdi + 3] 16066 mul dl 16067 mov byte ptr [r8 + rdi + 3], al 16068 add rdi, 4 16069 cmp r10, rdi 16070 jne .LBB2_980 16071 jmp .LBB2_1069 16072 .LBB2_232: 16073 test r9d, r9d 16074 jle .LBB2_1069 16075 # %bb.233: 16076 mov dl, byte ptr [rdx] 16077 mov r10d, r9d 16078 cmp r9d, 32 16079 jb .LBB2_234 16080 # %bb.420: 16081 lea rax, [rcx + r10] 16082 cmp rax, r8 16083 jbe .LBB2_588 16084 # %bb.421: 16085 lea rax, [r8 + r10] 16086 cmp rax, rcx 16087 jbe .LBB2_588 16088 .LBB2_234: 16089 xor edi, edi 16090 .LBB2_985: 16091 mov r9, rdi 16092 not r9 16093 add r9, r10 16094 mov rsi, r10 16095 and rsi, 3 16096 je .LBB2_987 16097 .LBB2_986: # =>This Inner Loop Header: Depth=1 16098 movzx eax, byte ptr [rcx + rdi] 16099 mul dl 16100 mov byte ptr [r8 + rdi], al 16101 add rdi, 1 16102 add rsi, -1 16103 jne .LBB2_986 16104 .LBB2_987: 16105 cmp r9, 3 16106 jb .LBB2_1069 16107 .LBB2_988: # =>This Inner Loop Header: Depth=1 16108 movzx eax, byte ptr [rcx + rdi] 16109 mul dl 16110 mov byte ptr [r8 + rdi], al 16111 movzx eax, byte ptr [rcx + rdi + 1] 16112 mul dl 16113 mov byte ptr [r8 + rdi + 1], al 16114 movzx eax, byte ptr [rcx + rdi + 2] 16115 mul dl 16116 mov byte ptr [r8 + rdi + 2], al 16117 movzx eax, byte ptr [rcx + rdi + 3] 16118 mul dl 16119 mov byte ptr [r8 + rdi + 3], al 16120 add rdi, 4 16121 cmp r10, rdi 16122 jne .LBB2_988 16123 jmp .LBB2_1069 16124 .LBB2_235: 16125 test r9d, r9d 16126 jle .LBB2_1069 16127 # %bb.236: 16128 mov al, byte ptr [rdx] 16129 mov r10d, r9d 16130 cmp r9d, 32 16131 jb .LBB2_237 16132 # %bb.423: 16133 lea rdx, [rcx + r10] 16134 cmp rdx, r8 16135 jbe .LBB2_591 16136 # %bb.424: 16137 lea rdx, [r8 + r10] 16138 cmp rdx, rcx 16139 jbe .LBB2_591 16140 .LBB2_237: 16141 xor esi, esi 16142 .LBB2_993: 16143 mov r9, rsi 16144 not r9 16145 add r9, r10 16146 mov rdi, r10 16147 and rdi, 3 16148 je .LBB2_995 16149 .LBB2_994: # =>This 
Inner Loop Header: Depth=1 16150 movzx edx, byte ptr [rcx + rsi] 16151 add dl, al 16152 mov byte ptr [r8 + rsi], dl 16153 add rsi, 1 16154 add rdi, -1 16155 jne .LBB2_994 16156 .LBB2_995: 16157 cmp r9, 3 16158 jb .LBB2_1069 16159 .LBB2_996: # =>This Inner Loop Header: Depth=1 16160 movzx edx, byte ptr [rcx + rsi] 16161 add dl, al 16162 mov byte ptr [r8 + rsi], dl 16163 movzx edx, byte ptr [rcx + rsi + 1] 16164 add dl, al 16165 mov byte ptr [r8 + rsi + 1], dl 16166 movzx edx, byte ptr [rcx + rsi + 2] 16167 add dl, al 16168 mov byte ptr [r8 + rsi + 2], dl 16169 movzx edx, byte ptr [rcx + rsi + 3] 16170 add dl, al 16171 mov byte ptr [r8 + rsi + 3], dl 16172 add rsi, 4 16173 cmp r10, rsi 16174 jne .LBB2_996 16175 jmp .LBB2_1069 16176 .LBB2_238: 16177 test r9d, r9d 16178 jle .LBB2_1069 16179 # %bb.239: 16180 mov r11b, byte ptr [rdx] 16181 mov r10d, r9d 16182 cmp r9d, 32 16183 jb .LBB2_240 16184 # %bb.426: 16185 lea rdx, [rcx + r10] 16186 cmp rdx, r8 16187 jbe .LBB2_594 16188 # %bb.427: 16189 lea rdx, [r8 + r10] 16190 cmp rdx, rcx 16191 jbe .LBB2_594 16192 .LBB2_240: 16193 xor esi, esi 16194 .LBB2_1001: 16195 mov rdx, rsi 16196 not rdx 16197 add rdx, r10 16198 mov rdi, r10 16199 and rdi, 3 16200 je .LBB2_1003 16201 .LBB2_1002: # =>This Inner Loop Header: Depth=1 16202 mov eax, r11d 16203 sub al, byte ptr [rcx + rsi] 16204 mov byte ptr [r8 + rsi], al 16205 add rsi, 1 16206 add rdi, -1 16207 jne .LBB2_1002 16208 .LBB2_1003: 16209 cmp rdx, 3 16210 jb .LBB2_1069 16211 .LBB2_1004: # =>This Inner Loop Header: Depth=1 16212 mov eax, r11d 16213 sub al, byte ptr [rcx + rsi] 16214 mov byte ptr [r8 + rsi], al 16215 mov eax, r11d 16216 sub al, byte ptr [rcx + rsi + 1] 16217 mov byte ptr [r8 + rsi + 1], al 16218 mov eax, r11d 16219 sub al, byte ptr [rcx + rsi + 2] 16220 mov byte ptr [r8 + rsi + 2], al 16221 mov eax, r11d 16222 sub al, byte ptr [rcx + rsi + 3] 16223 mov byte ptr [r8 + rsi + 3], al 16224 add rsi, 4 16225 cmp r10, rsi 16226 jne .LBB2_1004 16227 jmp .LBB2_1069 16228 
.LBB2_241: 16229 test r9d, r9d 16230 jle .LBB2_1069 16231 # %bb.242: 16232 mov al, byte ptr [rdx] 16233 mov r10d, r9d 16234 cmp r9d, 32 16235 jb .LBB2_243 16236 # %bb.429: 16237 lea rdx, [rcx + r10] 16238 cmp rdx, r8 16239 jbe .LBB2_597 16240 # %bb.430: 16241 lea rdx, [r8 + r10] 16242 cmp rdx, rcx 16243 jbe .LBB2_597 16244 .LBB2_243: 16245 xor esi, esi 16246 .LBB2_1009: 16247 mov r9, rsi 16248 not r9 16249 add r9, r10 16250 mov rdi, r10 16251 and rdi, 3 16252 je .LBB2_1011 16253 .LBB2_1010: # =>This Inner Loop Header: Depth=1 16254 movzx edx, byte ptr [rcx + rsi] 16255 add dl, al 16256 mov byte ptr [r8 + rsi], dl 16257 add rsi, 1 16258 add rdi, -1 16259 jne .LBB2_1010 16260 .LBB2_1011: 16261 cmp r9, 3 16262 jb .LBB2_1069 16263 .LBB2_1012: # =>This Inner Loop Header: Depth=1 16264 movzx edx, byte ptr [rcx + rsi] 16265 add dl, al 16266 mov byte ptr [r8 + rsi], dl 16267 movzx edx, byte ptr [rcx + rsi + 1] 16268 add dl, al 16269 mov byte ptr [r8 + rsi + 1], dl 16270 movzx edx, byte ptr [rcx + rsi + 2] 16271 add dl, al 16272 mov byte ptr [r8 + rsi + 2], dl 16273 movzx edx, byte ptr [rcx + rsi + 3] 16274 add dl, al 16275 mov byte ptr [r8 + rsi + 3], dl 16276 add rsi, 4 16277 cmp r10, rsi 16278 jne .LBB2_1012 16279 jmp .LBB2_1069 16280 .LBB2_244: 16281 test r9d, r9d 16282 jle .LBB2_1069 16283 # %bb.245: 16284 mov r11b, byte ptr [rdx] 16285 mov r10d, r9d 16286 cmp r9d, 32 16287 jb .LBB2_246 16288 # %bb.432: 16289 lea rdx, [rcx + r10] 16290 cmp rdx, r8 16291 jbe .LBB2_600 16292 # %bb.433: 16293 lea rdx, [r8 + r10] 16294 cmp rdx, rcx 16295 jbe .LBB2_600 16296 .LBB2_246: 16297 xor esi, esi 16298 .LBB2_1017: 16299 mov rdx, rsi 16300 not rdx 16301 add rdx, r10 16302 mov rdi, r10 16303 and rdi, 3 16304 je .LBB2_1019 16305 .LBB2_1018: # =>This Inner Loop Header: Depth=1 16306 mov eax, r11d 16307 sub al, byte ptr [rcx + rsi] 16308 mov byte ptr [r8 + rsi], al 16309 add rsi, 1 16310 add rdi, -1 16311 jne .LBB2_1018 16312 .LBB2_1019: 16313 cmp rdx, 3 16314 jb .LBB2_1069 16315 
.LBB2_1020: # =>This Inner Loop Header: Depth=1 16316 mov eax, r11d 16317 sub al, byte ptr [rcx + rsi] 16318 mov byte ptr [r8 + rsi], al 16319 mov eax, r11d 16320 sub al, byte ptr [rcx + rsi + 1] 16321 mov byte ptr [r8 + rsi + 1], al 16322 mov eax, r11d 16323 sub al, byte ptr [rcx + rsi + 2] 16324 mov byte ptr [r8 + rsi + 2], al 16325 mov eax, r11d 16326 sub al, byte ptr [rcx + rsi + 3] 16327 mov byte ptr [r8 + rsi + 3], al 16328 add rsi, 4 16329 cmp r10, rsi 16330 jne .LBB2_1020 16331 jmp .LBB2_1069 16332 .LBB2_247: 16333 test r9d, r9d 16334 jle .LBB2_1069 16335 # %bb.248: 16336 mov eax, dword ptr [rdx] 16337 mov r10d, r9d 16338 cmp r9d, 8 16339 jb .LBB2_249 16340 # %bb.435: 16341 lea rdx, [rcx + 4*r10] 16342 cmp rdx, r8 16343 jbe .LBB2_603 16344 # %bb.436: 16345 lea rdx, [r8 + 4*r10] 16346 cmp rdx, rcx 16347 jbe .LBB2_603 16348 .LBB2_249: 16349 xor esi, esi 16350 .LBB2_1025: 16351 mov r9, rsi 16352 not r9 16353 add r9, r10 16354 mov rdi, r10 16355 and rdi, 3 16356 je .LBB2_1027 16357 .LBB2_1026: # =>This Inner Loop Header: Depth=1 16358 mov edx, dword ptr [rcx + 4*rsi] 16359 imul edx, eax 16360 mov dword ptr [r8 + 4*rsi], edx 16361 add rsi, 1 16362 add rdi, -1 16363 jne .LBB2_1026 16364 .LBB2_1027: 16365 cmp r9, 3 16366 jb .LBB2_1069 16367 .LBB2_1028: # =>This Inner Loop Header: Depth=1 16368 mov edx, dword ptr [rcx + 4*rsi] 16369 imul edx, eax 16370 mov dword ptr [r8 + 4*rsi], edx 16371 mov edx, dword ptr [rcx + 4*rsi + 4] 16372 imul edx, eax 16373 mov dword ptr [r8 + 4*rsi + 4], edx 16374 mov edx, dword ptr [rcx + 4*rsi + 8] 16375 imul edx, eax 16376 mov dword ptr [r8 + 4*rsi + 8], edx 16377 mov edx, dword ptr [rcx + 4*rsi + 12] 16378 imul edx, eax 16379 mov dword ptr [r8 + 4*rsi + 12], edx 16380 add rsi, 4 16381 cmp r10, rsi 16382 jne .LBB2_1028 16383 jmp .LBB2_1069 16384 .LBB2_250: 16385 test r9d, r9d 16386 jle .LBB2_1069 16387 # %bb.251: 16388 mov eax, dword ptr [rdx] 16389 mov r10d, r9d 16390 cmp r9d, 8 16391 jb .LBB2_252 16392 # %bb.438: 16393 lea rdx, 
[rcx + 4*r10] 16394 cmp rdx, r8 16395 jbe .LBB2_606 16396 # %bb.439: 16397 lea rdx, [r8 + 4*r10] 16398 cmp rdx, rcx 16399 jbe .LBB2_606 16400 .LBB2_252: 16401 xor esi, esi 16402 .LBB2_1033: 16403 mov r9, rsi 16404 not r9 16405 add r9, r10 16406 mov rdi, r10 16407 and rdi, 3 16408 je .LBB2_1035 16409 .LBB2_1034: # =>This Inner Loop Header: Depth=1 16410 mov edx, dword ptr [rcx + 4*rsi] 16411 imul edx, eax 16412 mov dword ptr [r8 + 4*rsi], edx 16413 add rsi, 1 16414 add rdi, -1 16415 jne .LBB2_1034 16416 .LBB2_1035: 16417 cmp r9, 3 16418 jb .LBB2_1069 16419 .LBB2_1036: # =>This Inner Loop Header: Depth=1 16420 mov edx, dword ptr [rcx + 4*rsi] 16421 imul edx, eax 16422 mov dword ptr [r8 + 4*rsi], edx 16423 mov edx, dword ptr [rcx + 4*rsi + 4] 16424 imul edx, eax 16425 mov dword ptr [r8 + 4*rsi + 4], edx 16426 mov edx, dword ptr [rcx + 4*rsi + 8] 16427 imul edx, eax 16428 mov dword ptr [r8 + 4*rsi + 8], edx 16429 mov edx, dword ptr [rcx + 4*rsi + 12] 16430 imul edx, eax 16431 mov dword ptr [r8 + 4*rsi + 12], edx 16432 add rsi, 4 16433 cmp r10, rsi 16434 jne .LBB2_1036 16435 jmp .LBB2_1069 16436 .LBB2_253: 16437 test r9d, r9d 16438 jle .LBB2_1069 16439 # %bb.254: 16440 mov eax, dword ptr [rdx] 16441 mov r10d, r9d 16442 cmp r9d, 8 16443 jb .LBB2_255 16444 # %bb.441: 16445 lea rdx, [rcx + 4*r10] 16446 cmp rdx, r8 16447 jbe .LBB2_609 16448 # %bb.442: 16449 lea rdx, [r8 + 4*r10] 16450 cmp rdx, rcx 16451 jbe .LBB2_609 16452 .LBB2_255: 16453 xor esi, esi 16454 .LBB2_1041: 16455 mov r9, rsi 16456 not r9 16457 add r9, r10 16458 mov rdi, r10 16459 and rdi, 3 16460 je .LBB2_1043 16461 .LBB2_1042: # =>This Inner Loop Header: Depth=1 16462 mov edx, dword ptr [rcx + 4*rsi] 16463 add edx, eax 16464 mov dword ptr [r8 + 4*rsi], edx 16465 add rsi, 1 16466 add rdi, -1 16467 jne .LBB2_1042 16468 .LBB2_1043: 16469 cmp r9, 3 16470 jb .LBB2_1069 16471 .LBB2_1044: # =>This Inner Loop Header: Depth=1 16472 mov edx, dword ptr [rcx + 4*rsi] 16473 add edx, eax 16474 mov dword ptr [r8 + 4*rsi], 
edx 16475 mov edx, dword ptr [rcx + 4*rsi + 4] 16476 add edx, eax 16477 mov dword ptr [r8 + 4*rsi + 4], edx 16478 mov edx, dword ptr [rcx + 4*rsi + 8] 16479 add edx, eax 16480 mov dword ptr [r8 + 4*rsi + 8], edx 16481 mov edx, dword ptr [rcx + 4*rsi + 12] 16482 add edx, eax 16483 mov dword ptr [r8 + 4*rsi + 12], edx 16484 add rsi, 4 16485 cmp r10, rsi 16486 jne .LBB2_1044 16487 jmp .LBB2_1069 16488 .LBB2_256: 16489 test r9d, r9d 16490 jle .LBB2_1069 16491 # %bb.257: 16492 mov r11d, dword ptr [rdx] 16493 mov r10d, r9d 16494 cmp r9d, 8 16495 jb .LBB2_258 16496 # %bb.444: 16497 lea rdx, [rcx + 4*r10] 16498 cmp rdx, r8 16499 jbe .LBB2_612 16500 # %bb.445: 16501 lea rdx, [r8 + 4*r10] 16502 cmp rdx, rcx 16503 jbe .LBB2_612 16504 .LBB2_258: 16505 xor esi, esi 16506 .LBB2_1049: 16507 mov rdx, rsi 16508 not rdx 16509 add rdx, r10 16510 mov rdi, r10 16511 and rdi, 3 16512 je .LBB2_1051 16513 .LBB2_1050: # =>This Inner Loop Header: Depth=1 16514 mov eax, r11d 16515 sub eax, dword ptr [rcx + 4*rsi] 16516 mov dword ptr [r8 + 4*rsi], eax 16517 add rsi, 1 16518 add rdi, -1 16519 jne .LBB2_1050 16520 .LBB2_1051: 16521 cmp rdx, 3 16522 jb .LBB2_1069 16523 .LBB2_1052: # =>This Inner Loop Header: Depth=1 16524 mov eax, r11d 16525 sub eax, dword ptr [rcx + 4*rsi] 16526 mov dword ptr [r8 + 4*rsi], eax 16527 mov eax, r11d 16528 sub eax, dword ptr [rcx + 4*rsi + 4] 16529 mov dword ptr [r8 + 4*rsi + 4], eax 16530 mov eax, r11d 16531 sub eax, dword ptr [rcx + 4*rsi + 8] 16532 mov dword ptr [r8 + 4*rsi + 8], eax 16533 mov eax, r11d 16534 sub eax, dword ptr [rcx + 4*rsi + 12] 16535 mov dword ptr [r8 + 4*rsi + 12], eax 16536 add rsi, 4 16537 cmp r10, rsi 16538 jne .LBB2_1052 16539 jmp .LBB2_1069 16540 .LBB2_259: 16541 test r9d, r9d 16542 jle .LBB2_1069 16543 # %bb.260: 16544 mov eax, dword ptr [rdx] 16545 mov r10d, r9d 16546 cmp r9d, 8 16547 jb .LBB2_261 16548 # %bb.447: 16549 lea rdx, [rcx + 4*r10] 16550 cmp rdx, r8 16551 jbe .LBB2_615 16552 # %bb.448: 16553 lea rdx, [r8 + 4*r10] 16554 cmp 
rdx, rcx 16555 jbe .LBB2_615 16556 .LBB2_261: 16557 xor esi, esi 16558 .LBB2_1057: 16559 mov r9, rsi 16560 not r9 16561 add r9, r10 16562 mov rdi, r10 16563 and rdi, 3 16564 je .LBB2_1059 16565 .LBB2_1058: # =>This Inner Loop Header: Depth=1 16566 mov edx, dword ptr [rcx + 4*rsi] 16567 add edx, eax 16568 mov dword ptr [r8 + 4*rsi], edx 16569 add rsi, 1 16570 add rdi, -1 16571 jne .LBB2_1058 16572 .LBB2_1059: 16573 cmp r9, 3 16574 jb .LBB2_1069 16575 .LBB2_1060: # =>This Inner Loop Header: Depth=1 16576 mov edx, dword ptr [rcx + 4*rsi] 16577 add edx, eax 16578 mov dword ptr [r8 + 4*rsi], edx 16579 mov edx, dword ptr [rcx + 4*rsi + 4] 16580 add edx, eax 16581 mov dword ptr [r8 + 4*rsi + 4], edx 16582 mov edx, dword ptr [rcx + 4*rsi + 8] 16583 add edx, eax 16584 mov dword ptr [r8 + 4*rsi + 8], edx 16585 mov edx, dword ptr [rcx + 4*rsi + 12] 16586 add edx, eax 16587 mov dword ptr [r8 + 4*rsi + 12], edx 16588 add rsi, 4 16589 cmp r10, rsi 16590 jne .LBB2_1060 16591 jmp .LBB2_1069 16592 .LBB2_262: 16593 test r9d, r9d 16594 jle .LBB2_1069 16595 # %bb.263: 16596 mov r11d, dword ptr [rdx] 16597 mov r10d, r9d 16598 cmp r9d, 8 16599 jb .LBB2_264 16600 # %bb.450: 16601 lea rdx, [rcx + 4*r10] 16602 cmp rdx, r8 16603 jbe .LBB2_618 16604 # %bb.451: 16605 lea rdx, [r8 + 4*r10] 16606 cmp rdx, rcx 16607 jbe .LBB2_618 16608 .LBB2_264: 16609 xor esi, esi 16610 .LBB2_1065: 16611 mov rdx, rsi 16612 not rdx 16613 add rdx, r10 16614 mov rdi, r10 16615 and rdi, 3 16616 je .LBB2_1067 16617 .LBB2_1066: # =>This Inner Loop Header: Depth=1 16618 mov eax, r11d 16619 sub eax, dword ptr [rcx + 4*rsi] 16620 mov dword ptr [r8 + 4*rsi], eax 16621 add rsi, 1 16622 add rdi, -1 16623 jne .LBB2_1066 16624 .LBB2_1067: 16625 cmp rdx, 3 16626 jb .LBB2_1069 16627 .LBB2_1068: # =>This Inner Loop Header: Depth=1 16628 mov eax, r11d 16629 sub eax, dword ptr [rcx + 4*rsi] 16630 mov dword ptr [r8 + 4*rsi], eax 16631 mov eax, r11d 16632 sub eax, dword ptr [rcx + 4*rsi + 4] 16633 mov dword ptr [r8 + 4*rsi + 4], 
eax 16634 mov eax, r11d 16635 sub eax, dword ptr [rcx + 4*rsi + 8] 16636 mov dword ptr [r8 + 4*rsi + 8], eax 16637 mov eax, r11d 16638 sub eax, dword ptr [rcx + 4*rsi + 12] 16639 mov dword ptr [r8 + 4*rsi + 12], eax 16640 add rsi, 4 16641 cmp r10, rsi 16642 jne .LBB2_1068 16643 jmp .LBB2_1069 16644 .LBB2_319: 16645 and esi, -4 16646 xor edi, edi 16647 .LBB2_320: # =>This Inner Loop Header: Depth=1 16648 mov rdx, qword ptr [rcx + 8*rdi] 16649 imul rdx, rax 16650 mov qword ptr [r8 + 8*rdi], rdx 16651 mov rdx, qword ptr [rcx + 8*rdi + 8] 16652 imul rdx, rax 16653 mov qword ptr [r8 + 8*rdi + 8], rdx 16654 mov rdx, qword ptr [rcx + 8*rdi + 16] 16655 imul rdx, rax 16656 mov qword ptr [r8 + 8*rdi + 16], rdx 16657 mov rdx, qword ptr [rcx + 8*rdi + 24] 16658 imul rdx, rax 16659 mov qword ptr [r8 + 8*rdi + 24], rdx 16660 add rdi, 4 16661 cmp rsi, rdi 16662 jne .LBB2_320 16663 .LBB2_321: 16664 test r9, r9 16665 je .LBB2_1069 16666 # %bb.322: 16667 lea rsi, [r8 + 8*rdi] 16668 lea rcx, [rcx + 8*rdi] 16669 xor edi, edi 16670 .LBB2_323: # =>This Inner Loop Header: Depth=1 16671 mov rdx, qword ptr [rcx + 8*rdi] 16672 imul rdx, rax 16673 mov qword ptr [rsi + 8*rdi], rdx 16674 add rdi, 1 16675 cmp r9, rdi 16676 jne .LBB2_323 16677 jmp .LBB2_1069 16678 .LBB2_324: 16679 and esi, -4 16680 xor edi, edi 16681 .LBB2_325: # =>This Inner Loop Header: Depth=1 16682 mov rdx, qword ptr [rcx + 8*rdi] 16683 imul rdx, rax 16684 mov qword ptr [r8 + 8*rdi], rdx 16685 mov rdx, qword ptr [rcx + 8*rdi + 8] 16686 imul rdx, rax 16687 mov qword ptr [r8 + 8*rdi + 8], rdx 16688 mov rdx, qword ptr [rcx + 8*rdi + 16] 16689 imul rdx, rax 16690 mov qword ptr [r8 + 8*rdi + 16], rdx 16691 mov rdx, qword ptr [rcx + 8*rdi + 24] 16692 imul rdx, rax 16693 mov qword ptr [r8 + 8*rdi + 24], rdx 16694 add rdi, 4 16695 cmp rsi, rdi 16696 jne .LBB2_325 16697 .LBB2_326: 16698 test r9, r9 16699 je .LBB2_1069 16700 # %bb.327: 16701 lea rsi, [r8 + 8*rdi] 16702 lea rcx, [rcx + 8*rdi] 16703 xor edi, edi 16704 .LBB2_328: # 
=>This Inner Loop Header: Depth=1 16705 mov rdx, qword ptr [rcx + 8*rdi] 16706 imul rdx, rax 16707 mov qword ptr [rsi + 8*rdi], rdx 16708 add rdi, 1 16709 cmp r9, rdi 16710 jne .LBB2_328 16711 jmp .LBB2_1069 16712 .LBB2_377: 16713 and esi, -4 16714 xor edi, edi 16715 .LBB2_378: # =>This Inner Loop Header: Depth=1 16716 mov rdx, qword ptr [rcx + 8*rdi] 16717 imul rdx, rax 16718 mov qword ptr [r8 + 8*rdi], rdx 16719 mov rdx, qword ptr [rcx + 8*rdi + 8] 16720 imul rdx, rax 16721 mov qword ptr [r8 + 8*rdi + 8], rdx 16722 mov rdx, qword ptr [rcx + 8*rdi + 16] 16723 imul rdx, rax 16724 mov qword ptr [r8 + 8*rdi + 16], rdx 16725 mov rdx, qword ptr [rcx + 8*rdi + 24] 16726 imul rdx, rax 16727 mov qword ptr [r8 + 8*rdi + 24], rdx 16728 add rdi, 4 16729 cmp rsi, rdi 16730 jne .LBB2_378 16731 .LBB2_379: 16732 test r9, r9 16733 je .LBB2_1069 16734 # %bb.380: 16735 lea rsi, [r8 + 8*rdi] 16736 lea rcx, [rcx + 8*rdi] 16737 xor edi, edi 16738 .LBB2_381: # =>This Inner Loop Header: Depth=1 16739 mov rdx, qword ptr [rcx + 8*rdi] 16740 imul rdx, rax 16741 mov qword ptr [rsi + 8*rdi], rdx 16742 add rdi, 1 16743 cmp r9, rdi 16744 jne .LBB2_381 16745 jmp .LBB2_1069 16746 .LBB2_385: 16747 and esi, -4 16748 xor edi, edi 16749 .LBB2_386: # =>This Inner Loop Header: Depth=1 16750 mov rdx, qword ptr [rcx + 8*rdi] 16751 imul rdx, rax 16752 mov qword ptr [r8 + 8*rdi], rdx 16753 mov rdx, qword ptr [rcx + 8*rdi + 8] 16754 imul rdx, rax 16755 mov qword ptr [r8 + 8*rdi + 8], rdx 16756 mov rdx, qword ptr [rcx + 8*rdi + 16] 16757 imul rdx, rax 16758 mov qword ptr [r8 + 8*rdi + 16], rdx 16759 mov rdx, qword ptr [rcx + 8*rdi + 24] 16760 imul rdx, rax 16761 mov qword ptr [r8 + 8*rdi + 24], rdx 16762 add rdi, 4 16763 cmp rsi, rdi 16764 jne .LBB2_386 16765 .LBB2_387: 16766 test r9, r9 16767 je .LBB2_1069 16768 # %bb.388: 16769 lea rsi, [r8 + 8*rdi] 16770 lea rcx, [rcx + 8*rdi] 16771 xor edi, edi 16772 .LBB2_389: # =>This Inner Loop Header: Depth=1 16773 mov rdx, qword ptr [rcx + 8*rdi] 16774 imul rdx, 
rax 16775 mov qword ptr [rsi + 8*rdi], rdx 16776 add rdi, 1 16777 cmp r9, rdi 16778 jne .LBB2_389 16779 .LBB2_1069: 16780 mov rsp, rbp 16781 pop rbp 16782 ret 16783 .LBB2_453: 16784 mov esi, r10d 16785 and esi, -8 16786 movd xmm0, eax 16787 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 16788 lea rdx, [rsi - 8] 16789 mov r9, rdx 16790 shr r9, 3 16791 add r9, 1 16792 test rdx, rdx 16793 je .LBB2_621 16794 # %bb.454: 16795 mov rdx, r9 16796 and rdx, -2 16797 neg rdx 16798 xor edi, edi 16799 .LBB2_455: # =>This Inner Loop Header: Depth=1 16800 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 16801 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 16802 pmulld xmm1, xmm0 16803 pmulld xmm2, xmm0 16804 movdqu xmmword ptr [r8 + 4*rdi], xmm1 16805 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 16806 movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32] 16807 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 16808 pmulld xmm1, xmm0 16809 pmulld xmm2, xmm0 16810 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 16811 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 16812 add rdi, 16 16813 add rdx, 2 16814 jne .LBB2_455 16815 jmp .LBB2_622 16816 .LBB2_456: 16817 mov esi, r10d 16818 and esi, -8 16819 movd xmm0, eax 16820 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 16821 lea rdx, [rsi - 8] 16822 mov r9, rdx 16823 shr r9, 3 16824 add r9, 1 16825 test rdx, rdx 16826 je .LBB2_629 16827 # %bb.457: 16828 mov rdx, r9 16829 and rdx, -2 16830 neg rdx 16831 xor edi, edi 16832 .LBB2_458: # =>This Inner Loop Header: Depth=1 16833 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 16834 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 16835 pmulld xmm1, xmm0 16836 pmulld xmm2, xmm0 16837 movdqu xmmword ptr [r8 + 4*rdi], xmm1 16838 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 16839 movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32] 16840 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 16841 pmulld xmm1, xmm0 16842 pmulld xmm2, xmm0 16843 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 16844 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 16845 add rdi, 16 16846 add rdx, 2 16847 
jne .LBB2_458 16848 jmp .LBB2_630 16849 .LBB2_459: 16850 mov esi, r10d 16851 and esi, -8 16852 movd xmm0, eax 16853 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 16854 lea rdx, [rsi - 8] 16855 mov r9, rdx 16856 shr r9, 3 16857 add r9, 1 16858 test rdx, rdx 16859 je .LBB2_637 16860 # %bb.460: 16861 mov rdx, r9 16862 and rdx, -2 16863 neg rdx 16864 xor edi, edi 16865 .LBB2_461: # =>This Inner Loop Header: Depth=1 16866 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 16867 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 16868 paddd xmm1, xmm0 16869 paddd xmm2, xmm0 16870 movdqu xmmword ptr [r8 + 4*rdi], xmm1 16871 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 16872 movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32] 16873 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 16874 paddd xmm1, xmm0 16875 paddd xmm2, xmm0 16876 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 16877 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 16878 add rdi, 16 16879 add rdx, 2 16880 jne .LBB2_461 16881 jmp .LBB2_638 16882 .LBB2_462: 16883 mov esi, r10d 16884 and esi, -8 16885 movd xmm0, r11d 16886 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 16887 lea rdx, [rsi - 8] 16888 mov r9, rdx 16889 shr r9, 3 16890 add r9, 1 16891 test rdx, rdx 16892 je .LBB2_645 16893 # %bb.463: 16894 mov rdx, r9 16895 and rdx, -2 16896 neg rdx 16897 xor edi, edi 16898 .LBB2_464: # =>This Inner Loop Header: Depth=1 16899 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 16900 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 16901 movdqa xmm3, xmm0 16902 psubd xmm3, xmm1 16903 movdqa xmm1, xmm0 16904 psubd xmm1, xmm2 16905 movdqu xmmword ptr [r8 + 4*rdi], xmm3 16906 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1 16907 movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32] 16908 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 16909 movdqa xmm3, xmm0 16910 psubd xmm3, xmm1 16911 movdqa xmm1, xmm0 16912 psubd xmm1, xmm2 16913 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm3 16914 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1 16915 add rdi, 16 16916 add rdx, 2 16917 jne .LBB2_464 16918 jmp .LBB2_646 
16919 .LBB2_465: 16920 mov esi, r10d 16921 and esi, -8 16922 movd xmm0, eax 16923 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 16924 lea rdx, [rsi - 8] 16925 mov r9, rdx 16926 shr r9, 3 16927 add r9, 1 16928 test rdx, rdx 16929 je .LBB2_653 16930 # %bb.466: 16931 mov rdx, r9 16932 and rdx, -2 16933 neg rdx 16934 xor edi, edi 16935 .LBB2_467: # =>This Inner Loop Header: Depth=1 16936 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 16937 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 16938 paddd xmm1, xmm0 16939 paddd xmm2, xmm0 16940 movdqu xmmword ptr [r8 + 4*rdi], xmm1 16941 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 16942 movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32] 16943 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 16944 paddd xmm1, xmm0 16945 paddd xmm2, xmm0 16946 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 16947 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 16948 add rdi, 16 16949 add rdx, 2 16950 jne .LBB2_467 16951 jmp .LBB2_654 16952 .LBB2_468: 16953 mov esi, r10d 16954 and esi, -8 16955 movd xmm0, r11d 16956 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 16957 lea rdx, [rsi - 8] 16958 mov r9, rdx 16959 shr r9, 3 16960 add r9, 1 16961 test rdx, rdx 16962 je .LBB2_661 16963 # %bb.469: 16964 mov rdx, r9 16965 and rdx, -2 16966 neg rdx 16967 xor edi, edi 16968 .LBB2_470: # =>This Inner Loop Header: Depth=1 16969 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 16970 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 16971 movdqa xmm3, xmm0 16972 psubd xmm3, xmm1 16973 movdqa xmm1, xmm0 16974 psubd xmm1, xmm2 16975 movdqu xmmword ptr [r8 + 4*rdi], xmm3 16976 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1 16977 movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32] 16978 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 16979 movdqa xmm3, xmm0 16980 psubd xmm3, xmm1 16981 movdqa xmm1, xmm0 16982 psubd xmm1, xmm2 16983 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm3 16984 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1 16985 add rdi, 16 16986 add rdx, 2 16987 jne .LBB2_470 16988 jmp .LBB2_662 16989 .LBB2_471: 16990 mov edx, eax 
16991 and edx, -4 16992 movddup xmm1, xmm0 # xmm1 = xmm0[0,0] 16993 lea rsi, [rdx - 4] 16994 mov r9, rsi 16995 shr r9, 2 16996 add r9, 1 16997 test rsi, rsi 16998 je .LBB2_669 16999 # %bb.472: 17000 mov rsi, r9 17001 and rsi, -2 17002 neg rsi 17003 xor edi, edi 17004 .LBB2_473: # =>This Inner Loop Header: Depth=1 17005 movupd xmm2, xmmword ptr [rcx + 8*rdi] 17006 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 17007 mulpd xmm2, xmm1 17008 mulpd xmm3, xmm1 17009 movupd xmmword ptr [r8 + 8*rdi], xmm2 17010 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 17011 movupd xmm2, xmmword ptr [rcx + 8*rdi + 32] 17012 movupd xmm3, xmmword ptr [rcx + 8*rdi + 48] 17013 mulpd xmm2, xmm1 17014 mulpd xmm3, xmm1 17015 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 17016 movupd xmmword ptr [r8 + 8*rdi + 48], xmm3 17017 add rdi, 8 17018 add rsi, 2 17019 jne .LBB2_473 17020 jmp .LBB2_670 17021 .LBB2_474: 17022 mov edx, eax 17023 and edx, -4 17024 movddup xmm1, xmm0 # xmm1 = xmm0[0,0] 17025 lea rsi, [rdx - 4] 17026 mov r9, rsi 17027 shr r9, 2 17028 add r9, 1 17029 test rsi, rsi 17030 je .LBB2_677 17031 # %bb.475: 17032 mov rsi, r9 17033 and rsi, -2 17034 neg rsi 17035 xor edi, edi 17036 .LBB2_476: # =>This Inner Loop Header: Depth=1 17037 movupd xmm2, xmmword ptr [rcx + 8*rdi] 17038 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 17039 mulpd xmm2, xmm1 17040 mulpd xmm3, xmm1 17041 movupd xmmword ptr [r8 + 8*rdi], xmm2 17042 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 17043 movupd xmm2, xmmword ptr [rcx + 8*rdi + 32] 17044 movupd xmm3, xmmword ptr [rcx + 8*rdi + 48] 17045 mulpd xmm2, xmm1 17046 mulpd xmm3, xmm1 17047 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 17048 movupd xmmword ptr [r8 + 8*rdi + 48], xmm3 17049 add rdi, 8 17050 add rsi, 2 17051 jne .LBB2_476 17052 jmp .LBB2_678 17053 .LBB2_477: 17054 mov edx, eax 17055 and edx, -4 17056 movddup xmm1, xmm0 # xmm1 = xmm0[0,0] 17057 lea rsi, [rdx - 4] 17058 mov r9, rsi 17059 shr r9, 2 17060 add r9, 1 17061 test rsi, rsi 17062 je .LBB2_685 17063 # %bb.478: 
17064 mov rsi, r9 17065 and rsi, -2 17066 neg rsi 17067 xor edi, edi 17068 .LBB2_479: # =>This Inner Loop Header: Depth=1 17069 movupd xmm2, xmmword ptr [rcx + 8*rdi] 17070 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 17071 addpd xmm2, xmm1 17072 addpd xmm3, xmm1 17073 movupd xmmword ptr [r8 + 8*rdi], xmm2 17074 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 17075 movupd xmm2, xmmword ptr [rcx + 8*rdi + 32] 17076 movupd xmm3, xmmword ptr [rcx + 8*rdi + 48] 17077 addpd xmm2, xmm1 17078 addpd xmm3, xmm1 17079 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 17080 movupd xmmword ptr [r8 + 8*rdi + 48], xmm3 17081 add rdi, 8 17082 add rsi, 2 17083 jne .LBB2_479 17084 jmp .LBB2_686 17085 .LBB2_480: 17086 mov edx, eax 17087 and edx, -4 17088 movddup xmm1, xmm0 # xmm1 = xmm0[0,0] 17089 lea rsi, [rdx - 4] 17090 mov r9, rsi 17091 shr r9, 2 17092 add r9, 1 17093 test rsi, rsi 17094 je .LBB2_693 17095 # %bb.481: 17096 mov rsi, r9 17097 and rsi, -2 17098 neg rsi 17099 xor edi, edi 17100 .LBB2_482: # =>This Inner Loop Header: Depth=1 17101 movupd xmm2, xmmword ptr [rcx + 8*rdi] 17102 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 17103 movapd xmm4, xmm1 17104 subpd xmm4, xmm2 17105 movapd xmm2, xmm1 17106 subpd xmm2, xmm3 17107 movupd xmmword ptr [r8 + 8*rdi], xmm4 17108 movupd xmmword ptr [r8 + 8*rdi + 16], xmm2 17109 movupd xmm2, xmmword ptr [rcx + 8*rdi + 32] 17110 movupd xmm3, xmmword ptr [rcx + 8*rdi + 48] 17111 movapd xmm4, xmm1 17112 subpd xmm4, xmm2 17113 movapd xmm2, xmm1 17114 subpd xmm2, xmm3 17115 movupd xmmword ptr [r8 + 8*rdi + 32], xmm4 17116 movupd xmmword ptr [r8 + 8*rdi + 48], xmm2 17117 add rdi, 8 17118 add rsi, 2 17119 jne .LBB2_482 17120 jmp .LBB2_694 17121 .LBB2_483: 17122 mov edx, eax 17123 and edx, -4 17124 movddup xmm1, xmm0 # xmm1 = xmm0[0,0] 17125 lea rsi, [rdx - 4] 17126 mov r9, rsi 17127 shr r9, 2 17128 add r9, 1 17129 test rsi, rsi 17130 je .LBB2_701 17131 # %bb.484: 17132 mov rsi, r9 17133 and rsi, -2 17134 neg rsi 17135 xor edi, edi 17136 .LBB2_485: # =>This 
Inner Loop Header: Depth=1 17137 movupd xmm2, xmmword ptr [rcx + 8*rdi] 17138 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 17139 addpd xmm2, xmm1 17140 addpd xmm3, xmm1 17141 movupd xmmword ptr [r8 + 8*rdi], xmm2 17142 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 17143 movupd xmm2, xmmword ptr [rcx + 8*rdi + 32] 17144 movupd xmm3, xmmword ptr [rcx + 8*rdi + 48] 17145 addpd xmm2, xmm1 17146 addpd xmm3, xmm1 17147 movupd xmmword ptr [r8 + 8*rdi + 32], xmm2 17148 movupd xmmword ptr [r8 + 8*rdi + 48], xmm3 17149 add rdi, 8 17150 add rsi, 2 17151 jne .LBB2_485 17152 jmp .LBB2_702 17153 .LBB2_486: 17154 mov edx, eax 17155 and edx, -4 17156 movddup xmm1, xmm0 # xmm1 = xmm0[0,0] 17157 lea rsi, [rdx - 4] 17158 mov r9, rsi 17159 shr r9, 2 17160 add r9, 1 17161 test rsi, rsi 17162 je .LBB2_709 17163 # %bb.487: 17164 mov rsi, r9 17165 and rsi, -2 17166 neg rsi 17167 xor edi, edi 17168 .LBB2_488: # =>This Inner Loop Header: Depth=1 17169 movupd xmm2, xmmword ptr [rcx + 8*rdi] 17170 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 17171 movapd xmm4, xmm1 17172 subpd xmm4, xmm2 17173 movapd xmm2, xmm1 17174 subpd xmm2, xmm3 17175 movupd xmmword ptr [r8 + 8*rdi], xmm4 17176 movupd xmmword ptr [r8 + 8*rdi + 16], xmm2 17177 movupd xmm2, xmmword ptr [rcx + 8*rdi + 32] 17178 movupd xmm3, xmmword ptr [rcx + 8*rdi + 48] 17179 movapd xmm4, xmm1 17180 subpd xmm4, xmm2 17181 movapd xmm2, xmm1 17182 subpd xmm2, xmm3 17183 movupd xmmword ptr [r8 + 8*rdi + 32], xmm4 17184 movupd xmmword ptr [r8 + 8*rdi + 48], xmm2 17185 add rdi, 8 17186 add rsi, 2 17187 jne .LBB2_488 17188 jmp .LBB2_710 17189 .LBB2_489: 17190 mov edi, r10d 17191 and edi, -32 17192 movzx eax, dl 17193 movd xmm0, eax 17194 pxor xmm1, xmm1 17195 pshufb xmm0, xmm1 17196 lea rax, [rdi - 32] 17197 mov r9, rax 17198 shr r9, 5 17199 add r9, 1 17200 pmovzxbw xmm1, xmm0 # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 17201 test rax, rax 17202 je .LBB2_717 17203 # %bb.490: 17204 
mov rsi, r9 17205 and rsi, -2 17206 neg rsi 17207 xor eax, eax 17208 movdqa xmm2, xmm0 17209 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 17210 movdqa xmm3, xmmword ptr [rip + .LCPI2_0] # xmm3 = [255,255,255,255,255,255,255,255] 17211 movdqa xmm4, xmm0 17212 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 17213 .LBB2_491: # =>This Inner Loop Header: Depth=1 17214 movdqu xmm5, xmmword ptr [rcx + rax] 17215 movdqu xmm6, xmmword ptr [rcx + rax + 16] 17216 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 17217 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 17218 pmullw xmm5, xmm2 17219 pand xmm5, xmm3 17220 pmullw xmm7, xmm1 17221 pand xmm7, xmm3 17222 packuswb xmm7, xmm5 17223 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 17224 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 17225 pmullw xmm6, xmm4 17226 pand xmm6, xmm3 17227 pmullw xmm5, xmm1 17228 pand xmm5, xmm3 17229 packuswb xmm5, xmm6 17230 movdqu xmmword ptr [r8 + rax], xmm7 17231 movdqu xmmword ptr [r8 + rax + 16], xmm5 17232 movdqu xmm5, xmmword ptr [rcx + rax + 32] 17233 movdqu xmm6, xmmword ptr [rcx + rax + 48] 17234 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 17235 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 17236 pmullw xmm5, xmm2 17237 pand xmm5, xmm3 17238 pmullw xmm7, xmm1 17239 pand xmm7, xmm3 17240 packuswb xmm7, xmm5 17241 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 17242 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 17243 pmullw xmm6, xmm4 17244 pand xmm6, 
xmm3 17245 pmullw xmm5, xmm1 17246 pand xmm5, xmm3 17247 packuswb xmm5, xmm6 17248 movdqu xmmword ptr [r8 + rax + 32], xmm7 17249 movdqu xmmword ptr [r8 + rax + 48], xmm5 17250 add rax, 64 17251 add rsi, 2 17252 jne .LBB2_491 17253 jmp .LBB2_718 17254 .LBB2_492: 17255 mov edi, r10d 17256 and edi, -32 17257 movzx eax, dl 17258 movd xmm0, eax 17259 pxor xmm1, xmm1 17260 pshufb xmm0, xmm1 17261 lea rax, [rdi - 32] 17262 mov r9, rax 17263 shr r9, 5 17264 add r9, 1 17265 pmovzxbw xmm1, xmm0 # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 17266 test rax, rax 17267 je .LBB2_725 17268 # %bb.493: 17269 mov rsi, r9 17270 and rsi, -2 17271 neg rsi 17272 xor eax, eax 17273 movdqa xmm2, xmm0 17274 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 17275 movdqa xmm3, xmmword ptr [rip + .LCPI2_0] # xmm3 = [255,255,255,255,255,255,255,255] 17276 movdqa xmm4, xmm0 17277 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 17278 .LBB2_494: # =>This Inner Loop Header: Depth=1 17279 movdqu xmm5, xmmword ptr [rcx + rax] 17280 movdqu xmm6, xmmword ptr [rcx + rax + 16] 17281 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 17282 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 17283 pmullw xmm5, xmm2 17284 pand xmm5, xmm3 17285 pmullw xmm7, xmm1 17286 pand xmm7, xmm3 17287 packuswb xmm7, xmm5 17288 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 17289 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 17290 pmullw xmm6, xmm4 17291 pand xmm6, xmm3 17292 pmullw xmm5, xmm1 17293 pand xmm5, xmm3 17294 packuswb xmm5, xmm6 17295 movdqu xmmword ptr [r8 + rax], xmm7 17296 movdqu xmmword ptr [r8 + rax + 16], xmm5 17297 movdqu xmm5, xmmword ptr [rcx + 
rax + 32] 17298 movdqu xmm6, xmmword ptr [rcx + rax + 48] 17299 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 17300 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 17301 pmullw xmm5, xmm2 17302 pand xmm5, xmm3 17303 pmullw xmm7, xmm1 17304 pand xmm7, xmm3 17305 packuswb xmm7, xmm5 17306 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 17307 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 17308 pmullw xmm6, xmm4 17309 pand xmm6, xmm3 17310 pmullw xmm5, xmm1 17311 pand xmm5, xmm3 17312 packuswb xmm5, xmm6 17313 movdqu xmmword ptr [r8 + rax + 32], xmm7 17314 movdqu xmmword ptr [r8 + rax + 48], xmm5 17315 add rax, 64 17316 add rsi, 2 17317 jne .LBB2_494 17318 jmp .LBB2_726 17319 .LBB2_495: 17320 mov esi, r10d 17321 and esi, -32 17322 movzx edx, al 17323 movd xmm0, edx 17324 pxor xmm1, xmm1 17325 pshufb xmm0, xmm1 17326 lea rdx, [rsi - 32] 17327 mov r9, rdx 17328 shr r9, 5 17329 add r9, 1 17330 test rdx, rdx 17331 je .LBB2_733 17332 # %bb.496: 17333 mov rdx, r9 17334 and rdx, -2 17335 neg rdx 17336 xor edi, edi 17337 .LBB2_497: # =>This Inner Loop Header: Depth=1 17338 movdqu xmm1, xmmword ptr [rcx + rdi] 17339 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 17340 paddb xmm1, xmm0 17341 paddb xmm2, xmm0 17342 movdqu xmmword ptr [r8 + rdi], xmm1 17343 movdqu xmmword ptr [r8 + rdi + 16], xmm2 17344 movdqu xmm1, xmmword ptr [rcx + rdi + 32] 17345 movdqu xmm2, xmmword ptr [rcx + rdi + 48] 17346 paddb xmm1, xmm0 17347 paddb xmm2, xmm0 17348 movdqu xmmword ptr [r8 + rdi + 32], xmm1 17349 movdqu xmmword ptr [r8 + rdi + 48], xmm2 17350 add rdi, 64 17351 add rdx, 2 17352 jne .LBB2_497 17353 jmp .LBB2_734 17354 .LBB2_498: 17355 mov esi, r10d 17356 and esi, -32 17357 movzx edx, r11b 17358 movd xmm0, edx 17359 pxor xmm1, xmm1 17360 pshufb xmm0, xmm1 17361 
lea rdx, [rsi - 32] 17362 mov r9, rdx 17363 shr r9, 5 17364 add r9, 1 17365 test rdx, rdx 17366 je .LBB2_741 17367 # %bb.499: 17368 mov rdx, r9 17369 and rdx, -2 17370 neg rdx 17371 xor edi, edi 17372 .LBB2_500: # =>This Inner Loop Header: Depth=1 17373 movdqu xmm1, xmmword ptr [rcx + rdi] 17374 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 17375 movdqa xmm3, xmm0 17376 psubb xmm3, xmm1 17377 movdqa xmm1, xmm0 17378 psubb xmm1, xmm2 17379 movdqu xmmword ptr [r8 + rdi], xmm3 17380 movdqu xmmword ptr [r8 + rdi + 16], xmm1 17381 movdqu xmm1, xmmword ptr [rcx + rdi + 32] 17382 movdqu xmm2, xmmword ptr [rcx + rdi + 48] 17383 movdqa xmm3, xmm0 17384 psubb xmm3, xmm1 17385 movdqa xmm1, xmm0 17386 psubb xmm1, xmm2 17387 movdqu xmmword ptr [r8 + rdi + 32], xmm3 17388 movdqu xmmword ptr [r8 + rdi + 48], xmm1 17389 add rdi, 64 17390 add rdx, 2 17391 jne .LBB2_500 17392 jmp .LBB2_742 17393 .LBB2_501: 17394 mov esi, r10d 17395 and esi, -32 17396 movzx edx, al 17397 movd xmm0, edx 17398 pxor xmm1, xmm1 17399 pshufb xmm0, xmm1 17400 lea rdx, [rsi - 32] 17401 mov r9, rdx 17402 shr r9, 5 17403 add r9, 1 17404 test rdx, rdx 17405 je .LBB2_749 17406 # %bb.502: 17407 mov rdx, r9 17408 and rdx, -2 17409 neg rdx 17410 xor edi, edi 17411 .LBB2_503: # =>This Inner Loop Header: Depth=1 17412 movdqu xmm1, xmmword ptr [rcx + rdi] 17413 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 17414 paddb xmm1, xmm0 17415 paddb xmm2, xmm0 17416 movdqu xmmword ptr [r8 + rdi], xmm1 17417 movdqu xmmword ptr [r8 + rdi + 16], xmm2 17418 movdqu xmm1, xmmword ptr [rcx + rdi + 32] 17419 movdqu xmm2, xmmword ptr [rcx + rdi + 48] 17420 paddb xmm1, xmm0 17421 paddb xmm2, xmm0 17422 movdqu xmmword ptr [r8 + rdi + 32], xmm1 17423 movdqu xmmword ptr [r8 + rdi + 48], xmm2 17424 add rdi, 64 17425 add rdx, 2 17426 jne .LBB2_503 17427 jmp .LBB2_750 17428 .LBB2_504: 17429 mov esi, r10d 17430 and esi, -32 17431 movzx edx, r11b 17432 movd xmm0, edx 17433 pxor xmm1, xmm1 17434 pshufb xmm0, xmm1 17435 lea rdx, [rsi - 32] 17436 mov r9, 
rdx 17437 shr r9, 5 17438 add r9, 1 17439 test rdx, rdx 17440 je .LBB2_757 17441 # %bb.505: 17442 mov rdx, r9 17443 and rdx, -2 17444 neg rdx 17445 xor edi, edi 17446 .LBB2_506: # =>This Inner Loop Header: Depth=1 17447 movdqu xmm1, xmmword ptr [rcx + rdi] 17448 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 17449 movdqa xmm3, xmm0 17450 psubb xmm3, xmm1 17451 movdqa xmm1, xmm0 17452 psubb xmm1, xmm2 17453 movdqu xmmword ptr [r8 + rdi], xmm3 17454 movdqu xmmword ptr [r8 + rdi + 16], xmm1 17455 movdqu xmm1, xmmword ptr [rcx + rdi + 32] 17456 movdqu xmm2, xmmword ptr [rcx + rdi + 48] 17457 movdqa xmm3, xmm0 17458 psubb xmm3, xmm1 17459 movdqa xmm1, xmm0 17460 psubb xmm1, xmm2 17461 movdqu xmmword ptr [r8 + rdi + 32], xmm3 17462 movdqu xmmword ptr [r8 + rdi + 48], xmm1 17463 add rdi, 64 17464 add rdx, 2 17465 jne .LBB2_506 17466 jmp .LBB2_758 17467 .LBB2_507: 17468 mov esi, r10d 17469 and esi, -4 17470 movq xmm0, rax 17471 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 17472 lea rdx, [rsi - 4] 17473 mov r9, rdx 17474 shr r9, 2 17475 add r9, 1 17476 test rdx, rdx 17477 je .LBB2_765 17478 # %bb.508: 17479 mov rdx, r9 17480 and rdx, -2 17481 neg rdx 17482 xor edi, edi 17483 .LBB2_509: # =>This Inner Loop Header: Depth=1 17484 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 17485 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 17486 paddq xmm1, xmm0 17487 paddq xmm2, xmm0 17488 movdqu xmmword ptr [r8 + 8*rdi], xmm1 17489 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 17490 movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32] 17491 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48] 17492 paddq xmm1, xmm0 17493 paddq xmm2, xmm0 17494 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1 17495 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2 17496 add rdi, 8 17497 add rdx, 2 17498 jne .LBB2_509 17499 jmp .LBB2_766 17500 .LBB2_510: 17501 mov esi, r10d 17502 and esi, -4 17503 movq xmm0, r11 17504 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 17505 lea rdx, [rsi - 4] 17506 mov r9, rdx 17507 shr r9, 2 17508 add r9, 1 17509 test rdx, 
rdx 17510 je .LBB2_773 17511 # %bb.511: 17512 mov rdx, r9 17513 and rdx, -2 17514 neg rdx 17515 xor edi, edi 17516 .LBB2_512: # =>This Inner Loop Header: Depth=1 17517 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 17518 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 17519 movdqa xmm3, xmm0 17520 psubq xmm3, xmm1 17521 movdqa xmm1, xmm0 17522 psubq xmm1, xmm2 17523 movdqu xmmword ptr [r8 + 8*rdi], xmm3 17524 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1 17525 movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32] 17526 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48] 17527 movdqa xmm3, xmm0 17528 psubq xmm3, xmm1 17529 movdqa xmm1, xmm0 17530 psubq xmm1, xmm2 17531 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm3 17532 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1 17533 add rdi, 8 17534 add rdx, 2 17535 jne .LBB2_512 17536 jmp .LBB2_774 17537 .LBB2_513: 17538 mov esi, r10d 17539 and esi, -4 17540 movq xmm0, rax 17541 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 17542 lea rdx, [rsi - 4] 17543 mov r9, rdx 17544 shr r9, 2 17545 add r9, 1 17546 test rdx, rdx 17547 je .LBB2_781 17548 # %bb.514: 17549 mov rdx, r9 17550 and rdx, -2 17551 neg rdx 17552 xor edi, edi 17553 .LBB2_515: # =>This Inner Loop Header: Depth=1 17554 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 17555 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 17556 paddq xmm1, xmm0 17557 paddq xmm2, xmm0 17558 movdqu xmmword ptr [r8 + 8*rdi], xmm1 17559 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 17560 movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32] 17561 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48] 17562 paddq xmm1, xmm0 17563 paddq xmm2, xmm0 17564 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1 17565 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2 17566 add rdi, 8 17567 add rdx, 2 17568 jne .LBB2_515 17569 jmp .LBB2_782 17570 .LBB2_516: 17571 mov esi, r10d 17572 and esi, -4 17573 movq xmm0, r11 17574 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 17575 lea rdx, [rsi - 4] 17576 mov r9, rdx 17577 shr r9, 2 17578 add r9, 1 17579 test rdx, rdx 17580 je .LBB2_789 17581 # %bb.517: 
17582 mov rdx, r9 17583 and rdx, -2 17584 neg rdx 17585 xor edi, edi 17586 .LBB2_518: # =>This Inner Loop Header: Depth=1 17587 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 17588 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 17589 movdqa xmm3, xmm0 17590 psubq xmm3, xmm1 17591 movdqa xmm1, xmm0 17592 psubq xmm1, xmm2 17593 movdqu xmmword ptr [r8 + 8*rdi], xmm3 17594 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1 17595 movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32] 17596 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48] 17597 movdqa xmm3, xmm0 17598 psubq xmm3, xmm1 17599 movdqa xmm1, xmm0 17600 psubq xmm1, xmm2 17601 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm3 17602 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1 17603 add rdi, 8 17604 add rdx, 2 17605 jne .LBB2_518 17606 jmp .LBB2_790 17607 .LBB2_519: 17608 mov esi, r10d 17609 and esi, -16 17610 movd xmm0, eax 17611 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 17612 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 17613 lea rdx, [rsi - 16] 17614 mov r9, rdx 17615 shr r9, 4 17616 add r9, 1 17617 test rdx, rdx 17618 je .LBB2_797 17619 # %bb.520: 17620 mov rdx, r9 17621 and rdx, -2 17622 neg rdx 17623 xor edi, edi 17624 .LBB2_521: # =>This Inner Loop Header: Depth=1 17625 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 17626 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 17627 pmullw xmm1, xmm0 17628 pmullw xmm2, xmm0 17629 movdqu xmmword ptr [r8 + 2*rdi], xmm1 17630 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 17631 movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32] 17632 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 17633 pmullw xmm1, xmm0 17634 pmullw xmm2, xmm0 17635 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 17636 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 17637 add rdi, 32 17638 add rdx, 2 17639 jne .LBB2_521 17640 jmp .LBB2_798 17641 .LBB2_522: 17642 mov esi, r10d 17643 and esi, -16 17644 movd xmm0, eax 17645 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 17646 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 17647 lea rdx, [rsi - 16] 17648 mov r9, 
rdx 17649 shr r9, 4 17650 add r9, 1 17651 test rdx, rdx 17652 je .LBB2_805 17653 # %bb.523: 17654 mov rdx, r9 17655 and rdx, -2 17656 neg rdx 17657 xor edi, edi 17658 .LBB2_524: # =>This Inner Loop Header: Depth=1 17659 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 17660 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 17661 pmullw xmm1, xmm0 17662 pmullw xmm2, xmm0 17663 movdqu xmmword ptr [r8 + 2*rdi], xmm1 17664 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 17665 movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32] 17666 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 17667 pmullw xmm1, xmm0 17668 pmullw xmm2, xmm0 17669 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 17670 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 17671 add rdi, 32 17672 add rdx, 2 17673 jne .LBB2_524 17674 jmp .LBB2_806 17675 .LBB2_525: 17676 mov esi, r10d 17677 and esi, -16 17678 movd xmm0, eax 17679 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 17680 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 17681 lea rdx, [rsi - 16] 17682 mov r9, rdx 17683 shr r9, 4 17684 add r9, 1 17685 test rdx, rdx 17686 je .LBB2_813 17687 # %bb.526: 17688 mov rdx, r9 17689 and rdx, -2 17690 neg rdx 17691 xor edi, edi 17692 .LBB2_527: # =>This Inner Loop Header: Depth=1 17693 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 17694 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 17695 pmullw xmm1, xmm0 17696 pmullw xmm2, xmm0 17697 movdqu xmmword ptr [r8 + 2*rdi], xmm1 17698 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 17699 movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32] 17700 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 17701 pmullw xmm1, xmm0 17702 pmullw xmm2, xmm0 17703 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 17704 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 17705 add rdi, 32 17706 add rdx, 2 17707 jne .LBB2_527 17708 jmp .LBB2_814 17709 .LBB2_528: 17710 mov esi, r10d 17711 and esi, -16 17712 movd xmm0, eax 17713 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 17714 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 17715 lea rdx, [rsi - 16] 17716 mov r9, 
rdx 17717 shr r9, 4 17718 add r9, 1 17719 test rdx, rdx 17720 je .LBB2_821 17721 # %bb.529: 17722 mov rdx, r9 17723 and rdx, -2 17724 neg rdx 17725 xor edi, edi 17726 .LBB2_530: # =>This Inner Loop Header: Depth=1 17727 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 17728 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 17729 pmullw xmm1, xmm0 17730 pmullw xmm2, xmm0 17731 movdqu xmmword ptr [r8 + 2*rdi], xmm1 17732 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 17733 movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32] 17734 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 17735 pmullw xmm1, xmm0 17736 pmullw xmm2, xmm0 17737 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 17738 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 17739 add rdi, 32 17740 add rdx, 2 17741 jne .LBB2_530 17742 jmp .LBB2_822 17743 .LBB2_531: 17744 mov esi, r10d 17745 and esi, -16 17746 movd xmm0, eax 17747 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 17748 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 17749 lea rdx, [rsi - 16] 17750 mov r9, rdx 17751 shr r9, 4 17752 add r9, 1 17753 test rdx, rdx 17754 je .LBB2_829 17755 # %bb.532: 17756 mov rdx, r9 17757 and rdx, -2 17758 neg rdx 17759 xor edi, edi 17760 .LBB2_533: # =>This Inner Loop Header: Depth=1 17761 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 17762 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 17763 paddw xmm1, xmm0 17764 paddw xmm2, xmm0 17765 movdqu xmmword ptr [r8 + 2*rdi], xmm1 17766 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 17767 movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32] 17768 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 17769 paddw xmm1, xmm0 17770 paddw xmm2, xmm0 17771 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 17772 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 17773 add rdi, 32 17774 add rdx, 2 17775 jne .LBB2_533 17776 jmp .LBB2_830 17777 .LBB2_534: 17778 mov esi, r10d 17779 and esi, -16 17780 movd xmm0, eax 17781 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 17782 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 17783 lea rdx, [rsi - 16] 17784 mov r9, rdx 
17785 shr r9, 4 17786 add r9, 1 17787 test rdx, rdx 17788 je .LBB2_837 17789 # %bb.535: 17790 mov rdx, r9 17791 and rdx, -2 17792 neg rdx 17793 xor edi, edi 17794 .LBB2_536: # =>This Inner Loop Header: Depth=1 17795 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 17796 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 17797 paddw xmm1, xmm0 17798 paddw xmm2, xmm0 17799 movdqu xmmword ptr [r8 + 2*rdi], xmm1 17800 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 17801 movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32] 17802 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 17803 paddw xmm1, xmm0 17804 paddw xmm2, xmm0 17805 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 17806 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 17807 add rdi, 32 17808 add rdx, 2 17809 jne .LBB2_536 17810 jmp .LBB2_838 17811 .LBB2_537: 17812 mov esi, r10d 17813 and esi, -16 17814 movd xmm0, eax 17815 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 17816 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 17817 lea rdx, [rsi - 16] 17818 mov r9, rdx 17819 shr r9, 4 17820 add r9, 1 17821 test rdx, rdx 17822 je .LBB2_845 17823 # %bb.538: 17824 mov rdx, r9 17825 and rdx, -2 17826 neg rdx 17827 xor edi, edi 17828 .LBB2_539: # =>This Inner Loop Header: Depth=1 17829 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 17830 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 17831 movdqa xmm3, xmm0 17832 psubw xmm3, xmm1 17833 movdqa xmm1, xmm0 17834 psubw xmm1, xmm2 17835 movdqu xmmword ptr [r8 + 2*rdi], xmm3 17836 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1 17837 movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32] 17838 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 17839 movdqa xmm3, xmm0 17840 psubw xmm3, xmm1 17841 movdqa xmm1, xmm0 17842 psubw xmm1, xmm2 17843 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm3 17844 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1 17845 add rdi, 32 17846 add rdx, 2 17847 jne .LBB2_539 17848 jmp .LBB2_846 17849 .LBB2_540: 17850 mov esi, r10d 17851 and esi, -16 17852 movd xmm0, eax 17853 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 17854 
pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 17855 lea rdx, [rsi - 16] 17856 mov r9, rdx 17857 shr r9, 4 17858 add r9, 1 17859 test rdx, rdx 17860 je .LBB2_853 17861 # %bb.541: 17862 mov rdx, r9 17863 and rdx, -2 17864 neg rdx 17865 xor edi, edi 17866 .LBB2_542: # =>This Inner Loop Header: Depth=1 17867 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 17868 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 17869 movdqa xmm3, xmm0 17870 psubw xmm3, xmm1 17871 movdqa xmm1, xmm0 17872 psubw xmm1, xmm2 17873 movdqu xmmword ptr [r8 + 2*rdi], xmm3 17874 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1 17875 movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32] 17876 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 17877 movdqa xmm3, xmm0 17878 psubw xmm3, xmm1 17879 movdqa xmm1, xmm0 17880 psubw xmm1, xmm2 17881 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm3 17882 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1 17883 add rdi, 32 17884 add rdx, 2 17885 jne .LBB2_542 17886 jmp .LBB2_854 17887 .LBB2_543: 17888 mov esi, r10d 17889 and esi, -16 17890 movd xmm0, eax 17891 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 17892 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 17893 lea rdx, [rsi - 16] 17894 mov r9, rdx 17895 shr r9, 4 17896 add r9, 1 17897 test rdx, rdx 17898 je .LBB2_861 17899 # %bb.544: 17900 mov rdx, r9 17901 and rdx, -2 17902 neg rdx 17903 xor edi, edi 17904 .LBB2_545: # =>This Inner Loop Header: Depth=1 17905 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 17906 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 17907 paddw xmm1, xmm0 17908 paddw xmm2, xmm0 17909 movdqu xmmword ptr [r8 + 2*rdi], xmm1 17910 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 17911 movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32] 17912 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 17913 paddw xmm1, xmm0 17914 paddw xmm2, xmm0 17915 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 17916 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 17917 add rdi, 32 17918 add rdx, 2 17919 jne .LBB2_545 17920 jmp .LBB2_862 17921 .LBB2_546: 17922 mov esi, r10d 17923 and esi, -16 
17924 movd xmm0, eax 17925 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 17926 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 17927 lea rdx, [rsi - 16] 17928 mov r9, rdx 17929 shr r9, 4 17930 add r9, 1 17931 test rdx, rdx 17932 je .LBB2_869 17933 # %bb.547: 17934 mov rdx, r9 17935 and rdx, -2 17936 neg rdx 17937 xor edi, edi 17938 .LBB2_548: # =>This Inner Loop Header: Depth=1 17939 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 17940 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 17941 paddw xmm1, xmm0 17942 paddw xmm2, xmm0 17943 movdqu xmmword ptr [r8 + 2*rdi], xmm1 17944 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 17945 movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32] 17946 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 17947 paddw xmm1, xmm0 17948 paddw xmm2, xmm0 17949 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1 17950 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2 17951 add rdi, 32 17952 add rdx, 2 17953 jne .LBB2_548 17954 jmp .LBB2_870 17955 .LBB2_549: 17956 mov esi, r10d 17957 and esi, -16 17958 movd xmm0, eax 17959 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 17960 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 17961 lea rdx, [rsi - 16] 17962 mov r9, rdx 17963 shr r9, 4 17964 add r9, 1 17965 test rdx, rdx 17966 je .LBB2_877 17967 # %bb.550: 17968 mov rdx, r9 17969 and rdx, -2 17970 neg rdx 17971 xor edi, edi 17972 .LBB2_551: # =>This Inner Loop Header: Depth=1 17973 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 17974 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 17975 movdqa xmm3, xmm0 17976 psubw xmm3, xmm1 17977 movdqa xmm1, xmm0 17978 psubw xmm1, xmm2 17979 movdqu xmmword ptr [r8 + 2*rdi], xmm3 17980 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1 17981 movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32] 17982 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 17983 movdqa xmm3, xmm0 17984 psubw xmm3, xmm1 17985 movdqa xmm1, xmm0 17986 psubw xmm1, xmm2 17987 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm3 17988 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1 17989 add rdi, 32 17990 add rdx, 2 17991 jne 
.LBB2_551 17992 jmp .LBB2_878 17993 .LBB2_552: 17994 mov esi, r10d 17995 and esi, -16 17996 movd xmm0, eax 17997 pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7] 17998 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 17999 lea rdx, [rsi - 16] 18000 mov r9, rdx 18001 shr r9, 4 18002 add r9, 1 18003 test rdx, rdx 18004 je .LBB2_885 18005 # %bb.553: 18006 mov rdx, r9 18007 and rdx, -2 18008 neg rdx 18009 xor edi, edi 18010 .LBB2_554: # =>This Inner Loop Header: Depth=1 18011 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 18012 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 18013 movdqa xmm3, xmm0 18014 psubw xmm3, xmm1 18015 movdqa xmm1, xmm0 18016 psubw xmm1, xmm2 18017 movdqu xmmword ptr [r8 + 2*rdi], xmm3 18018 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1 18019 movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32] 18020 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48] 18021 movdqa xmm3, xmm0 18022 psubw xmm3, xmm1 18023 movdqa xmm1, xmm0 18024 psubw xmm1, xmm2 18025 movdqu xmmword ptr [r8 + 2*rdi + 32], xmm3 18026 movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1 18027 add rdi, 32 18028 add rdx, 2 18029 jne .LBB2_554 18030 jmp .LBB2_886 18031 .LBB2_555: 18032 mov edx, eax 18033 and edx, -8 18034 movaps xmm1, xmm0 18035 shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0] 18036 lea rsi, [rdx - 8] 18037 mov r9, rsi 18038 shr r9, 3 18039 add r9, 1 18040 test rsi, rsi 18041 je .LBB2_893 18042 # %bb.556: 18043 mov rsi, r9 18044 and rsi, -2 18045 neg rsi 18046 xor edi, edi 18047 .LBB2_557: # =>This Inner Loop Header: Depth=1 18048 movups xmm2, xmmword ptr [rcx + 4*rdi] 18049 movups xmm3, xmmword ptr [rcx + 4*rdi + 16] 18050 mulps xmm2, xmm1 18051 mulps xmm3, xmm1 18052 movups xmmword ptr [r8 + 4*rdi], xmm2 18053 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 18054 movups xmm2, xmmword ptr [rcx + 4*rdi + 32] 18055 movups xmm3, xmmword ptr [rcx + 4*rdi + 48] 18056 mulps xmm2, xmm1 18057 mulps xmm3, xmm1 18058 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 18059 movups xmmword ptr [r8 + 4*rdi + 48], xmm3 18060 add 
rdi, 16 18061 add rsi, 2 18062 jne .LBB2_557 18063 jmp .LBB2_894 18064 .LBB2_558: 18065 mov edx, eax 18066 and edx, -8 18067 movaps xmm1, xmm0 18068 shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0] 18069 lea rsi, [rdx - 8] 18070 mov r9, rsi 18071 shr r9, 3 18072 add r9, 1 18073 test rsi, rsi 18074 je .LBB2_901 18075 # %bb.559: 18076 mov rsi, r9 18077 and rsi, -2 18078 neg rsi 18079 xor edi, edi 18080 .LBB2_560: # =>This Inner Loop Header: Depth=1 18081 movups xmm2, xmmword ptr [rcx + 4*rdi] 18082 movups xmm3, xmmword ptr [rcx + 4*rdi + 16] 18083 mulps xmm2, xmm1 18084 mulps xmm3, xmm1 18085 movups xmmword ptr [r8 + 4*rdi], xmm2 18086 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 18087 movups xmm2, xmmword ptr [rcx + 4*rdi + 32] 18088 movups xmm3, xmmword ptr [rcx + 4*rdi + 48] 18089 mulps xmm2, xmm1 18090 mulps xmm3, xmm1 18091 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 18092 movups xmmword ptr [r8 + 4*rdi + 48], xmm3 18093 add rdi, 16 18094 add rsi, 2 18095 jne .LBB2_560 18096 jmp .LBB2_902 18097 .LBB2_561: 18098 mov esi, r10d 18099 and esi, -4 18100 movq xmm0, rax 18101 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 18102 lea rdx, [rsi - 4] 18103 mov r9, rdx 18104 shr r9, 2 18105 add r9, 1 18106 test rdx, rdx 18107 je .LBB2_909 18108 # %bb.562: 18109 mov rdx, r9 18110 and rdx, -2 18111 neg rdx 18112 xor edi, edi 18113 .LBB2_563: # =>This Inner Loop Header: Depth=1 18114 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 18115 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 18116 paddq xmm1, xmm0 18117 paddq xmm2, xmm0 18118 movdqu xmmword ptr [r8 + 8*rdi], xmm1 18119 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 18120 movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32] 18121 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48] 18122 paddq xmm1, xmm0 18123 paddq xmm2, xmm0 18124 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1 18125 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2 18126 add rdi, 8 18127 add rdx, 2 18128 jne .LBB2_563 18129 jmp .LBB2_910 18130 .LBB2_564: 18131 mov edx, eax 18132 and edx, -8 18133 
movaps xmm1, xmm0 18134 shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0] 18135 lea rsi, [rdx - 8] 18136 mov r9, rsi 18137 shr r9, 3 18138 add r9, 1 18139 test rsi, rsi 18140 je .LBB2_917 18141 # %bb.565: 18142 mov rsi, r9 18143 and rsi, -2 18144 neg rsi 18145 xor edi, edi 18146 .LBB2_566: # =>This Inner Loop Header: Depth=1 18147 movups xmm2, xmmword ptr [rcx + 4*rdi] 18148 movups xmm3, xmmword ptr [rcx + 4*rdi + 16] 18149 addps xmm2, xmm1 18150 addps xmm3, xmm1 18151 movups xmmword ptr [r8 + 4*rdi], xmm2 18152 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 18153 movups xmm2, xmmword ptr [rcx + 4*rdi + 32] 18154 movups xmm3, xmmword ptr [rcx + 4*rdi + 48] 18155 addps xmm2, xmm1 18156 addps xmm3, xmm1 18157 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 18158 movups xmmword ptr [r8 + 4*rdi + 48], xmm3 18159 add rdi, 16 18160 add rsi, 2 18161 jne .LBB2_566 18162 jmp .LBB2_918 18163 .LBB2_567: 18164 mov esi, r10d 18165 and esi, -4 18166 movq xmm0, r11 18167 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 18168 lea rdx, [rsi - 4] 18169 mov r9, rdx 18170 shr r9, 2 18171 add r9, 1 18172 test rdx, rdx 18173 je .LBB2_925 18174 # %bb.568: 18175 mov rdx, r9 18176 and rdx, -2 18177 neg rdx 18178 xor edi, edi 18179 .LBB2_569: # =>This Inner Loop Header: Depth=1 18180 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 18181 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 18182 movdqa xmm3, xmm0 18183 psubq xmm3, xmm1 18184 movdqa xmm1, xmm0 18185 psubq xmm1, xmm2 18186 movdqu xmmword ptr [r8 + 8*rdi], xmm3 18187 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1 18188 movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32] 18189 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48] 18190 movdqa xmm3, xmm0 18191 psubq xmm3, xmm1 18192 movdqa xmm1, xmm0 18193 psubq xmm1, xmm2 18194 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm3 18195 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1 18196 add rdi, 8 18197 add rdx, 2 18198 jne .LBB2_569 18199 jmp .LBB2_926 18200 .LBB2_570: 18201 mov edx, eax 18202 and edx, -8 18203 movaps xmm1, xmm0 18204 shufps 
xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0] 18205 lea rsi, [rdx - 8] 18206 mov r9, rsi 18207 shr r9, 3 18208 add r9, 1 18209 test rsi, rsi 18210 je .LBB2_933 18211 # %bb.571: 18212 mov rsi, r9 18213 and rsi, -2 18214 neg rsi 18215 xor edi, edi 18216 .LBB2_572: # =>This Inner Loop Header: Depth=1 18217 movups xmm2, xmmword ptr [rcx + 4*rdi] 18218 movups xmm3, xmmword ptr [rcx + 4*rdi + 16] 18219 movaps xmm4, xmm1 18220 subps xmm4, xmm2 18221 movaps xmm2, xmm1 18222 subps xmm2, xmm3 18223 movups xmmword ptr [r8 + 4*rdi], xmm4 18224 movups xmmword ptr [r8 + 4*rdi + 16], xmm2 18225 movups xmm2, xmmword ptr [rcx + 4*rdi + 32] 18226 movups xmm3, xmmword ptr [rcx + 4*rdi + 48] 18227 movaps xmm4, xmm1 18228 subps xmm4, xmm2 18229 movaps xmm2, xmm1 18230 subps xmm2, xmm3 18231 movups xmmword ptr [r8 + 4*rdi + 32], xmm4 18232 movups xmmword ptr [r8 + 4*rdi + 48], xmm2 18233 add rdi, 16 18234 add rsi, 2 18235 jne .LBB2_572 18236 jmp .LBB2_934 18237 .LBB2_573: 18238 mov esi, r10d 18239 and esi, -4 18240 movq xmm0, rax 18241 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 18242 lea rdx, [rsi - 4] 18243 mov r9, rdx 18244 shr r9, 2 18245 add r9, 1 18246 test rdx, rdx 18247 je .LBB2_941 18248 # %bb.574: 18249 mov rdx, r9 18250 and rdx, -2 18251 neg rdx 18252 xor edi, edi 18253 .LBB2_575: # =>This Inner Loop Header: Depth=1 18254 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 18255 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 18256 paddq xmm1, xmm0 18257 paddq xmm2, xmm0 18258 movdqu xmmword ptr [r8 + 8*rdi], xmm1 18259 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 18260 movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32] 18261 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48] 18262 paddq xmm1, xmm0 18263 paddq xmm2, xmm0 18264 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1 18265 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2 18266 add rdi, 8 18267 add rdx, 2 18268 jne .LBB2_575 18269 jmp .LBB2_942 18270 .LBB2_576: 18271 mov edx, eax 18272 and edx, -8 18273 movaps xmm1, xmm0 18274 shufps xmm1, xmm0, 0 # xmm1 = 
xmm1[0,0],xmm0[0,0] 18275 lea rsi, [rdx - 8] 18276 mov r9, rsi 18277 shr r9, 3 18278 add r9, 1 18279 test rsi, rsi 18280 je .LBB2_949 18281 # %bb.577: 18282 mov rsi, r9 18283 and rsi, -2 18284 neg rsi 18285 xor edi, edi 18286 .LBB2_578: # =>This Inner Loop Header: Depth=1 18287 movups xmm2, xmmword ptr [rcx + 4*rdi] 18288 movups xmm3, xmmword ptr [rcx + 4*rdi + 16] 18289 addps xmm2, xmm1 18290 addps xmm3, xmm1 18291 movups xmmword ptr [r8 + 4*rdi], xmm2 18292 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 18293 movups xmm2, xmmword ptr [rcx + 4*rdi + 32] 18294 movups xmm3, xmmword ptr [rcx + 4*rdi + 48] 18295 addps xmm2, xmm1 18296 addps xmm3, xmm1 18297 movups xmmword ptr [r8 + 4*rdi + 32], xmm2 18298 movups xmmword ptr [r8 + 4*rdi + 48], xmm3 18299 add rdi, 16 18300 add rsi, 2 18301 jne .LBB2_578 18302 jmp .LBB2_950 18303 .LBB2_579: 18304 mov esi, r10d 18305 and esi, -4 18306 movq xmm0, r11 18307 pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1] 18308 lea rdx, [rsi - 4] 18309 mov r9, rdx 18310 shr r9, 2 18311 add r9, 1 18312 test rdx, rdx 18313 je .LBB2_957 18314 # %bb.580: 18315 mov rdx, r9 18316 and rdx, -2 18317 neg rdx 18318 xor edi, edi 18319 .LBB2_581: # =>This Inner Loop Header: Depth=1 18320 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 18321 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 18322 movdqa xmm3, xmm0 18323 psubq xmm3, xmm1 18324 movdqa xmm1, xmm0 18325 psubq xmm1, xmm2 18326 movdqu xmmword ptr [r8 + 8*rdi], xmm3 18327 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1 18328 movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32] 18329 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48] 18330 movdqa xmm3, xmm0 18331 psubq xmm3, xmm1 18332 movdqa xmm1, xmm0 18333 psubq xmm1, xmm2 18334 movdqu xmmword ptr [r8 + 8*rdi + 32], xmm3 18335 movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1 18336 add rdi, 8 18337 add rdx, 2 18338 jne .LBB2_581 18339 jmp .LBB2_958 18340 .LBB2_582: 18341 mov edx, eax 18342 and edx, -8 18343 movaps xmm1, xmm0 18344 shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0] 18345 lea 
rsi, [rdx - 8] 18346 mov r9, rsi 18347 shr r9, 3 18348 add r9, 1 18349 test rsi, rsi 18350 je .LBB2_965 18351 # %bb.583: 18352 mov rsi, r9 18353 and rsi, -2 18354 neg rsi 18355 xor edi, edi 18356 .LBB2_584: # =>This Inner Loop Header: Depth=1 18357 movups xmm2, xmmword ptr [rcx + 4*rdi] 18358 movups xmm3, xmmword ptr [rcx + 4*rdi + 16] 18359 movaps xmm4, xmm1 18360 subps xmm4, xmm2 18361 movaps xmm2, xmm1 18362 subps xmm2, xmm3 18363 movups xmmword ptr [r8 + 4*rdi], xmm4 18364 movups xmmword ptr [r8 + 4*rdi + 16], xmm2 18365 movups xmm2, xmmword ptr [rcx + 4*rdi + 32] 18366 movups xmm3, xmmword ptr [rcx + 4*rdi + 48] 18367 movaps xmm4, xmm1 18368 subps xmm4, xmm2 18369 movaps xmm2, xmm1 18370 subps xmm2, xmm3 18371 movups xmmword ptr [r8 + 4*rdi + 32], xmm4 18372 movups xmmword ptr [r8 + 4*rdi + 48], xmm2 18373 add rdi, 16 18374 add rsi, 2 18375 jne .LBB2_584 18376 jmp .LBB2_966 18377 .LBB2_585: 18378 mov edi, r10d 18379 and edi, -32 18380 movzx eax, dl 18381 movd xmm0, eax 18382 pxor xmm1, xmm1 18383 pshufb xmm0, xmm1 18384 lea rax, [rdi - 32] 18385 mov r9, rax 18386 shr r9, 5 18387 add r9, 1 18388 pmovzxbw xmm1, xmm0 # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 18389 test rax, rax 18390 je .LBB2_973 18391 # %bb.586: 18392 mov rsi, r9 18393 and rsi, -2 18394 neg rsi 18395 xor eax, eax 18396 movdqa xmm2, xmm0 18397 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 18398 movdqa xmm3, xmmword ptr [rip + .LCPI2_0] # xmm3 = [255,255,255,255,255,255,255,255] 18399 movdqa xmm4, xmm0 18400 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 18401 .LBB2_587: # =>This Inner Loop Header: Depth=1 18402 movdqu xmm5, xmmword ptr [rcx + rax] 18403 movdqu xmm6, xmmword ptr [rcx + rax + 16] 18404 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 18405 punpckhbw xmm5, xmm5 # 
xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 18406 pmullw xmm5, xmm2 18407 pand xmm5, xmm3 18408 pmullw xmm7, xmm1 18409 pand xmm7, xmm3 18410 packuswb xmm7, xmm5 18411 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 18412 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 18413 pmullw xmm6, xmm4 18414 pand xmm6, xmm3 18415 pmullw xmm5, xmm1 18416 pand xmm5, xmm3 18417 packuswb xmm5, xmm6 18418 movdqu xmmword ptr [r8 + rax], xmm7 18419 movdqu xmmword ptr [r8 + rax + 16], xmm5 18420 movdqu xmm5, xmmword ptr [rcx + rax + 32] 18421 movdqu xmm6, xmmword ptr [rcx + rax + 48] 18422 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 18423 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 18424 pmullw xmm5, xmm2 18425 pand xmm5, xmm3 18426 pmullw xmm7, xmm1 18427 pand xmm7, xmm3 18428 packuswb xmm7, xmm5 18429 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 18430 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 18431 pmullw xmm6, xmm4 18432 pand xmm6, xmm3 18433 pmullw xmm5, xmm1 18434 pand xmm5, xmm3 18435 packuswb xmm5, xmm6 18436 movdqu xmmword ptr [r8 + rax + 32], xmm7 18437 movdqu xmmword ptr [r8 + rax + 48], xmm5 18438 add rax, 64 18439 add rsi, 2 18440 jne .LBB2_587 18441 jmp .LBB2_974 18442 .LBB2_588: 18443 mov edi, r10d 18444 and edi, -32 18445 movzx eax, dl 18446 movd xmm0, eax 18447 pxor xmm1, xmm1 18448 pshufb xmm0, xmm1 18449 lea rax, [rdi - 32] 18450 mov r9, rax 18451 shr r9, 5 18452 add r9, 1 18453 pmovzxbw xmm1, xmm0 # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 18454 test rax, rax 18455 je .LBB2_981 18456 # %bb.589: 18457 mov rsi, r9 18458 and 
rsi, -2 18459 neg rsi 18460 xor eax, eax 18461 movdqa xmm2, xmm0 18462 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 18463 movdqa xmm3, xmmword ptr [rip + .LCPI2_0] # xmm3 = [255,255,255,255,255,255,255,255] 18464 movdqa xmm4, xmm0 18465 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 18466 .LBB2_590: # =>This Inner Loop Header: Depth=1 18467 movdqu xmm5, xmmword ptr [rcx + rax] 18468 movdqu xmm6, xmmword ptr [rcx + rax + 16] 18469 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 18470 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 18471 pmullw xmm5, xmm2 18472 pand xmm5, xmm3 18473 pmullw xmm7, xmm1 18474 pand xmm7, xmm3 18475 packuswb xmm7, xmm5 18476 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 18477 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 18478 pmullw xmm6, xmm4 18479 pand xmm6, xmm3 18480 pmullw xmm5, xmm1 18481 pand xmm5, xmm3 18482 packuswb xmm5, xmm6 18483 movdqu xmmword ptr [r8 + rax], xmm7 18484 movdqu xmmword ptr [r8 + rax + 16], xmm5 18485 movdqu xmm5, xmmword ptr [rcx + rax + 32] 18486 movdqu xmm6, xmmword ptr [rcx + rax + 48] 18487 pmovzxbw xmm7, xmm5 # xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 18488 punpckhbw xmm5, xmm5 # xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 18489 pmullw xmm5, xmm2 18490 pand xmm5, xmm3 18491 pmullw xmm7, xmm1 18492 pand xmm7, xmm3 18493 packuswb xmm7, xmm5 18494 pmovzxbw xmm5, xmm6 # xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 18495 punpckhbw xmm6, xmm6 # xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 18496 pmullw xmm6, xmm4 18497 pand xmm6, xmm3 18498 pmullw xmm5, 
xmm1 18499 pand xmm5, xmm3 18500 packuswb xmm5, xmm6 18501 movdqu xmmword ptr [r8 + rax + 32], xmm7 18502 movdqu xmmword ptr [r8 + rax + 48], xmm5 18503 add rax, 64 18504 add rsi, 2 18505 jne .LBB2_590 18506 jmp .LBB2_982 18507 .LBB2_591: 18508 mov esi, r10d 18509 and esi, -32 18510 movzx edx, al 18511 movd xmm0, edx 18512 pxor xmm1, xmm1 18513 pshufb xmm0, xmm1 18514 lea rdx, [rsi - 32] 18515 mov r9, rdx 18516 shr r9, 5 18517 add r9, 1 18518 test rdx, rdx 18519 je .LBB2_989 18520 # %bb.592: 18521 mov rdx, r9 18522 and rdx, -2 18523 neg rdx 18524 xor edi, edi 18525 .LBB2_593: # =>This Inner Loop Header: Depth=1 18526 movdqu xmm1, xmmword ptr [rcx + rdi] 18527 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 18528 paddb xmm1, xmm0 18529 paddb xmm2, xmm0 18530 movdqu xmmword ptr [r8 + rdi], xmm1 18531 movdqu xmmword ptr [r8 + rdi + 16], xmm2 18532 movdqu xmm1, xmmword ptr [rcx + rdi + 32] 18533 movdqu xmm2, xmmword ptr [rcx + rdi + 48] 18534 paddb xmm1, xmm0 18535 paddb xmm2, xmm0 18536 movdqu xmmword ptr [r8 + rdi + 32], xmm1 18537 movdqu xmmword ptr [r8 + rdi + 48], xmm2 18538 add rdi, 64 18539 add rdx, 2 18540 jne .LBB2_593 18541 jmp .LBB2_990 18542 .LBB2_594: 18543 mov esi, r10d 18544 and esi, -32 18545 movzx edx, r11b 18546 movd xmm0, edx 18547 pxor xmm1, xmm1 18548 pshufb xmm0, xmm1 18549 lea rdx, [rsi - 32] 18550 mov r9, rdx 18551 shr r9, 5 18552 add r9, 1 18553 test rdx, rdx 18554 je .LBB2_997 18555 # %bb.595: 18556 mov rdx, r9 18557 and rdx, -2 18558 neg rdx 18559 xor edi, edi 18560 .LBB2_596: # =>This Inner Loop Header: Depth=1 18561 movdqu xmm1, xmmword ptr [rcx + rdi] 18562 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 18563 movdqa xmm3, xmm0 18564 psubb xmm3, xmm1 18565 movdqa xmm1, xmm0 18566 psubb xmm1, xmm2 18567 movdqu xmmword ptr [r8 + rdi], xmm3 18568 movdqu xmmword ptr [r8 + rdi + 16], xmm1 18569 movdqu xmm1, xmmword ptr [rcx + rdi + 32] 18570 movdqu xmm2, xmmword ptr [rcx + rdi + 48] 18571 movdqa xmm3, xmm0 18572 psubb xmm3, xmm1 18573 movdqa xmm1, xmm0 
18574 psubb xmm1, xmm2 18575 movdqu xmmword ptr [r8 + rdi + 32], xmm3 18576 movdqu xmmword ptr [r8 + rdi + 48], xmm1 18577 add rdi, 64 18578 add rdx, 2 18579 jne .LBB2_596 18580 jmp .LBB2_998 18581 .LBB2_597: 18582 mov esi, r10d 18583 and esi, -32 18584 movzx edx, al 18585 movd xmm0, edx 18586 pxor xmm1, xmm1 18587 pshufb xmm0, xmm1 18588 lea rdx, [rsi - 32] 18589 mov r9, rdx 18590 shr r9, 5 18591 add r9, 1 18592 test rdx, rdx 18593 je .LBB2_1005 18594 # %bb.598: 18595 mov rdx, r9 18596 and rdx, -2 18597 neg rdx 18598 xor edi, edi 18599 .LBB2_599: # =>This Inner Loop Header: Depth=1 18600 movdqu xmm1, xmmword ptr [rcx + rdi] 18601 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 18602 paddb xmm1, xmm0 18603 paddb xmm2, xmm0 18604 movdqu xmmword ptr [r8 + rdi], xmm1 18605 movdqu xmmword ptr [r8 + rdi + 16], xmm2 18606 movdqu xmm1, xmmword ptr [rcx + rdi + 32] 18607 movdqu xmm2, xmmword ptr [rcx + rdi + 48] 18608 paddb xmm1, xmm0 18609 paddb xmm2, xmm0 18610 movdqu xmmword ptr [r8 + rdi + 32], xmm1 18611 movdqu xmmword ptr [r8 + rdi + 48], xmm2 18612 add rdi, 64 18613 add rdx, 2 18614 jne .LBB2_599 18615 jmp .LBB2_1006 18616 .LBB2_600: 18617 mov esi, r10d 18618 and esi, -32 18619 movzx edx, r11b 18620 movd xmm0, edx 18621 pxor xmm1, xmm1 18622 pshufb xmm0, xmm1 18623 lea rdx, [rsi - 32] 18624 mov r9, rdx 18625 shr r9, 5 18626 add r9, 1 18627 test rdx, rdx 18628 je .LBB2_1013 18629 # %bb.601: 18630 mov rdx, r9 18631 and rdx, -2 18632 neg rdx 18633 xor edi, edi 18634 .LBB2_602: # =>This Inner Loop Header: Depth=1 18635 movdqu xmm1, xmmword ptr [rcx + rdi] 18636 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 18637 movdqa xmm3, xmm0 18638 psubb xmm3, xmm1 18639 movdqa xmm1, xmm0 18640 psubb xmm1, xmm2 18641 movdqu xmmword ptr [r8 + rdi], xmm3 18642 movdqu xmmword ptr [r8 + rdi + 16], xmm1 18643 movdqu xmm1, xmmword ptr [rcx + rdi + 32] 18644 movdqu xmm2, xmmword ptr [rcx + rdi + 48] 18645 movdqa xmm3, xmm0 18646 psubb xmm3, xmm1 18647 movdqa xmm1, xmm0 18648 psubb xmm1, xmm2 18649 
movdqu xmmword ptr [r8 + rdi + 32], xmm3 18650 movdqu xmmword ptr [r8 + rdi + 48], xmm1 18651 add rdi, 64 18652 add rdx, 2 18653 jne .LBB2_602 18654 jmp .LBB2_1014 18655 .LBB2_603: 18656 mov esi, r10d 18657 and esi, -8 18658 movd xmm0, eax 18659 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 18660 lea rdx, [rsi - 8] 18661 mov r9, rdx 18662 shr r9, 3 18663 add r9, 1 18664 test rdx, rdx 18665 je .LBB2_1021 18666 # %bb.604: 18667 mov rdx, r9 18668 and rdx, -2 18669 neg rdx 18670 xor edi, edi 18671 .LBB2_605: # =>This Inner Loop Header: Depth=1 18672 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 18673 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 18674 pmulld xmm1, xmm0 18675 pmulld xmm2, xmm0 18676 movdqu xmmword ptr [r8 + 4*rdi], xmm1 18677 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 18678 movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32] 18679 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 18680 pmulld xmm1, xmm0 18681 pmulld xmm2, xmm0 18682 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 18683 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 18684 add rdi, 16 18685 add rdx, 2 18686 jne .LBB2_605 18687 jmp .LBB2_1022 18688 .LBB2_606: 18689 mov esi, r10d 18690 and esi, -8 18691 movd xmm0, eax 18692 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 18693 lea rdx, [rsi - 8] 18694 mov r9, rdx 18695 shr r9, 3 18696 add r9, 1 18697 test rdx, rdx 18698 je .LBB2_1029 18699 # %bb.607: 18700 mov rdx, r9 18701 and rdx, -2 18702 neg rdx 18703 xor edi, edi 18704 .LBB2_608: # =>This Inner Loop Header: Depth=1 18705 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 18706 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 18707 pmulld xmm1, xmm0 18708 pmulld xmm2, xmm0 18709 movdqu xmmword ptr [r8 + 4*rdi], xmm1 18710 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 18711 movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32] 18712 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 18713 pmulld xmm1, xmm0 18714 pmulld xmm2, xmm0 18715 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 18716 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 18717 add rdi, 16 18718 add rdx, 2 
18719 jne .LBB2_608 18720 jmp .LBB2_1030 18721 .LBB2_609: 18722 mov esi, r10d 18723 and esi, -8 18724 movd xmm0, eax 18725 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 18726 lea rdx, [rsi - 8] 18727 mov r9, rdx 18728 shr r9, 3 18729 add r9, 1 18730 test rdx, rdx 18731 je .LBB2_1037 18732 # %bb.610: 18733 mov rdx, r9 18734 and rdx, -2 18735 neg rdx 18736 xor edi, edi 18737 .LBB2_611: # =>This Inner Loop Header: Depth=1 18738 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 18739 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 18740 paddd xmm1, xmm0 18741 paddd xmm2, xmm0 18742 movdqu xmmword ptr [r8 + 4*rdi], xmm1 18743 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 18744 movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32] 18745 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 18746 paddd xmm1, xmm0 18747 paddd xmm2, xmm0 18748 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 18749 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 18750 add rdi, 16 18751 add rdx, 2 18752 jne .LBB2_611 18753 jmp .LBB2_1038 18754 .LBB2_612: 18755 mov esi, r10d 18756 and esi, -8 18757 movd xmm0, r11d 18758 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 18759 lea rdx, [rsi - 8] 18760 mov r9, rdx 18761 shr r9, 3 18762 add r9, 1 18763 test rdx, rdx 18764 je .LBB2_1045 18765 # %bb.613: 18766 mov rdx, r9 18767 and rdx, -2 18768 neg rdx 18769 xor edi, edi 18770 .LBB2_614: # =>This Inner Loop Header: Depth=1 18771 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 18772 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 18773 movdqa xmm3, xmm0 18774 psubd xmm3, xmm1 18775 movdqa xmm1, xmm0 18776 psubd xmm1, xmm2 18777 movdqu xmmword ptr [r8 + 4*rdi], xmm3 18778 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1 18779 movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32] 18780 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 18781 movdqa xmm3, xmm0 18782 psubd xmm3, xmm1 18783 movdqa xmm1, xmm0 18784 psubd xmm1, xmm2 18785 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm3 18786 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1 18787 add rdi, 16 18788 add rdx, 2 18789 jne .LBB2_614 18790 jmp 
.LBB2_1046 18791 .LBB2_615: 18792 mov esi, r10d 18793 and esi, -8 18794 movd xmm0, eax 18795 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 18796 lea rdx, [rsi - 8] 18797 mov r9, rdx 18798 shr r9, 3 18799 add r9, 1 18800 test rdx, rdx 18801 je .LBB2_1053 18802 # %bb.616: 18803 mov rdx, r9 18804 and rdx, -2 18805 neg rdx 18806 xor edi, edi 18807 .LBB2_617: # =>This Inner Loop Header: Depth=1 18808 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 18809 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 18810 paddd xmm1, xmm0 18811 paddd xmm2, xmm0 18812 movdqu xmmword ptr [r8 + 4*rdi], xmm1 18813 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 18814 movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32] 18815 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 18816 paddd xmm1, xmm0 18817 paddd xmm2, xmm0 18818 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1 18819 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2 18820 add rdi, 16 18821 add rdx, 2 18822 jne .LBB2_617 18823 jmp .LBB2_1054 18824 .LBB2_618: 18825 mov esi, r10d 18826 and esi, -8 18827 movd xmm0, r11d 18828 pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0] 18829 lea rdx, [rsi - 8] 18830 mov r9, rdx 18831 shr r9, 3 18832 add r9, 1 18833 test rdx, rdx 18834 je .LBB2_1061 18835 # %bb.619: 18836 mov rdx, r9 18837 and rdx, -2 18838 neg rdx 18839 xor edi, edi 18840 .LBB2_620: # =>This Inner Loop Header: Depth=1 18841 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 18842 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 18843 movdqa xmm3, xmm0 18844 psubd xmm3, xmm1 18845 movdqa xmm1, xmm0 18846 psubd xmm1, xmm2 18847 movdqu xmmword ptr [r8 + 4*rdi], xmm3 18848 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1 18849 movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32] 18850 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48] 18851 movdqa xmm3, xmm0 18852 psubd xmm3, xmm1 18853 movdqa xmm1, xmm0 18854 psubd xmm1, xmm2 18855 movdqu xmmword ptr [r8 + 4*rdi + 32], xmm3 18856 movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1 18857 add rdi, 16 18858 add rdx, 2 18859 jne .LBB2_620 18860 jmp .LBB2_1062 18861 .LBB2_621: 18862 
xor edi, edi 18863 .LBB2_622: 18864 test r9b, 1 18865 je .LBB2_624 18866 # %bb.623: 18867 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 18868 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 18869 pmulld xmm1, xmm0 18870 pmulld xmm2, xmm0 18871 movdqu xmmword ptr [r8 + 4*rdi], xmm1 18872 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 18873 .LBB2_624: 18874 cmp rsi, r10 18875 je .LBB2_1069 18876 jmp .LBB2_625 18877 .LBB2_629: 18878 xor edi, edi 18879 .LBB2_630: 18880 test r9b, 1 18881 je .LBB2_632 18882 # %bb.631: 18883 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 18884 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 18885 pmulld xmm1, xmm0 18886 pmulld xmm2, xmm0 18887 movdqu xmmword ptr [r8 + 4*rdi], xmm1 18888 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 18889 .LBB2_632: 18890 cmp rsi, r10 18891 je .LBB2_1069 18892 jmp .LBB2_633 18893 .LBB2_637: 18894 xor edi, edi 18895 .LBB2_638: 18896 test r9b, 1 18897 je .LBB2_640 18898 # %bb.639: 18899 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 18900 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 18901 paddd xmm1, xmm0 18902 paddd xmm2, xmm0 18903 movdqu xmmword ptr [r8 + 4*rdi], xmm1 18904 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 18905 .LBB2_640: 18906 cmp rsi, r10 18907 je .LBB2_1069 18908 jmp .LBB2_641 18909 .LBB2_645: 18910 xor edi, edi 18911 .LBB2_646: 18912 test r9b, 1 18913 je .LBB2_648 18914 # %bb.647: 18915 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 18916 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 18917 movdqa xmm3, xmm0 18918 psubd xmm3, xmm1 18919 psubd xmm0, xmm2 18920 movdqu xmmword ptr [r8 + 4*rdi], xmm3 18921 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 18922 .LBB2_648: 18923 cmp rsi, r10 18924 je .LBB2_1069 18925 jmp .LBB2_649 18926 .LBB2_653: 18927 xor edi, edi 18928 .LBB2_654: 18929 test r9b, 1 18930 je .LBB2_656 18931 # %bb.655: 18932 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 18933 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 18934 paddd xmm1, xmm0 18935 paddd xmm2, xmm0 18936 movdqu xmmword ptr [r8 + 4*rdi], xmm1 18937 movdqu xmmword ptr [r8 + 
4*rdi + 16], xmm2 18938 .LBB2_656: 18939 cmp rsi, r10 18940 je .LBB2_1069 18941 jmp .LBB2_657 18942 .LBB2_661: 18943 xor edi, edi 18944 .LBB2_662: 18945 test r9b, 1 18946 je .LBB2_664 18947 # %bb.663: 18948 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 18949 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 18950 movdqa xmm3, xmm0 18951 psubd xmm3, xmm1 18952 psubd xmm0, xmm2 18953 movdqu xmmword ptr [r8 + 4*rdi], xmm3 18954 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 18955 .LBB2_664: 18956 cmp rsi, r10 18957 je .LBB2_1069 18958 jmp .LBB2_665 18959 .LBB2_669: 18960 xor edi, edi 18961 .LBB2_670: 18962 test r9b, 1 18963 je .LBB2_672 18964 # %bb.671: 18965 movupd xmm2, xmmword ptr [rcx + 8*rdi] 18966 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 18967 mulpd xmm2, xmm1 18968 mulpd xmm3, xmm1 18969 movupd xmmword ptr [r8 + 8*rdi], xmm2 18970 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 18971 .LBB2_672: 18972 cmp rdx, rax 18973 je .LBB2_1069 18974 jmp .LBB2_673 18975 .LBB2_677: 18976 xor edi, edi 18977 .LBB2_678: 18978 test r9b, 1 18979 je .LBB2_680 18980 # %bb.679: 18981 movupd xmm2, xmmword ptr [rcx + 8*rdi] 18982 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 18983 mulpd xmm2, xmm1 18984 mulpd xmm3, xmm1 18985 movupd xmmword ptr [r8 + 8*rdi], xmm2 18986 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 18987 .LBB2_680: 18988 cmp rdx, rax 18989 je .LBB2_1069 18990 jmp .LBB2_681 18991 .LBB2_685: 18992 xor edi, edi 18993 .LBB2_686: 18994 test r9b, 1 18995 je .LBB2_688 18996 # %bb.687: 18997 movupd xmm2, xmmword ptr [rcx + 8*rdi] 18998 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 18999 addpd xmm2, xmm1 19000 addpd xmm3, xmm1 19001 movupd xmmword ptr [r8 + 8*rdi], xmm2 19002 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 19003 .LBB2_688: 19004 cmp rdx, rax 19005 je .LBB2_1069 19006 jmp .LBB2_689 19007 .LBB2_693: 19008 xor edi, edi 19009 .LBB2_694: 19010 test r9b, 1 19011 je .LBB2_696 19012 # %bb.695: 19013 movupd xmm2, xmmword ptr [rcx + 8*rdi] 19014 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 19015 
movapd xmm4, xmm1 19016 subpd xmm4, xmm2 19017 subpd xmm1, xmm3 19018 movupd xmmword ptr [r8 + 8*rdi], xmm4 19019 movupd xmmword ptr [r8 + 8*rdi + 16], xmm1 19020 .LBB2_696: 19021 cmp rdx, rax 19022 je .LBB2_1069 19023 jmp .LBB2_697 19024 .LBB2_701: 19025 xor edi, edi 19026 .LBB2_702: 19027 test r9b, 1 19028 je .LBB2_704 19029 # %bb.703: 19030 movupd xmm2, xmmword ptr [rcx + 8*rdi] 19031 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 19032 addpd xmm2, xmm1 19033 addpd xmm3, xmm1 19034 movupd xmmword ptr [r8 + 8*rdi], xmm2 19035 movupd xmmword ptr [r8 + 8*rdi + 16], xmm3 19036 .LBB2_704: 19037 cmp rdx, rax 19038 je .LBB2_1069 19039 jmp .LBB2_705 19040 .LBB2_709: 19041 xor edi, edi 19042 .LBB2_710: 19043 test r9b, 1 19044 je .LBB2_712 19045 # %bb.711: 19046 movupd xmm2, xmmword ptr [rcx + 8*rdi] 19047 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 19048 movapd xmm4, xmm1 19049 subpd xmm4, xmm2 19050 subpd xmm1, xmm3 19051 movupd xmmword ptr [r8 + 8*rdi], xmm4 19052 movupd xmmword ptr [r8 + 8*rdi + 16], xmm1 19053 .LBB2_712: 19054 cmp rdx, rax 19055 je .LBB2_1069 19056 jmp .LBB2_713 19057 .LBB2_717: 19058 xor eax, eax 19059 .LBB2_718: 19060 test r9b, 1 19061 je .LBB2_720 19062 # %bb.719: 19063 movdqu xmm2, xmmword ptr [rcx + rax] 19064 movdqu xmm3, xmmword ptr [rcx + rax + 16] 19065 movdqa xmm4, xmm0 19066 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19067 pmovzxbw xmm5, xmm2 # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 19068 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19069 pmullw xmm2, xmm4 19070 movdqa xmm4, xmmword ptr [rip + .LCPI2_0] # xmm4 = [255,255,255,255,255,255,255,255] 19071 pand xmm2, xmm4 19072 pmullw xmm5, xmm1 19073 pand xmm5, xmm4 19074 packuswb xmm5, xmm2 19075 punpckhbw xmm0, xmm0 # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19076 pmovzxbw xmm2, xmm3 # xmm2 = 
xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 19077 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19078 pmullw xmm3, xmm0 19079 pand xmm3, xmm4 19080 pmullw xmm2, xmm1 19081 pand xmm2, xmm4 19082 packuswb xmm2, xmm3 19083 movdqu xmmword ptr [r8 + rax], xmm5 19084 movdqu xmmword ptr [r8 + rax + 16], xmm2 19085 .LBB2_720: 19086 cmp rdi, r10 19087 je .LBB2_1069 19088 jmp .LBB2_721 19089 .LBB2_725: 19090 xor eax, eax 19091 .LBB2_726: 19092 test r9b, 1 19093 je .LBB2_728 19094 # %bb.727: 19095 movdqu xmm2, xmmword ptr [rcx + rax] 19096 movdqu xmm3, xmmword ptr [rcx + rax + 16] 19097 movdqa xmm4, xmm0 19098 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19099 pmovzxbw xmm5, xmm2 # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 19100 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19101 pmullw xmm2, xmm4 19102 movdqa xmm4, xmmword ptr [rip + .LCPI2_0] # xmm4 = [255,255,255,255,255,255,255,255] 19103 pand xmm2, xmm4 19104 pmullw xmm5, xmm1 19105 pand xmm5, xmm4 19106 packuswb xmm5, xmm2 19107 punpckhbw xmm0, xmm0 # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19108 pmovzxbw xmm2, xmm3 # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 19109 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19110 pmullw xmm3, xmm0 19111 pand xmm3, xmm4 19112 pmullw xmm2, xmm1 19113 pand xmm2, xmm4 19114 packuswb xmm2, xmm3 19115 movdqu xmmword ptr [r8 + rax], xmm5 19116 movdqu xmmword ptr [r8 + rax + 16], xmm2 19117 .LBB2_728: 19118 cmp rdi, r10 19119 je .LBB2_1069 19120 jmp .LBB2_729 19121 .LBB2_733: 19122 xor edi, edi 19123 .LBB2_734: 19124 test r9b, 1 19125 je .LBB2_736 19126 # %bb.735: 19127 movdqu xmm1, xmmword ptr [rcx + rdi] 19128 movdqu xmm2, xmmword ptr [rcx + 
rdi + 16] 19129 paddb xmm1, xmm0 19130 paddb xmm2, xmm0 19131 movdqu xmmword ptr [r8 + rdi], xmm1 19132 movdqu xmmword ptr [r8 + rdi + 16], xmm2 19133 .LBB2_736: 19134 cmp rsi, r10 19135 je .LBB2_1069 19136 jmp .LBB2_737 19137 .LBB2_741: 19138 xor edi, edi 19139 .LBB2_742: 19140 test r9b, 1 19141 je .LBB2_744 19142 # %bb.743: 19143 movdqu xmm1, xmmword ptr [rcx + rdi] 19144 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 19145 movdqa xmm3, xmm0 19146 psubb xmm3, xmm1 19147 psubb xmm0, xmm2 19148 movdqu xmmword ptr [r8 + rdi], xmm3 19149 movdqu xmmword ptr [r8 + rdi + 16], xmm0 19150 .LBB2_744: 19151 cmp rsi, r10 19152 je .LBB2_1069 19153 jmp .LBB2_745 19154 .LBB2_749: 19155 xor edi, edi 19156 .LBB2_750: 19157 test r9b, 1 19158 je .LBB2_752 19159 # %bb.751: 19160 movdqu xmm1, xmmword ptr [rcx + rdi] 19161 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 19162 paddb xmm1, xmm0 19163 paddb xmm2, xmm0 19164 movdqu xmmword ptr [r8 + rdi], xmm1 19165 movdqu xmmword ptr [r8 + rdi + 16], xmm2 19166 .LBB2_752: 19167 cmp rsi, r10 19168 je .LBB2_1069 19169 jmp .LBB2_753 19170 .LBB2_757: 19171 xor edi, edi 19172 .LBB2_758: 19173 test r9b, 1 19174 je .LBB2_760 19175 # %bb.759: 19176 movdqu xmm1, xmmword ptr [rcx + rdi] 19177 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 19178 movdqa xmm3, xmm0 19179 psubb xmm3, xmm1 19180 psubb xmm0, xmm2 19181 movdqu xmmword ptr [r8 + rdi], xmm3 19182 movdqu xmmword ptr [r8 + rdi + 16], xmm0 19183 .LBB2_760: 19184 cmp rsi, r10 19185 je .LBB2_1069 19186 jmp .LBB2_761 19187 .LBB2_765: 19188 xor edi, edi 19189 .LBB2_766: 19190 test r9b, 1 19191 je .LBB2_768 19192 # %bb.767: 19193 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 19194 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 19195 paddq xmm1, xmm0 19196 paddq xmm2, xmm0 19197 movdqu xmmword ptr [r8 + 8*rdi], xmm1 19198 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 19199 .LBB2_768: 19200 cmp rsi, r10 19201 je .LBB2_1069 19202 jmp .LBB2_769 19203 .LBB2_773: 19204 xor edi, edi 19205 .LBB2_774: 19206 test r9b, 1 19207 je 
.LBB2_776 19208 # %bb.775: 19209 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 19210 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 19211 movdqa xmm3, xmm0 19212 psubq xmm3, xmm1 19213 psubq xmm0, xmm2 19214 movdqu xmmword ptr [r8 + 8*rdi], xmm3 19215 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0 19216 .LBB2_776: 19217 cmp rsi, r10 19218 je .LBB2_1069 19219 jmp .LBB2_777 19220 .LBB2_781: 19221 xor edi, edi 19222 .LBB2_782: 19223 test r9b, 1 19224 je .LBB2_784 19225 # %bb.783: 19226 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 19227 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 19228 paddq xmm1, xmm0 19229 paddq xmm2, xmm0 19230 movdqu xmmword ptr [r8 + 8*rdi], xmm1 19231 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 19232 .LBB2_784: 19233 cmp rsi, r10 19234 je .LBB2_1069 19235 jmp .LBB2_785 19236 .LBB2_789: 19237 xor edi, edi 19238 .LBB2_790: 19239 test r9b, 1 19240 je .LBB2_792 19241 # %bb.791: 19242 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 19243 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 19244 movdqa xmm3, xmm0 19245 psubq xmm3, xmm1 19246 psubq xmm0, xmm2 19247 movdqu xmmword ptr [r8 + 8*rdi], xmm3 19248 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0 19249 .LBB2_792: 19250 cmp rsi, r10 19251 je .LBB2_1069 19252 jmp .LBB2_793 19253 .LBB2_797: 19254 xor edi, edi 19255 .LBB2_798: 19256 test r9b, 1 19257 je .LBB2_800 19258 # %bb.799: 19259 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 19260 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 19261 pmullw xmm1, xmm0 19262 pmullw xmm2, xmm0 19263 movdqu xmmword ptr [r8 + 2*rdi], xmm1 19264 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 19265 .LBB2_800: 19266 cmp rsi, r10 19267 je .LBB2_1069 19268 jmp .LBB2_801 19269 .LBB2_805: 19270 xor edi, edi 19271 .LBB2_806: 19272 test r9b, 1 19273 je .LBB2_808 19274 # %bb.807: 19275 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 19276 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 19277 pmullw xmm1, xmm0 19278 pmullw xmm2, xmm0 19279 movdqu xmmword ptr [r8 + 2*rdi], xmm1 19280 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 19281 .LBB2_808: 
19282 cmp rsi, r10 19283 je .LBB2_1069 19284 jmp .LBB2_809 19285 .LBB2_813: 19286 xor edi, edi 19287 .LBB2_814: 19288 test r9b, 1 19289 je .LBB2_816 19290 # %bb.815: 19291 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 19292 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 19293 pmullw xmm1, xmm0 19294 pmullw xmm2, xmm0 19295 movdqu xmmword ptr [r8 + 2*rdi], xmm1 19296 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 19297 .LBB2_816: 19298 cmp rsi, r10 19299 je .LBB2_1069 19300 jmp .LBB2_817 19301 .LBB2_821: 19302 xor edi, edi 19303 .LBB2_822: 19304 test r9b, 1 19305 je .LBB2_824 19306 # %bb.823: 19307 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 19308 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 19309 pmullw xmm1, xmm0 19310 pmullw xmm2, xmm0 19311 movdqu xmmword ptr [r8 + 2*rdi], xmm1 19312 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 19313 .LBB2_824: 19314 cmp rsi, r10 19315 je .LBB2_1069 19316 jmp .LBB2_825 19317 .LBB2_829: 19318 xor edi, edi 19319 .LBB2_830: 19320 test r9b, 1 19321 je .LBB2_832 19322 # %bb.831: 19323 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 19324 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 19325 paddw xmm1, xmm0 19326 paddw xmm2, xmm0 19327 movdqu xmmword ptr [r8 + 2*rdi], xmm1 19328 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 19329 .LBB2_832: 19330 cmp rsi, r10 19331 je .LBB2_1069 19332 jmp .LBB2_833 19333 .LBB2_837: 19334 xor edi, edi 19335 .LBB2_838: 19336 test r9b, 1 19337 je .LBB2_840 19338 # %bb.839: 19339 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 19340 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 19341 paddw xmm1, xmm0 19342 paddw xmm2, xmm0 19343 movdqu xmmword ptr [r8 + 2*rdi], xmm1 19344 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 19345 .LBB2_840: 19346 cmp rsi, r10 19347 je .LBB2_1069 19348 jmp .LBB2_841 19349 .LBB2_845: 19350 xor edi, edi 19351 .LBB2_846: 19352 test r9b, 1 19353 je .LBB2_848 19354 # %bb.847: 19355 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 19356 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 19357 movdqa xmm3, xmm0 19358 psubw xmm3, xmm1 19359 psubw xmm0, 
xmm2 19360 movdqu xmmword ptr [r8 + 2*rdi], xmm3 19361 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 19362 .LBB2_848: 19363 cmp rsi, r10 19364 je .LBB2_1069 19365 jmp .LBB2_849 19366 .LBB2_853: 19367 xor edi, edi 19368 .LBB2_854: 19369 test r9b, 1 19370 je .LBB2_856 19371 # %bb.855: 19372 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 19373 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 19374 movdqa xmm3, xmm0 19375 psubw xmm3, xmm1 19376 psubw xmm0, xmm2 19377 movdqu xmmword ptr [r8 + 2*rdi], xmm3 19378 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 19379 .LBB2_856: 19380 cmp rsi, r10 19381 je .LBB2_1069 19382 jmp .LBB2_857 19383 .LBB2_861: 19384 xor edi, edi 19385 .LBB2_862: 19386 test r9b, 1 19387 je .LBB2_864 19388 # %bb.863: 19389 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 19390 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 19391 paddw xmm1, xmm0 19392 paddw xmm2, xmm0 19393 movdqu xmmword ptr [r8 + 2*rdi], xmm1 19394 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 19395 .LBB2_864: 19396 cmp rsi, r10 19397 je .LBB2_1069 19398 jmp .LBB2_865 19399 .LBB2_869: 19400 xor edi, edi 19401 .LBB2_870: 19402 test r9b, 1 19403 je .LBB2_872 19404 # %bb.871: 19405 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 19406 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 19407 paddw xmm1, xmm0 19408 paddw xmm2, xmm0 19409 movdqu xmmword ptr [r8 + 2*rdi], xmm1 19410 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2 19411 .LBB2_872: 19412 cmp rsi, r10 19413 je .LBB2_1069 19414 jmp .LBB2_873 19415 .LBB2_877: 19416 xor edi, edi 19417 .LBB2_878: 19418 test r9b, 1 19419 je .LBB2_880 19420 # %bb.879: 19421 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 19422 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 19423 movdqa xmm3, xmm0 19424 psubw xmm3, xmm1 19425 psubw xmm0, xmm2 19426 movdqu xmmword ptr [r8 + 2*rdi], xmm3 19427 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 19428 .LBB2_880: 19429 cmp rsi, r10 19430 je .LBB2_1069 19431 jmp .LBB2_881 19432 .LBB2_885: 19433 xor edi, edi 19434 .LBB2_886: 19435 test r9b, 1 19436 je .LBB2_888 19437 # %bb.887: 
19438 movdqu xmm1, xmmword ptr [rcx + 2*rdi] 19439 movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16] 19440 movdqa xmm3, xmm0 19441 psubw xmm3, xmm1 19442 psubw xmm0, xmm2 19443 movdqu xmmword ptr [r8 + 2*rdi], xmm3 19444 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0 19445 .LBB2_888: 19446 cmp rsi, r10 19447 je .LBB2_1069 19448 jmp .LBB2_889 19449 .LBB2_893: 19450 xor edi, edi 19451 .LBB2_894: 19452 test r9b, 1 19453 je .LBB2_896 19454 # %bb.895: 19455 movups xmm2, xmmword ptr [rcx + 4*rdi] 19456 movups xmm3, xmmword ptr [rcx + 4*rdi + 16] 19457 mulps xmm2, xmm1 19458 mulps xmm3, xmm1 19459 movups xmmword ptr [r8 + 4*rdi], xmm2 19460 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 19461 .LBB2_896: 19462 cmp rdx, rax 19463 je .LBB2_1069 19464 jmp .LBB2_897 19465 .LBB2_901: 19466 xor edi, edi 19467 .LBB2_902: 19468 test r9b, 1 19469 je .LBB2_904 19470 # %bb.903: 19471 movups xmm2, xmmword ptr [rcx + 4*rdi] 19472 movups xmm3, xmmword ptr [rcx + 4*rdi + 16] 19473 mulps xmm2, xmm1 19474 mulps xmm3, xmm1 19475 movups xmmword ptr [r8 + 4*rdi], xmm2 19476 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 19477 .LBB2_904: 19478 cmp rdx, rax 19479 je .LBB2_1069 19480 jmp .LBB2_905 19481 .LBB2_909: 19482 xor edi, edi 19483 .LBB2_910: 19484 test r9b, 1 19485 je .LBB2_912 19486 # %bb.911: 19487 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 19488 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 19489 paddq xmm1, xmm0 19490 paddq xmm2, xmm0 19491 movdqu xmmword ptr [r8 + 8*rdi], xmm1 19492 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 19493 .LBB2_912: 19494 cmp rsi, r10 19495 je .LBB2_1069 19496 jmp .LBB2_913 19497 .LBB2_917: 19498 xor edi, edi 19499 .LBB2_918: 19500 test r9b, 1 19501 je .LBB2_920 19502 # %bb.919: 19503 movups xmm2, xmmword ptr [rcx + 4*rdi] 19504 movups xmm3, xmmword ptr [rcx + 4*rdi + 16] 19505 addps xmm2, xmm1 19506 addps xmm3, xmm1 19507 movups xmmword ptr [r8 + 4*rdi], xmm2 19508 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 19509 .LBB2_920: 19510 cmp rdx, rax 19511 je .LBB2_1069 19512 jmp 
.LBB2_921 19513 .LBB2_925: 19514 xor edi, edi 19515 .LBB2_926: 19516 test r9b, 1 19517 je .LBB2_928 19518 # %bb.927: 19519 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 19520 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 19521 movdqa xmm3, xmm0 19522 psubq xmm3, xmm1 19523 psubq xmm0, xmm2 19524 movdqu xmmword ptr [r8 + 8*rdi], xmm3 19525 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0 19526 .LBB2_928: 19527 cmp rsi, r10 19528 je .LBB2_1069 19529 jmp .LBB2_929 19530 .LBB2_933: 19531 xor edi, edi 19532 .LBB2_934: 19533 test r9b, 1 19534 je .LBB2_936 19535 # %bb.935: 19536 movups xmm2, xmmword ptr [rcx + 4*rdi] 19537 movups xmm3, xmmword ptr [rcx + 4*rdi + 16] 19538 movaps xmm4, xmm1 19539 subps xmm4, xmm2 19540 subps xmm1, xmm3 19541 movups xmmword ptr [r8 + 4*rdi], xmm4 19542 movups xmmword ptr [r8 + 4*rdi + 16], xmm1 19543 .LBB2_936: 19544 cmp rdx, rax 19545 je .LBB2_1069 19546 jmp .LBB2_937 19547 .LBB2_941: 19548 xor edi, edi 19549 .LBB2_942: 19550 test r9b, 1 19551 je .LBB2_944 19552 # %bb.943: 19553 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 19554 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 19555 paddq xmm1, xmm0 19556 paddq xmm2, xmm0 19557 movdqu xmmword ptr [r8 + 8*rdi], xmm1 19558 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2 19559 .LBB2_944: 19560 cmp rsi, r10 19561 je .LBB2_1069 19562 jmp .LBB2_945 19563 .LBB2_949: 19564 xor edi, edi 19565 .LBB2_950: 19566 test r9b, 1 19567 je .LBB2_952 19568 # %bb.951: 19569 movups xmm2, xmmword ptr [rcx + 4*rdi] 19570 movups xmm3, xmmword ptr [rcx + 4*rdi + 16] 19571 addps xmm2, xmm1 19572 addps xmm3, xmm1 19573 movups xmmword ptr [r8 + 4*rdi], xmm2 19574 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 19575 .LBB2_952: 19576 cmp rdx, rax 19577 je .LBB2_1069 19578 jmp .LBB2_953 19579 .LBB2_957: 19580 xor edi, edi 19581 .LBB2_958: 19582 test r9b, 1 19583 je .LBB2_960 19584 # %bb.959: 19585 movdqu xmm1, xmmword ptr [rcx + 8*rdi] 19586 movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16] 19587 movdqa xmm3, xmm0 19588 psubq xmm3, xmm1 19589 psubq xmm0, xmm2 
19590 movdqu xmmword ptr [r8 + 8*rdi], xmm3 19591 movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0 19592 .LBB2_960: 19593 cmp rsi, r10 19594 je .LBB2_1069 19595 jmp .LBB2_961 19596 .LBB2_965: 19597 xor edi, edi 19598 .LBB2_966: 19599 test r9b, 1 19600 je .LBB2_968 19601 # %bb.967: 19602 movups xmm2, xmmword ptr [rcx + 4*rdi] 19603 movups xmm3, xmmword ptr [rcx + 4*rdi + 16] 19604 movaps xmm4, xmm1 19605 subps xmm4, xmm2 19606 subps xmm1, xmm3 19607 movups xmmword ptr [r8 + 4*rdi], xmm4 19608 movups xmmword ptr [r8 + 4*rdi + 16], xmm1 19609 .LBB2_968: 19610 cmp rdx, rax 19611 je .LBB2_1069 19612 jmp .LBB2_969 19613 .LBB2_973: 19614 xor eax, eax 19615 .LBB2_974: 19616 test r9b, 1 19617 je .LBB2_976 19618 # %bb.975: 19619 movdqu xmm2, xmmword ptr [rcx + rax] 19620 movdqu xmm3, xmmword ptr [rcx + rax + 16] 19621 movdqa xmm4, xmm0 19622 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19623 pmovzxbw xmm5, xmm2 # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 19624 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19625 pmullw xmm2, xmm4 19626 movdqa xmm4, xmmword ptr [rip + .LCPI2_0] # xmm4 = [255,255,255,255,255,255,255,255] 19627 pand xmm2, xmm4 19628 pmullw xmm5, xmm1 19629 pand xmm5, xmm4 19630 packuswb xmm5, xmm2 19631 punpckhbw xmm0, xmm0 # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19632 pmovzxbw xmm2, xmm3 # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 19633 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19634 pmullw xmm3, xmm0 19635 pand xmm3, xmm4 19636 pmullw xmm2, xmm1 19637 pand xmm2, xmm4 19638 packuswb xmm2, xmm3 19639 movdqu xmmword ptr [r8 + rax], xmm5 19640 movdqu xmmword ptr [r8 + rax + 16], xmm2 19641 .LBB2_976: 19642 cmp rdi, r10 19643 je .LBB2_1069 19644 jmp .LBB2_977 19645 .LBB2_981: 19646 xor eax, eax 19647 .LBB2_982: 
19648 test r9b, 1 19649 je .LBB2_984 19650 # %bb.983: 19651 movdqu xmm2, xmmword ptr [rcx + rax] 19652 movdqu xmm3, xmmword ptr [rcx + rax + 16] 19653 movdqa xmm4, xmm0 19654 punpckhbw xmm4, xmm4 # xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19655 pmovzxbw xmm5, xmm2 # xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 19656 punpckhbw xmm2, xmm2 # xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19657 pmullw xmm2, xmm4 19658 movdqa xmm4, xmmword ptr [rip + .LCPI2_0] # xmm4 = [255,255,255,255,255,255,255,255] 19659 pand xmm2, xmm4 19660 pmullw xmm5, xmm1 19661 pand xmm5, xmm4 19662 packuswb xmm5, xmm2 19663 punpckhbw xmm0, xmm0 # xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19664 pmovzxbw xmm2, xmm3 # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 19665 punpckhbw xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 19666 pmullw xmm3, xmm0 19667 pand xmm3, xmm4 19668 pmullw xmm2, xmm1 19669 pand xmm2, xmm4 19670 packuswb xmm2, xmm3 19671 movdqu xmmword ptr [r8 + rax], xmm5 19672 movdqu xmmword ptr [r8 + rax + 16], xmm2 19673 .LBB2_984: 19674 cmp rdi, r10 19675 je .LBB2_1069 19676 jmp .LBB2_985 19677 .LBB2_989: 19678 xor edi, edi 19679 .LBB2_990: 19680 test r9b, 1 19681 je .LBB2_992 19682 # %bb.991: 19683 movdqu xmm1, xmmword ptr [rcx + rdi] 19684 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 19685 paddb xmm1, xmm0 19686 paddb xmm2, xmm0 19687 movdqu xmmword ptr [r8 + rdi], xmm1 19688 movdqu xmmword ptr [r8 + rdi + 16], xmm2 19689 .LBB2_992: 19690 cmp rsi, r10 19691 je .LBB2_1069 19692 jmp .LBB2_993 19693 .LBB2_997: 19694 xor edi, edi 19695 .LBB2_998: 19696 test r9b, 1 19697 je .LBB2_1000 19698 # %bb.999: 19699 movdqu xmm1, xmmword ptr [rcx + rdi] 19700 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 19701 movdqa xmm3, xmm0 19702 psubb xmm3, xmm1 19703 psubb xmm0, xmm2 19704 movdqu xmmword ptr [r8 + rdi], 
xmm3 19705 movdqu xmmword ptr [r8 + rdi + 16], xmm0 19706 .LBB2_1000: 19707 cmp rsi, r10 19708 je .LBB2_1069 19709 jmp .LBB2_1001 19710 .LBB2_1005: 19711 xor edi, edi 19712 .LBB2_1006: 19713 test r9b, 1 19714 je .LBB2_1008 19715 # %bb.1007: 19716 movdqu xmm1, xmmword ptr [rcx + rdi] 19717 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 19718 paddb xmm1, xmm0 19719 paddb xmm2, xmm0 19720 movdqu xmmword ptr [r8 + rdi], xmm1 19721 movdqu xmmword ptr [r8 + rdi + 16], xmm2 19722 .LBB2_1008: 19723 cmp rsi, r10 19724 je .LBB2_1069 19725 jmp .LBB2_1009 19726 .LBB2_1013: 19727 xor edi, edi 19728 .LBB2_1014: 19729 test r9b, 1 19730 je .LBB2_1016 19731 # %bb.1015: 19732 movdqu xmm1, xmmword ptr [rcx + rdi] 19733 movdqu xmm2, xmmword ptr [rcx + rdi + 16] 19734 movdqa xmm3, xmm0 19735 psubb xmm3, xmm1 19736 psubb xmm0, xmm2 19737 movdqu xmmword ptr [r8 + rdi], xmm3 19738 movdqu xmmword ptr [r8 + rdi + 16], xmm0 19739 .LBB2_1016: 19740 cmp rsi, r10 19741 je .LBB2_1069 19742 jmp .LBB2_1017 19743 .LBB2_1021: 19744 xor edi, edi 19745 .LBB2_1022: 19746 test r9b, 1 19747 je .LBB2_1024 19748 # %bb.1023: 19749 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 19750 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 19751 pmulld xmm1, xmm0 19752 pmulld xmm2, xmm0 19753 movdqu xmmword ptr [r8 + 4*rdi], xmm1 19754 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 19755 .LBB2_1024: 19756 cmp rsi, r10 19757 je .LBB2_1069 19758 jmp .LBB2_1025 19759 .LBB2_1029: 19760 xor edi, edi 19761 .LBB2_1030: 19762 test r9b, 1 19763 je .LBB2_1032 19764 # %bb.1031: 19765 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 19766 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 19767 pmulld xmm1, xmm0 19768 pmulld xmm2, xmm0 19769 movdqu xmmword ptr [r8 + 4*rdi], xmm1 19770 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 19771 .LBB2_1032: 19772 cmp rsi, r10 19773 je .LBB2_1069 19774 jmp .LBB2_1033 19775 .LBB2_1037: 19776 xor edi, edi 19777 .LBB2_1038: 19778 test r9b, 1 19779 je .LBB2_1040 19780 # %bb.1039: 19781 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 19782 
movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 19783 paddd xmm1, xmm0 19784 paddd xmm2, xmm0 19785 movdqu xmmword ptr [r8 + 4*rdi], xmm1 19786 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 19787 .LBB2_1040: 19788 cmp rsi, r10 19789 je .LBB2_1069 19790 jmp .LBB2_1041 19791 .LBB2_1045: 19792 xor edi, edi 19793 .LBB2_1046: 19794 test r9b, 1 19795 je .LBB2_1048 19796 # %bb.1047: 19797 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 19798 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 19799 movdqa xmm3, xmm0 19800 psubd xmm3, xmm1 19801 psubd xmm0, xmm2 19802 movdqu xmmword ptr [r8 + 4*rdi], xmm3 19803 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 19804 .LBB2_1048: 19805 cmp rsi, r10 19806 je .LBB2_1069 19807 jmp .LBB2_1049 19808 .LBB2_1053: 19809 xor edi, edi 19810 .LBB2_1054: 19811 test r9b, 1 19812 je .LBB2_1056 19813 # %bb.1055: 19814 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 19815 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 19816 paddd xmm1, xmm0 19817 paddd xmm2, xmm0 19818 movdqu xmmword ptr [r8 + 4*rdi], xmm1 19819 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2 19820 .LBB2_1056: 19821 cmp rsi, r10 19822 je .LBB2_1069 19823 jmp .LBB2_1057 19824 .LBB2_1061: 19825 xor edi, edi 19826 .LBB2_1062: 19827 test r9b, 1 19828 je .LBB2_1064 19829 # %bb.1063: 19830 movdqu xmm1, xmmword ptr [rcx + 4*rdi] 19831 movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16] 19832 movdqa xmm3, xmm0 19833 psubd xmm3, xmm1 19834 psubd xmm0, xmm2 19835 movdqu xmmword ptr [r8 + 4*rdi], xmm3 19836 movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0 19837 .LBB2_1064: 19838 cmp rsi, r10 19839 je .LBB2_1069 19840 jmp .LBB2_1065 19841 .Lfunc_end2: 19842 .size arithmetic_scalar_arr_sse4, .Lfunc_end2-arithmetic_scalar_arr_sse4 19843 # -- End function 19844 .section .rodata.cst16,"aM",@progbits,16 19845 .p2align 4 # -- Begin function arithmetic_unary_same_types_sse4 19846 .LCPI3_0: 19847 .quad 0x8000000000000000 # double -0 19848 .quad 0x8000000000000000 # double -0 19849 .LCPI3_1: 19850 .quad 0x3ff0000000000000 # double 1 19851 .quad 
0x3ff0000000000000 # double 1 19852 .LCPI3_3: 19853 .long 1 # 0x1 19854 .long 1 # 0x1 19855 .long 1 # 0x1 19856 .long 1 # 0x1 19857 .LCPI3_4: 19858 .quad 1 # 0x1 19859 .quad 1 # 0x1 19860 .LCPI3_5: 19861 .short 1 # 0x1 19862 .short 1 # 0x1 19863 .short 1 # 0x1 19864 .short 1 # 0x1 19865 .short 1 # 0x1 19866 .short 1 # 0x1 19867 .short 1 # 0x1 19868 .short 1 # 0x1 19869 .LCPI3_6: 19870 .zero 16,1 19871 .LCPI3_7: 19872 .long 0x80000000 # float -0 19873 .long 0x80000000 # float -0 19874 .long 0x80000000 # float -0 19875 .long 0x80000000 # float -0 19876 .LCPI3_8: 19877 .quad 9223372036854775807 # 0x7fffffffffffffff 19878 .quad 9223372036854775807 # 0x7fffffffffffffff 19879 .LCPI3_9: 19880 .long 2147483647 # 0x7fffffff 19881 .long 2147483647 # 0x7fffffff 19882 .long 2147483647 # 0x7fffffff 19883 .long 2147483647 # 0x7fffffff 19884 .LCPI3_10: 19885 .byte 255 # 0xff 19886 .byte 0 # 0x0 19887 .byte 0 # 0x0 19888 .byte 0 # 0x0 19889 .byte 255 # 0xff 19890 .byte 0 # 0x0 19891 .byte 0 # 0x0 19892 .byte 0 # 0x0 19893 .byte 255 # 0xff 19894 .byte 0 # 0x0 19895 .byte 0 # 0x0 19896 .byte 0 # 0x0 19897 .byte 255 # 0xff 19898 .byte 0 # 0x0 19899 .byte 0 # 0x0 19900 .byte 0 # 0x0 19901 .section .rodata.cst8,"aM",@progbits,8 19902 .p2align 3 19903 .LCPI3_2: 19904 .quad 0x3ff0000000000000 # double 1 19905 .text 19906 .globl arithmetic_unary_same_types_sse4 19907 .p2align 4, 0x90 19908 .type arithmetic_unary_same_types_sse4,@function 19909 arithmetic_unary_same_types_sse4: # @arithmetic_unary_same_types_sse4 19910 # %bb.0: 19911 push rbp 19912 mov rbp, rsp 19913 and rsp, -8 19914 cmp sil, 19 19915 jle .LBB3_12 19916 # %bb.1: 19917 cmp sil, 20 19918 je .LBB3_22 19919 # %bb.2: 19920 cmp sil, 25 19921 je .LBB3_30 19922 # %bb.3: 19923 cmp sil, 26 19924 jne .LBB3_923 19925 # %bb.4: 19926 cmp edi, 6 19927 jg .LBB3_46 19928 # %bb.5: 19929 cmp edi, 3 19930 jle .LBB3_81 19931 # %bb.6: 19932 cmp edi, 4 19933 je .LBB3_131 19934 # %bb.7: 19935 cmp edi, 5 19936 je .LBB3_134 19937 # %bb.8: 19938 
cmp edi, 6 19939 jne .LBB3_923 19940 # %bb.9: 19941 test r8d, r8d 19942 jle .LBB3_923 19943 # %bb.10: 19944 mov r9d, r8d 19945 cmp r8d, 8 19946 jae .LBB3_221 19947 # %bb.11: 19948 xor edx, edx 19949 jmp .LBB3_373 19950 .LBB3_12: 19951 cmp sil, 4 19952 je .LBB3_38 19953 # %bb.13: 19954 cmp sil, 5 19955 jne .LBB3_923 19956 # %bb.14: 19957 cmp edi, 6 19958 jg .LBB3_53 19959 # %bb.15: 19960 cmp edi, 3 19961 jle .LBB3_86 19962 # %bb.16: 19963 cmp edi, 4 19964 je .LBB3_137 19965 # %bb.17: 19966 cmp edi, 5 19967 je .LBB3_140 19968 # %bb.18: 19969 cmp edi, 6 19970 jne .LBB3_923 19971 # %bb.19: 19972 test r8d, r8d 19973 jle .LBB3_923 19974 # %bb.20: 19975 mov r9d, r8d 19976 cmp r8d, 8 19977 jb .LBB3_21 19978 # %bb.223: 19979 lea rax, [rdx + 4*r9] 19980 cmp rax, rcx 19981 jbe .LBB3_374 19982 # %bb.224: 19983 lea rax, [rcx + 4*r9] 19984 cmp rax, rdx 19985 jbe .LBB3_374 19986 .LBB3_21: 19987 xor esi, esi 19988 .LBB3_614: 19989 mov r8, rsi 19990 not r8 19991 add r8, r9 19992 mov rdi, r9 19993 and rdi, 3 19994 je .LBB3_616 19995 .LBB3_615: # =>This Inner Loop Header: Depth=1 19996 xor eax, eax 19997 sub eax, dword ptr [rdx + 4*rsi] 19998 mov dword ptr [rcx + 4*rsi], eax 19999 add rsi, 1 20000 add rdi, -1 20001 jne .LBB3_615 20002 .LBB3_616: 20003 cmp r8, 3 20004 jb .LBB3_923 20005 .LBB3_617: # =>This Inner Loop Header: Depth=1 20006 xor eax, eax 20007 sub eax, dword ptr [rdx + 4*rsi] 20008 mov dword ptr [rcx + 4*rsi], eax 20009 xor eax, eax 20010 sub eax, dword ptr [rdx + 4*rsi + 4] 20011 mov dword ptr [rcx + 4*rsi + 4], eax 20012 xor eax, eax 20013 sub eax, dword ptr [rdx + 4*rsi + 8] 20014 mov dword ptr [rcx + 4*rsi + 8], eax 20015 xor eax, eax 20016 sub eax, dword ptr [rdx + 4*rsi + 12] 20017 mov dword ptr [rcx + 4*rsi + 12], eax 20018 add rsi, 4 20019 cmp r9, rsi 20020 jne .LBB3_617 20021 jmp .LBB3_923 20022 .LBB3_22: 20023 cmp edi, 6 20024 jg .LBB3_60 20025 # %bb.23: 20026 cmp edi, 3 20027 jle .LBB3_91 20028 # %bb.24: 20029 cmp edi, 4 20030 je .LBB3_143 20031 # %bb.25: 
20032 cmp edi, 5 20033 je .LBB3_146 20034 # %bb.26: 20035 cmp edi, 6 20036 jne .LBB3_923 20037 # %bb.27: 20038 test r8d, r8d 20039 jle .LBB3_923 20040 # %bb.28: 20041 mov r9d, r8d 20042 cmp r8d, 8 20043 jb .LBB3_29 20044 # %bb.226: 20045 lea rax, [rdx + 4*r9] 20046 cmp rax, rcx 20047 jbe .LBB3_377 20048 # %bb.227: 20049 lea rax, [rcx + 4*r9] 20050 cmp rax, rdx 20051 jbe .LBB3_377 20052 .LBB3_29: 20053 xor esi, esi 20054 .LBB3_622: 20055 mov r8, rsi 20056 not r8 20057 add r8, r9 20058 mov rdi, r9 20059 and rdi, 3 20060 je .LBB3_624 20061 .LBB3_623: # =>This Inner Loop Header: Depth=1 20062 xor eax, eax 20063 cmp dword ptr [rdx + 4*rsi], 0 20064 setne al 20065 mov dword ptr [rcx + 4*rsi], eax 20066 add rsi, 1 20067 add rdi, -1 20068 jne .LBB3_623 20069 .LBB3_624: 20070 cmp r8, 3 20071 jb .LBB3_923 20072 .LBB3_625: # =>This Inner Loop Header: Depth=1 20073 xor eax, eax 20074 cmp dword ptr [rdx + 4*rsi], 0 20075 setne al 20076 mov dword ptr [rcx + 4*rsi], eax 20077 xor eax, eax 20078 cmp dword ptr [rdx + 4*rsi + 4], 0 20079 setne al 20080 mov dword ptr [rcx + 4*rsi + 4], eax 20081 xor eax, eax 20082 cmp dword ptr [rdx + 4*rsi + 8], 0 20083 setne al 20084 mov dword ptr [rcx + 4*rsi + 8], eax 20085 xor eax, eax 20086 cmp dword ptr [rdx + 4*rsi + 12], 0 20087 setne al 20088 mov dword ptr [rcx + 4*rsi + 12], eax 20089 add rsi, 4 20090 cmp r9, rsi 20091 jne .LBB3_625 20092 jmp .LBB3_923 20093 .LBB3_30: 20094 cmp edi, 6 20095 jg .LBB3_67 20096 # %bb.31: 20097 cmp edi, 3 20098 jle .LBB3_96 20099 # %bb.32: 20100 cmp edi, 4 20101 je .LBB3_149 20102 # %bb.33: 20103 cmp edi, 5 20104 je .LBB3_152 20105 # %bb.34: 20106 cmp edi, 6 20107 jne .LBB3_923 20108 # %bb.35: 20109 test r8d, r8d 20110 jle .LBB3_923 20111 # %bb.36: 20112 mov r9d, r8d 20113 cmp r8d, 8 20114 jb .LBB3_37 20115 # %bb.229: 20116 lea rax, [rdx + 4*r9] 20117 cmp rax, rcx 20118 jbe .LBB3_380 20119 # %bb.230: 20120 lea rax, [rcx + 4*r9] 20121 cmp rax, rdx 20122 jbe .LBB3_380 20123 .LBB3_37: 20124 xor esi, esi 20125 
.LBB3_536: 20126 mov r8, rsi 20127 not r8 20128 add r8, r9 20129 mov rdi, r9 20130 and rdi, 3 20131 je .LBB3_538 20132 .LBB3_537: # =>This Inner Loop Header: Depth=1 20133 mov eax, dword ptr [rdx + 4*rsi] 20134 mov dword ptr [rcx + 4*rsi], eax 20135 add rsi, 1 20136 add rdi, -1 20137 jne .LBB3_537 20138 .LBB3_538: 20139 cmp r8, 3 20140 jb .LBB3_923 20141 .LBB3_539: # =>This Inner Loop Header: Depth=1 20142 mov eax, dword ptr [rdx + 4*rsi] 20143 mov dword ptr [rcx + 4*rsi], eax 20144 mov eax, dword ptr [rdx + 4*rsi + 4] 20145 mov dword ptr [rcx + 4*rsi + 4], eax 20146 mov eax, dword ptr [rdx + 4*rsi + 8] 20147 mov dword ptr [rcx + 4*rsi + 8], eax 20148 mov eax, dword ptr [rdx + 4*rsi + 12] 20149 mov dword ptr [rcx + 4*rsi + 12], eax 20150 add rsi, 4 20151 cmp r9, rsi 20152 jne .LBB3_539 20153 jmp .LBB3_923 20154 .LBB3_38: 20155 cmp edi, 6 20156 jg .LBB3_74 20157 # %bb.39: 20158 cmp edi, 3 20159 jle .LBB3_101 20160 # %bb.40: 20161 cmp edi, 4 20162 je .LBB3_155 20163 # %bb.41: 20164 cmp edi, 5 20165 je .LBB3_158 20166 # %bb.42: 20167 cmp edi, 6 20168 jne .LBB3_923 20169 # %bb.43: 20170 test r8d, r8d 20171 jle .LBB3_923 20172 # %bb.44: 20173 mov r9d, r8d 20174 cmp r8d, 8 20175 jb .LBB3_45 20176 # %bb.232: 20177 lea rax, [rdx + 4*r9] 20178 cmp rax, rcx 20179 jbe .LBB3_382 20180 # %bb.233: 20181 lea rax, [rcx + 4*r9] 20182 cmp rax, rdx 20183 jbe .LBB3_382 20184 .LBB3_45: 20185 xor esi, esi 20186 .LBB3_546: 20187 mov r8, rsi 20188 not r8 20189 add r8, r9 20190 mov rdi, r9 20191 and rdi, 3 20192 je .LBB3_548 20193 .LBB3_547: # =>This Inner Loop Header: Depth=1 20194 mov eax, dword ptr [rdx + 4*rsi] 20195 mov dword ptr [rcx + 4*rsi], eax 20196 add rsi, 1 20197 add rdi, -1 20198 jne .LBB3_547 20199 .LBB3_548: 20200 cmp r8, 3 20201 jb .LBB3_923 20202 .LBB3_549: # =>This Inner Loop Header: Depth=1 20203 mov eax, dword ptr [rdx + 4*rsi] 20204 mov dword ptr [rcx + 4*rsi], eax 20205 mov eax, dword ptr [rdx + 4*rsi + 4] 20206 mov dword ptr [rcx + 4*rsi + 4], eax 20207 mov eax, 
dword ptr [rdx + 4*rsi + 8] 20208 mov dword ptr [rcx + 4*rsi + 8], eax 20209 mov eax, dword ptr [rdx + 4*rsi + 12] 20210 mov dword ptr [rcx + 4*rsi + 12], eax 20211 add rsi, 4 20212 cmp r9, rsi 20213 jne .LBB3_549 20214 jmp .LBB3_923 20215 .LBB3_46: 20216 cmp edi, 8 20217 jle .LBB3_106 20218 # %bb.47: 20219 cmp edi, 9 20220 je .LBB3_161 20221 # %bb.48: 20222 cmp edi, 11 20223 je .LBB3_164 20224 # %bb.49: 20225 cmp edi, 12 20226 jne .LBB3_923 20227 # %bb.50: 20228 test r8d, r8d 20229 jle .LBB3_923 20230 # %bb.51: 20231 mov r9d, r8d 20232 cmp r8d, 4 20233 jb .LBB3_52 20234 # %bb.235: 20235 lea rax, [rdx + 8*r9] 20236 cmp rax, rcx 20237 jbe .LBB3_384 20238 # %bb.236: 20239 lea rax, [rcx + 8*r9] 20240 cmp rax, rdx 20241 jbe .LBB3_384 20242 .LBB3_52: 20243 xor esi, esi 20244 .LBB3_630: 20245 mov rax, rsi 20246 not rax 20247 add rax, r9 20248 mov rdi, r9 20249 and rdi, 3 20250 je .LBB3_633 20251 # %bb.631: 20252 movapd xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0] 20253 .LBB3_632: # =>This Inner Loop Header: Depth=1 20254 movsd xmm1, qword ptr [rdx + 8*rsi] # xmm1 = mem[0],zero 20255 xorpd xmm1, xmm0 20256 movlpd qword ptr [rcx + 8*rsi], xmm1 20257 add rsi, 1 20258 add rdi, -1 20259 jne .LBB3_632 20260 .LBB3_633: 20261 cmp rax, 3 20262 jb .LBB3_923 20263 # %bb.634: 20264 movapd xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0] 20265 .LBB3_635: # =>This Inner Loop Header: Depth=1 20266 movsd xmm1, qword ptr [rdx + 8*rsi] # xmm1 = mem[0],zero 20267 xorpd xmm1, xmm0 20268 movlpd qword ptr [rcx + 8*rsi], xmm1 20269 movsd xmm1, qword ptr [rdx + 8*rsi + 8] # xmm1 = mem[0],zero 20270 xorpd xmm1, xmm0 20271 movlpd qword ptr [rcx + 8*rsi + 8], xmm1 20272 movsd xmm1, qword ptr [rdx + 8*rsi + 16] # xmm1 = mem[0],zero 20273 xorpd xmm1, xmm0 20274 movlpd qword ptr [rcx + 8*rsi + 16], xmm1 20275 movsd xmm1, qword ptr [rdx + 8*rsi + 24] # xmm1 = mem[0],zero 20276 xorpd xmm1, xmm0 20277 movlpd qword ptr [rcx + 8*rsi + 24], xmm1 20278 add rsi, 4 20279 cmp r9, 
rsi 20280 jne .LBB3_635 20281 jmp .LBB3_923 20282 .LBB3_53: 20283 cmp edi, 8 20284 jle .LBB3_111 20285 # %bb.54: 20286 cmp edi, 9 20287 je .LBB3_167 20288 # %bb.55: 20289 cmp edi, 11 20290 je .LBB3_170 20291 # %bb.56: 20292 cmp edi, 12 20293 jne .LBB3_923 20294 # %bb.57: 20295 test r8d, r8d 20296 jle .LBB3_923 20297 # %bb.58: 20298 mov r9d, r8d 20299 cmp r8d, 4 20300 jb .LBB3_59 20301 # %bb.238: 20302 lea rax, [rdx + 8*r9] 20303 cmp rax, rcx 20304 jbe .LBB3_387 20305 # %bb.239: 20306 lea rax, [rcx + 8*r9] 20307 cmp rax, rdx 20308 jbe .LBB3_387 20309 .LBB3_59: 20310 xor esi, esi 20311 .LBB3_640: 20312 mov rax, rsi 20313 not rax 20314 add rax, r9 20315 mov rdi, r9 20316 and rdi, 3 20317 je .LBB3_643 20318 # %bb.641: 20319 movapd xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0] 20320 .LBB3_642: # =>This Inner Loop Header: Depth=1 20321 movsd xmm1, qword ptr [rdx + 8*rsi] # xmm1 = mem[0],zero 20322 xorpd xmm1, xmm0 20323 movlpd qword ptr [rcx + 8*rsi], xmm1 20324 add rsi, 1 20325 add rdi, -1 20326 jne .LBB3_642 20327 .LBB3_643: 20328 cmp rax, 3 20329 jb .LBB3_923 20330 # %bb.644: 20331 movapd xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0] 20332 .LBB3_645: # =>This Inner Loop Header: Depth=1 20333 movsd xmm1, qword ptr [rdx + 8*rsi] # xmm1 = mem[0],zero 20334 xorpd xmm1, xmm0 20335 movlpd qword ptr [rcx + 8*rsi], xmm1 20336 movsd xmm1, qword ptr [rdx + 8*rsi + 8] # xmm1 = mem[0],zero 20337 xorpd xmm1, xmm0 20338 movlpd qword ptr [rcx + 8*rsi + 8], xmm1 20339 movsd xmm1, qword ptr [rdx + 8*rsi + 16] # xmm1 = mem[0],zero 20340 xorpd xmm1, xmm0 20341 movlpd qword ptr [rcx + 8*rsi + 16], xmm1 20342 movsd xmm1, qword ptr [rdx + 8*rsi + 24] # xmm1 = mem[0],zero 20343 xorpd xmm1, xmm0 20344 movlpd qword ptr [rcx + 8*rsi + 24], xmm1 20345 add rsi, 4 20346 cmp r9, rsi 20347 jne .LBB3_645 20348 jmp .LBB3_923 20349 .LBB3_60: 20350 cmp edi, 8 20351 jle .LBB3_116 20352 # %bb.61: 20353 cmp edi, 9 20354 je .LBB3_173 20355 # %bb.62: 20356 cmp edi, 11 20357 
je .LBB3_176 20358 # %bb.63: 20359 cmp edi, 12 20360 jne .LBB3_923 20361 # %bb.64: 20362 test r8d, r8d 20363 jle .LBB3_923 20364 # %bb.65: 20365 mov r9d, r8d 20366 cmp r8d, 4 20367 jb .LBB3_66 20368 # %bb.241: 20369 lea rax, [rdx + 8*r9] 20370 cmp rax, rcx 20371 jbe .LBB3_390 20372 # %bb.242: 20373 lea rax, [rcx + 8*r9] 20374 cmp rax, rdx 20375 jbe .LBB3_390 20376 .LBB3_66: 20377 xor esi, esi 20378 .LBB3_650: 20379 mov rax, rsi 20380 not rax 20381 test r9b, 1 20382 je .LBB3_652 20383 # %bb.651: 20384 movsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero 20385 movapd xmm1, xmmword ptr [rip + .LCPI3_0] # xmm1 = [-0.0E+0,-0.0E+0] 20386 andpd xmm1, xmm0 20387 movsd xmm2, qword ptr [rip + .LCPI3_2] # xmm2 = mem[0],zero 20388 orpd xmm2, xmm1 20389 xorpd xmm1, xmm1 20390 cmpeqsd xmm1, xmm0 20391 andnpd xmm1, xmm2 20392 movlpd qword ptr [rcx + 8*rsi], xmm1 20393 or rsi, 1 20394 .LBB3_652: 20395 add rax, r9 20396 je .LBB3_923 20397 # %bb.653: 20398 movapd xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0] 20399 movsd xmm1, qword ptr [rip + .LCPI3_2] # xmm1 = mem[0],zero 20400 xorpd xmm2, xmm2 20401 .LBB3_654: # =>This Inner Loop Header: Depth=1 20402 movsd xmm3, qword ptr [rdx + 8*rsi] # xmm3 = mem[0],zero 20403 movapd xmm4, xmm3 20404 andpd xmm4, xmm0 20405 orpd xmm4, xmm1 20406 cmpeqsd xmm3, xmm2 20407 andnpd xmm3, xmm4 20408 movlpd qword ptr [rcx + 8*rsi], xmm3 20409 movsd xmm3, qword ptr [rdx + 8*rsi + 8] # xmm3 = mem[0],zero 20410 movapd xmm4, xmm3 20411 andpd xmm4, xmm0 20412 orpd xmm4, xmm1 20413 cmpeqsd xmm3, xmm2 20414 andnpd xmm3, xmm4 20415 movlpd qword ptr [rcx + 8*rsi + 8], xmm3 20416 add rsi, 2 20417 cmp r9, rsi 20418 jne .LBB3_654 20419 jmp .LBB3_923 20420 .LBB3_67: 20421 cmp edi, 8 20422 jle .LBB3_121 20423 # %bb.68: 20424 cmp edi, 9 20425 je .LBB3_179 20426 # %bb.69: 20427 cmp edi, 11 20428 je .LBB3_182 20429 # %bb.70: 20430 cmp edi, 12 20431 jne .LBB3_923 20432 # %bb.71: 20433 test r8d, r8d 20434 jle .LBB3_923 20435 # %bb.72: 20436 mov r9d, 
r8d 20437 cmp r8d, 4 20438 jb .LBB3_73 20439 # %bb.244: 20440 lea rax, [rdx + 8*r9] 20441 cmp rax, rcx 20442 jbe .LBB3_393 20443 # %bb.245: 20444 lea rax, [rcx + 8*r9] 20445 cmp rax, rdx 20446 jbe .LBB3_393 20447 .LBB3_73: 20448 xor esi, esi 20449 .LBB3_659: 20450 movabs r10, 9223372036854775807 20451 mov r8, rsi 20452 not r8 20453 add r8, r9 20454 mov rax, r9 20455 and rax, 3 20456 je .LBB3_661 20457 .LBB3_660: # =>This Inner Loop Header: Depth=1 20458 mov rdi, qword ptr [rdx + 8*rsi] 20459 and rdi, r10 20460 mov qword ptr [rcx + 8*rsi], rdi 20461 add rsi, 1 20462 add rax, -1 20463 jne .LBB3_660 20464 .LBB3_661: 20465 cmp r8, 3 20466 jb .LBB3_923 20467 .LBB3_662: # =>This Inner Loop Header: Depth=1 20468 mov rax, qword ptr [rdx + 8*rsi] 20469 and rax, r10 20470 mov qword ptr [rcx + 8*rsi], rax 20471 mov rax, qword ptr [rdx + 8*rsi + 8] 20472 and rax, r10 20473 mov qword ptr [rcx + 8*rsi + 8], rax 20474 mov rax, qword ptr [rdx + 8*rsi + 16] 20475 and rax, r10 20476 mov qword ptr [rcx + 8*rsi + 16], rax 20477 mov rax, qword ptr [rdx + 8*rsi + 24] 20478 and rax, r10 20479 mov qword ptr [rcx + 8*rsi + 24], rax 20480 add rsi, 4 20481 cmp r9, rsi 20482 jne .LBB3_662 20483 jmp .LBB3_923 20484 .LBB3_74: 20485 cmp edi, 8 20486 jle .LBB3_126 20487 # %bb.75: 20488 cmp edi, 9 20489 je .LBB3_185 20490 # %bb.76: 20491 cmp edi, 11 20492 je .LBB3_188 20493 # %bb.77: 20494 cmp edi, 12 20495 jne .LBB3_923 20496 # %bb.78: 20497 test r8d, r8d 20498 jle .LBB3_923 20499 # %bb.79: 20500 mov r9d, r8d 20501 cmp r8d, 4 20502 jb .LBB3_80 20503 # %bb.247: 20504 lea rax, [rdx + 8*r9] 20505 cmp rax, rcx 20506 jbe .LBB3_396 20507 # %bb.248: 20508 lea rax, [rcx + 8*r9] 20509 cmp rax, rdx 20510 jbe .LBB3_396 20511 .LBB3_80: 20512 xor esi, esi 20513 .LBB3_667: 20514 movabs r10, 9223372036854775807 20515 mov r8, rsi 20516 not r8 20517 add r8, r9 20518 mov rax, r9 20519 and rax, 3 20520 je .LBB3_669 20521 .LBB3_668: # =>This Inner Loop Header: Depth=1 20522 mov rdi, qword ptr [rdx + 8*rsi] 20523 and 
rdi, r10 20524 mov qword ptr [rcx + 8*rsi], rdi 20525 add rsi, 1 20526 add rax, -1 20527 jne .LBB3_668 20528 .LBB3_669: 20529 cmp r8, 3 20530 jb .LBB3_923 20531 .LBB3_670: # =>This Inner Loop Header: Depth=1 20532 mov rax, qword ptr [rdx + 8*rsi] 20533 and rax, r10 20534 mov qword ptr [rcx + 8*rsi], rax 20535 mov rax, qword ptr [rdx + 8*rsi + 8] 20536 and rax, r10 20537 mov qword ptr [rcx + 8*rsi + 8], rax 20538 mov rax, qword ptr [rdx + 8*rsi + 16] 20539 and rax, r10 20540 mov qword ptr [rcx + 8*rsi + 16], rax 20541 mov rax, qword ptr [rdx + 8*rsi + 24] 20542 and rax, r10 20543 mov qword ptr [rcx + 8*rsi + 24], rax 20544 add rsi, 4 20545 cmp r9, rsi 20546 jne .LBB3_670 20547 jmp .LBB3_923 20548 .LBB3_81: 20549 cmp edi, 2 20550 je .LBB3_191 20551 # %bb.82: 20552 cmp edi, 3 20553 jne .LBB3_923 20554 # %bb.83: 20555 test r8d, r8d 20556 jle .LBB3_923 20557 # %bb.84: 20558 mov r9d, r8d 20559 cmp r8d, 32 20560 jb .LBB3_85 20561 # %bb.250: 20562 lea rax, [rdx + r9] 20563 cmp rax, rcx 20564 jbe .LBB3_399 20565 # %bb.251: 20566 lea rax, [rcx + r9] 20567 cmp rax, rdx 20568 jbe .LBB3_399 20569 .LBB3_85: 20570 xor esi, esi 20571 .LBB3_675: 20572 mov r8, rsi 20573 not r8 20574 add r8, r9 20575 mov rdi, r9 20576 and rdi, 3 20577 je .LBB3_677 20578 .LBB3_676: # =>This Inner Loop Header: Depth=1 20579 movzx r10d, byte ptr [rdx + rsi] 20580 xor eax, eax 20581 sub al, r10b 20582 mov byte ptr [rcx + rsi], al 20583 add rsi, 1 20584 add rdi, -1 20585 jne .LBB3_676 20586 .LBB3_677: 20587 cmp r8, 3 20588 jb .LBB3_923 20589 .LBB3_678: # =>This Inner Loop Header: Depth=1 20590 xor eax, eax 20591 sub al, byte ptr [rdx + rsi] 20592 mov byte ptr [rcx + rsi], al 20593 xor eax, eax 20594 sub al, byte ptr [rdx + rsi + 1] 20595 mov byte ptr [rcx + rsi + 1], al 20596 xor eax, eax 20597 sub al, byte ptr [rdx + rsi + 2] 20598 mov byte ptr [rcx + rsi + 2], al 20599 movzx eax, byte ptr [rdx + rsi + 3] 20600 xor edi, edi 20601 sub dil, al 20602 mov byte ptr [rcx + rsi + 3], dil 20603 add rsi, 4 20604 
cmp r9, rsi 20605 jne .LBB3_678 20606 jmp .LBB3_923 20607 .LBB3_86: 20608 cmp edi, 2 20609 je .LBB3_194 20610 # %bb.87: 20611 cmp edi, 3 20612 jne .LBB3_923 20613 # %bb.88: 20614 test r8d, r8d 20615 jle .LBB3_923 20616 # %bb.89: 20617 mov r9d, r8d 20618 cmp r8d, 32 20619 jb .LBB3_90 20620 # %bb.253: 20621 lea rax, [rdx + r9] 20622 cmp rax, rcx 20623 jbe .LBB3_402 20624 # %bb.254: 20625 lea rax, [rcx + r9] 20626 cmp rax, rdx 20627 jbe .LBB3_402 20628 .LBB3_90: 20629 xor esi, esi 20630 .LBB3_683: 20631 mov r8, rsi 20632 not r8 20633 add r8, r9 20634 mov rdi, r9 20635 and rdi, 3 20636 je .LBB3_685 20637 .LBB3_684: # =>This Inner Loop Header: Depth=1 20638 movzx r10d, byte ptr [rdx + rsi] 20639 xor eax, eax 20640 sub al, r10b 20641 mov byte ptr [rcx + rsi], al 20642 add rsi, 1 20643 add rdi, -1 20644 jne .LBB3_684 20645 .LBB3_685: 20646 cmp r8, 3 20647 jb .LBB3_923 20648 .LBB3_686: # =>This Inner Loop Header: Depth=1 20649 xor eax, eax 20650 sub al, byte ptr [rdx + rsi] 20651 mov byte ptr [rcx + rsi], al 20652 xor eax, eax 20653 sub al, byte ptr [rdx + rsi + 1] 20654 mov byte ptr [rcx + rsi + 1], al 20655 xor eax, eax 20656 sub al, byte ptr [rdx + rsi + 2] 20657 mov byte ptr [rcx + rsi + 2], al 20658 movzx eax, byte ptr [rdx + rsi + 3] 20659 xor edi, edi 20660 sub dil, al 20661 mov byte ptr [rcx + rsi + 3], dil 20662 add rsi, 4 20663 cmp r9, rsi 20664 jne .LBB3_686 20665 jmp .LBB3_923 20666 .LBB3_91: 20667 cmp edi, 2 20668 je .LBB3_197 20669 # %bb.92: 20670 cmp edi, 3 20671 jne .LBB3_923 20672 # %bb.93: 20673 test r8d, r8d 20674 jle .LBB3_923 20675 # %bb.94: 20676 mov r9d, r8d 20677 cmp r8d, 32 20678 jb .LBB3_95 20679 # %bb.256: 20680 lea rax, [rdx + r9] 20681 cmp rax, rcx 20682 jbe .LBB3_405 20683 # %bb.257: 20684 lea rax, [rcx + r9] 20685 cmp rax, rdx 20686 jbe .LBB3_405 20687 .LBB3_95: 20688 xor esi, esi 20689 .LBB3_691: 20690 mov rax, rsi 20691 not rax 20692 test r9b, 1 20693 je .LBB3_693 20694 # %bb.692: 20695 mov dil, byte ptr [rdx + rsi] 20696 test dil, dil 
20697 setne r8b 20698 neg r8b 20699 test dil, dil 20700 movzx r8d, r8b 20701 mov edi, 1 20702 cmovle edi, r8d 20703 mov byte ptr [rcx + rsi], dil 20704 or rsi, 1 20705 .LBB3_693: 20706 add rax, r9 20707 je .LBB3_923 20708 # %bb.694: 20709 mov edi, 1 20710 .LBB3_695: # =>This Inner Loop Header: Depth=1 20711 movzx r8d, byte ptr [rdx + rsi] 20712 test r8b, r8b 20713 setne al 20714 neg al 20715 test r8b, r8b 20716 movzx eax, al 20717 cmovg eax, edi 20718 mov byte ptr [rcx + rsi], al 20719 movzx r8d, byte ptr [rdx + rsi + 1] 20720 test r8b, r8b 20721 setne al 20722 neg al 20723 test r8b, r8b 20724 movzx eax, al 20725 cmovg eax, edi 20726 mov byte ptr [rcx + rsi + 1], al 20727 add rsi, 2 20728 cmp r9, rsi 20729 jne .LBB3_695 20730 jmp .LBB3_923 20731 .LBB3_96: 20732 cmp edi, 2 20733 je .LBB3_200 20734 # %bb.97: 20735 cmp edi, 3 20736 jne .LBB3_923 20737 # %bb.98: 20738 test r8d, r8d 20739 jle .LBB3_923 20740 # %bb.99: 20741 mov r9d, r8d 20742 cmp r8d, 16 20743 jb .LBB3_100 20744 # %bb.259: 20745 lea rax, [rdx + r9] 20746 cmp rax, rcx 20747 jbe .LBB3_408 20748 # %bb.260: 20749 lea rax, [rcx + r9] 20750 cmp rax, rdx 20751 jbe .LBB3_408 20752 .LBB3_100: 20753 xor esi, esi 20754 .LBB3_700: 20755 mov rax, rsi 20756 not rax 20757 test r9b, 1 20758 je .LBB3_702 20759 # %bb.701: 20760 movsx edi, byte ptr [rdx + rsi] 20761 mov r8d, edi 20762 sar r8d, 7 20763 add edi, r8d 20764 xor edi, r8d 20765 mov byte ptr [rcx + rsi], dil 20766 or rsi, 1 20767 .LBB3_702: 20768 add rax, r9 20769 je .LBB3_923 20770 .LBB3_703: # =>This Inner Loop Header: Depth=1 20771 movsx eax, byte ptr [rdx + rsi] 20772 mov edi, eax 20773 sar edi, 7 20774 add eax, edi 20775 xor eax, edi 20776 mov byte ptr [rcx + rsi], al 20777 movsx eax, byte ptr [rdx + rsi + 1] 20778 mov edi, eax 20779 sar edi, 7 20780 add eax, edi 20781 xor eax, edi 20782 mov byte ptr [rcx + rsi + 1], al 20783 add rsi, 2 20784 cmp r9, rsi 20785 jne .LBB3_703 20786 jmp .LBB3_923 20787 .LBB3_101: 20788 cmp edi, 2 20789 je .LBB3_203 20790 # 
%bb.102: 20791 cmp edi, 3 20792 jne .LBB3_923 20793 # %bb.103: 20794 test r8d, r8d 20795 jle .LBB3_923 20796 # %bb.104: 20797 mov r9d, r8d 20798 cmp r8d, 16 20799 jb .LBB3_105 20800 # %bb.262: 20801 lea rax, [rdx + r9] 20802 cmp rax, rcx 20803 jbe .LBB3_411 20804 # %bb.263: 20805 lea rax, [rcx + r9] 20806 cmp rax, rdx 20807 jbe .LBB3_411 20808 .LBB3_105: 20809 xor esi, esi 20810 .LBB3_708: 20811 mov rax, rsi 20812 not rax 20813 test r9b, 1 20814 je .LBB3_710 20815 # %bb.709: 20816 movsx edi, byte ptr [rdx + rsi] 20817 mov r8d, edi 20818 sar r8d, 7 20819 add edi, r8d 20820 xor edi, r8d 20821 mov byte ptr [rcx + rsi], dil 20822 or rsi, 1 20823 .LBB3_710: 20824 add rax, r9 20825 je .LBB3_923 20826 .LBB3_711: # =>This Inner Loop Header: Depth=1 20827 movsx eax, byte ptr [rdx + rsi] 20828 mov edi, eax 20829 sar edi, 7 20830 add eax, edi 20831 xor eax, edi 20832 mov byte ptr [rcx + rsi], al 20833 movsx eax, byte ptr [rdx + rsi + 1] 20834 mov edi, eax 20835 sar edi, 7 20836 add eax, edi 20837 xor eax, edi 20838 mov byte ptr [rcx + rsi + 1], al 20839 add rsi, 2 20840 cmp r9, rsi 20841 jne .LBB3_711 20842 jmp .LBB3_923 20843 .LBB3_106: 20844 cmp edi, 7 20845 je .LBB3_206 20846 # %bb.107: 20847 cmp edi, 8 20848 jne .LBB3_923 20849 # %bb.108: 20850 test r8d, r8d 20851 jle .LBB3_923 20852 # %bb.109: 20853 mov r9d, r8d 20854 cmp r8d, 4 20855 jae .LBB3_265 20856 # %bb.110: 20857 xor edx, edx 20858 jmp .LBB3_420 20859 .LBB3_111: 20860 cmp edi, 7 20861 je .LBB3_209 20862 # %bb.112: 20863 cmp edi, 8 20864 jne .LBB3_923 20865 # %bb.113: 20866 test r8d, r8d 20867 jle .LBB3_923 20868 # %bb.114: 20869 mov r9d, r8d 20870 cmp r8d, 4 20871 jb .LBB3_115 20872 # %bb.267: 20873 lea rax, [rdx + 8*r9] 20874 cmp rax, rcx 20875 jbe .LBB3_421 20876 # %bb.268: 20877 lea rax, [rcx + 8*r9] 20878 cmp rax, rdx 20879 jbe .LBB3_421 20880 .LBB3_115: 20881 xor esi, esi 20882 .LBB3_716: 20883 mov r8, rsi 20884 not r8 20885 add r8, r9 20886 mov rdi, r9 20887 and rdi, 3 20888 je .LBB3_718 20889 .LBB3_717: # 
=>This Inner Loop Header: Depth=1 20890 xor eax, eax 20891 sub rax, qword ptr [rdx + 8*rsi] 20892 mov qword ptr [rcx + 8*rsi], rax 20893 add rsi, 1 20894 add rdi, -1 20895 jne .LBB3_717 20896 .LBB3_718: 20897 cmp r8, 3 20898 jb .LBB3_923 20899 .LBB3_719: # =>This Inner Loop Header: Depth=1 20900 xor eax, eax 20901 sub rax, qword ptr [rdx + 8*rsi] 20902 mov qword ptr [rcx + 8*rsi], rax 20903 xor eax, eax 20904 sub rax, qword ptr [rdx + 8*rsi + 8] 20905 mov qword ptr [rcx + 8*rsi + 8], rax 20906 xor eax, eax 20907 sub rax, qword ptr [rdx + 8*rsi + 16] 20908 mov qword ptr [rcx + 8*rsi + 16], rax 20909 xor eax, eax 20910 sub rax, qword ptr [rdx + 8*rsi + 24] 20911 mov qword ptr [rcx + 8*rsi + 24], rax 20912 add rsi, 4 20913 cmp r9, rsi 20914 jne .LBB3_719 20915 jmp .LBB3_923 20916 .LBB3_116: 20917 cmp edi, 7 20918 je .LBB3_212 20919 # %bb.117: 20920 cmp edi, 8 20921 jne .LBB3_923 20922 # %bb.118: 20923 test r8d, r8d 20924 jle .LBB3_923 20925 # %bb.119: 20926 mov r9d, r8d 20927 cmp r8d, 4 20928 jb .LBB3_120 20929 # %bb.270: 20930 lea rax, [rdx + 8*r9] 20931 cmp rax, rcx 20932 jbe .LBB3_424 20933 # %bb.271: 20934 lea rax, [rcx + 8*r9] 20935 cmp rax, rdx 20936 jbe .LBB3_424 20937 .LBB3_120: 20938 xor esi, esi 20939 .LBB3_724: 20940 mov r8, rsi 20941 not r8 20942 add r8, r9 20943 mov rdi, r9 20944 and rdi, 3 20945 je .LBB3_726 20946 .LBB3_725: # =>This Inner Loop Header: Depth=1 20947 xor eax, eax 20948 cmp qword ptr [rdx + 8*rsi], 0 20949 setne al 20950 mov qword ptr [rcx + 8*rsi], rax 20951 add rsi, 1 20952 add rdi, -1 20953 jne .LBB3_725 20954 .LBB3_726: 20955 cmp r8, 3 20956 jb .LBB3_923 20957 .LBB3_727: # =>This Inner Loop Header: Depth=1 20958 xor eax, eax 20959 cmp qword ptr [rdx + 8*rsi], 0 20960 setne al 20961 mov qword ptr [rcx + 8*rsi], rax 20962 xor eax, eax 20963 cmp qword ptr [rdx + 8*rsi + 8], 0 20964 setne al 20965 mov qword ptr [rcx + 8*rsi + 8], rax 20966 xor eax, eax 20967 cmp qword ptr [rdx + 8*rsi + 16], 0 20968 setne al 20969 mov qword ptr [rcx + 
8*rsi + 16], rax 20970 xor eax, eax 20971 cmp qword ptr [rdx + 8*rsi + 24], 0 20972 setne al 20973 mov qword ptr [rcx + 8*rsi + 24], rax 20974 add rsi, 4 20975 cmp r9, rsi 20976 jne .LBB3_727 20977 jmp .LBB3_923 20978 .LBB3_121: 20979 cmp edi, 7 20980 je .LBB3_215 20981 # %bb.122: 20982 cmp edi, 8 20983 jne .LBB3_923 20984 # %bb.123: 20985 test r8d, r8d 20986 jle .LBB3_923 20987 # %bb.124: 20988 mov r9d, r8d 20989 cmp r8d, 4 20990 jb .LBB3_125 20991 # %bb.273: 20992 lea rax, [rdx + 8*r9] 20993 cmp rax, rcx 20994 jbe .LBB3_427 20995 # %bb.274: 20996 lea rax, [rcx + 8*r9] 20997 cmp rax, rdx 20998 jbe .LBB3_427 20999 .LBB3_125: 21000 xor esi, esi 21001 .LBB3_556: 21002 mov r8, rsi 21003 not r8 21004 add r8, r9 21005 mov rdi, r9 21006 and rdi, 3 21007 je .LBB3_558 21008 .LBB3_557: # =>This Inner Loop Header: Depth=1 21009 mov rax, qword ptr [rdx + 8*rsi] 21010 mov qword ptr [rcx + 8*rsi], rax 21011 add rsi, 1 21012 add rdi, -1 21013 jne .LBB3_557 21014 .LBB3_558: 21015 cmp r8, 3 21016 jb .LBB3_923 21017 .LBB3_559: # =>This Inner Loop Header: Depth=1 21018 mov rax, qword ptr [rdx + 8*rsi] 21019 mov qword ptr [rcx + 8*rsi], rax 21020 mov rax, qword ptr [rdx + 8*rsi + 8] 21021 mov qword ptr [rcx + 8*rsi + 8], rax 21022 mov rax, qword ptr [rdx + 8*rsi + 16] 21023 mov qword ptr [rcx + 8*rsi + 16], rax 21024 mov rax, qword ptr [rdx + 8*rsi + 24] 21025 mov qword ptr [rcx + 8*rsi + 24], rax 21026 add rsi, 4 21027 cmp r9, rsi 21028 jne .LBB3_559 21029 jmp .LBB3_923 21030 .LBB3_126: 21031 cmp edi, 7 21032 je .LBB3_218 21033 # %bb.127: 21034 cmp edi, 8 21035 jne .LBB3_923 21036 # %bb.128: 21037 test r8d, r8d 21038 jle .LBB3_923 21039 # %bb.129: 21040 mov r9d, r8d 21041 cmp r8d, 4 21042 jb .LBB3_130 21043 # %bb.276: 21044 lea rax, [rdx + 8*r9] 21045 cmp rax, rcx 21046 jbe .LBB3_429 21047 # %bb.277: 21048 lea rax, [rcx + 8*r9] 21049 cmp rax, rdx 21050 jbe .LBB3_429 21051 .LBB3_130: 21052 xor esi, esi 21053 .LBB3_566: 21054 mov r8, rsi 21055 not r8 21056 add r8, r9 21057 mov rdi, r9 
21058 and rdi, 3 21059 je .LBB3_568 21060 .LBB3_567: # =>This Inner Loop Header: Depth=1 21061 mov rax, qword ptr [rdx + 8*rsi] 21062 mov qword ptr [rcx + 8*rsi], rax 21063 add rsi, 1 21064 add rdi, -1 21065 jne .LBB3_567 21066 .LBB3_568: 21067 cmp r8, 3 21068 jb .LBB3_923 21069 .LBB3_569: # =>This Inner Loop Header: Depth=1 21070 mov rax, qword ptr [rdx + 8*rsi] 21071 mov qword ptr [rcx + 8*rsi], rax 21072 mov rax, qword ptr [rdx + 8*rsi + 8] 21073 mov qword ptr [rcx + 8*rsi + 8], rax 21074 mov rax, qword ptr [rdx + 8*rsi + 16] 21075 mov qword ptr [rcx + 8*rsi + 16], rax 21076 mov rax, qword ptr [rdx + 8*rsi + 24] 21077 mov qword ptr [rcx + 8*rsi + 24], rax 21078 add rsi, 4 21079 cmp r9, rsi 21080 jne .LBB3_569 21081 jmp .LBB3_923 21082 .LBB3_131: 21083 test r8d, r8d 21084 jle .LBB3_923 21085 # %bb.132: 21086 mov r9d, r8d 21087 cmp r8d, 16 21088 jae .LBB3_279 21089 # %bb.133: 21090 xor edx, edx 21091 jmp .LBB3_437 21092 .LBB3_134: 21093 test r8d, r8d 21094 jle .LBB3_923 21095 # %bb.135: 21096 mov r9d, r8d 21097 cmp r8d, 16 21098 jb .LBB3_136 21099 # %bb.281: 21100 lea rax, [rdx + 2*r9] 21101 cmp rax, rcx 21102 jbe .LBB3_438 21103 # %bb.282: 21104 lea rax, [rcx + 2*r9] 21105 cmp rax, rdx 21106 jbe .LBB3_438 21107 .LBB3_136: 21108 xor esi, esi 21109 .LBB3_732: 21110 mov r8, rsi 21111 not r8 21112 add r8, r9 21113 mov rdi, r9 21114 and rdi, 3 21115 je .LBB3_734 21116 .LBB3_733: # =>This Inner Loop Header: Depth=1 21117 xor eax, eax 21118 sub ax, word ptr [rdx + 2*rsi] 21119 mov word ptr [rcx + 2*rsi], ax 21120 add rsi, 1 21121 add rdi, -1 21122 jne .LBB3_733 21123 .LBB3_734: 21124 cmp r8, 3 21125 jb .LBB3_923 21126 .LBB3_735: # =>This Inner Loop Header: Depth=1 21127 xor eax, eax 21128 sub ax, word ptr [rdx + 2*rsi] 21129 mov word ptr [rcx + 2*rsi], ax 21130 xor eax, eax 21131 sub ax, word ptr [rdx + 2*rsi + 2] 21132 mov word ptr [rcx + 2*rsi + 2], ax 21133 xor eax, eax 21134 sub ax, word ptr [rdx + 2*rsi + 4] 21135 mov word ptr [rcx + 2*rsi + 4], ax 21136 xor eax, 
eax 21137 sub ax, word ptr [rdx + 2*rsi + 6] 21138 mov word ptr [rcx + 2*rsi + 6], ax 21139 add rsi, 4 21140 cmp r9, rsi 21141 jne .LBB3_735 21142 jmp .LBB3_923 21143 .LBB3_137: 21144 test r8d, r8d 21145 jle .LBB3_923 21146 # %bb.138: 21147 mov r9d, r8d 21148 cmp r8d, 16 21149 jb .LBB3_139 21150 # %bb.284: 21151 lea rax, [rdx + 2*r9] 21152 cmp rax, rcx 21153 jbe .LBB3_441 21154 # %bb.285: 21155 lea rax, [rcx + 2*r9] 21156 cmp rax, rdx 21157 jbe .LBB3_441 21158 .LBB3_139: 21159 xor esi, esi 21160 .LBB3_740: 21161 mov r8, rsi 21162 not r8 21163 add r8, r9 21164 mov rdi, r9 21165 and rdi, 3 21166 je .LBB3_742 21167 .LBB3_741: # =>This Inner Loop Header: Depth=1 21168 xor eax, eax 21169 sub ax, word ptr [rdx + 2*rsi] 21170 mov word ptr [rcx + 2*rsi], ax 21171 add rsi, 1 21172 add rdi, -1 21173 jne .LBB3_741 21174 .LBB3_742: 21175 cmp r8, 3 21176 jb .LBB3_923 21177 .LBB3_743: # =>This Inner Loop Header: Depth=1 21178 xor eax, eax 21179 sub ax, word ptr [rdx + 2*rsi] 21180 mov word ptr [rcx + 2*rsi], ax 21181 xor eax, eax 21182 sub ax, word ptr [rdx + 2*rsi + 2] 21183 mov word ptr [rcx + 2*rsi + 2], ax 21184 xor eax, eax 21185 sub ax, word ptr [rdx + 2*rsi + 4] 21186 mov word ptr [rcx + 2*rsi + 4], ax 21187 xor eax, eax 21188 sub ax, word ptr [rdx + 2*rsi + 6] 21189 mov word ptr [rcx + 2*rsi + 6], ax 21190 add rsi, 4 21191 cmp r9, rsi 21192 jne .LBB3_743 21193 jmp .LBB3_923 21194 .LBB3_140: 21195 test r8d, r8d 21196 jle .LBB3_923 21197 # %bb.141: 21198 mov r9d, r8d 21199 cmp r8d, 16 21200 jb .LBB3_142 21201 # %bb.287: 21202 lea rax, [rdx + 2*r9] 21203 cmp rax, rcx 21204 jbe .LBB3_444 21205 # %bb.288: 21206 lea rax, [rcx + 2*r9] 21207 cmp rax, rdx 21208 jbe .LBB3_444 21209 .LBB3_142: 21210 xor esi, esi 21211 .LBB3_748: 21212 mov r8, rsi 21213 not r8 21214 add r8, r9 21215 mov rdi, r9 21216 and rdi, 3 21217 je .LBB3_750 21218 .LBB3_749: # =>This Inner Loop Header: Depth=1 21219 xor eax, eax 21220 sub ax, word ptr [rdx + 2*rsi] 21221 mov word ptr [rcx + 2*rsi], ax 21222 add 
rsi, 1 21223 add rdi, -1 21224 jne .LBB3_749 21225 .LBB3_750: 21226 cmp r8, 3 21227 jb .LBB3_923 21228 .LBB3_751: # =>This Inner Loop Header: Depth=1 21229 xor eax, eax 21230 sub ax, word ptr [rdx + 2*rsi] 21231 mov word ptr [rcx + 2*rsi], ax 21232 xor eax, eax 21233 sub ax, word ptr [rdx + 2*rsi + 2] 21234 mov word ptr [rcx + 2*rsi + 2], ax 21235 xor eax, eax 21236 sub ax, word ptr [rdx + 2*rsi + 4] 21237 mov word ptr [rcx + 2*rsi + 4], ax 21238 xor eax, eax 21239 sub ax, word ptr [rdx + 2*rsi + 6] 21240 mov word ptr [rcx + 2*rsi + 6], ax 21241 add rsi, 4 21242 cmp r9, rsi 21243 jne .LBB3_751 21244 jmp .LBB3_923 21245 .LBB3_143: 21246 test r8d, r8d 21247 jle .LBB3_923 21248 # %bb.144: 21249 mov r9d, r8d 21250 cmp r8d, 16 21251 jb .LBB3_145 21252 # %bb.290: 21253 lea rax, [rdx + 2*r9] 21254 cmp rax, rcx 21255 jbe .LBB3_447 21256 # %bb.291: 21257 lea rax, [rcx + 2*r9] 21258 cmp rax, rdx 21259 jbe .LBB3_447 21260 .LBB3_145: 21261 xor esi, esi 21262 .LBB3_756: 21263 mov r8, rsi 21264 not r8 21265 add r8, r9 21266 mov rdi, r9 21267 and rdi, 3 21268 je .LBB3_758 21269 .LBB3_757: # =>This Inner Loop Header: Depth=1 21270 xor eax, eax 21271 cmp word ptr [rdx + 2*rsi], 0 21272 setne al 21273 mov word ptr [rcx + 2*rsi], ax 21274 add rsi, 1 21275 add rdi, -1 21276 jne .LBB3_757 21277 .LBB3_758: 21278 cmp r8, 3 21279 jb .LBB3_923 21280 .LBB3_759: # =>This Inner Loop Header: Depth=1 21281 xor eax, eax 21282 cmp word ptr [rdx + 2*rsi], 0 21283 setne al 21284 mov word ptr [rcx + 2*rsi], ax 21285 xor eax, eax 21286 cmp word ptr [rdx + 2*rsi + 2], 0 21287 setne al 21288 mov word ptr [rcx + 2*rsi + 2], ax 21289 xor eax, eax 21290 cmp word ptr [rdx + 2*rsi + 4], 0 21291 setne al 21292 mov word ptr [rcx + 2*rsi + 4], ax 21293 xor eax, eax 21294 cmp word ptr [rdx + 2*rsi + 6], 0 21295 setne al 21296 mov word ptr [rcx + 2*rsi + 6], ax 21297 add rsi, 4 21298 cmp r9, rsi 21299 jne .LBB3_759 21300 jmp .LBB3_923 21301 .LBB3_146: 21302 test r8d, r8d 21303 jle .LBB3_923 21304 # %bb.147: 
21305 mov r9d, r8d 21306 cmp r8d, 16 21307 jb .LBB3_148 21308 # %bb.293: 21309 lea rax, [rdx + 2*r9] 21310 cmp rax, rcx 21311 jbe .LBB3_450 21312 # %bb.294: 21313 lea rax, [rcx + 2*r9] 21314 cmp rax, rdx 21315 jbe .LBB3_450 21316 .LBB3_148: 21317 xor esi, esi 21318 .LBB3_764: 21319 mov rax, rsi 21320 not rax 21321 test r9b, 1 21322 je .LBB3_766 21323 # %bb.765: 21324 movzx r8d, word ptr [rdx + 2*rsi] 21325 xor r10d, r10d 21326 test r8w, r8w 21327 setne r10b 21328 neg r10d 21329 test r8w, r8w 21330 mov edi, 1 21331 cmovle edi, r10d 21332 mov word ptr [rcx + 2*rsi], di 21333 or rsi, 1 21334 .LBB3_766: 21335 add rax, r9 21336 je .LBB3_923 21337 # %bb.767: 21338 mov r8d, 1 21339 .LBB3_768: # =>This Inner Loop Header: Depth=1 21340 movzx edi, word ptr [rdx + 2*rsi] 21341 xor eax, eax 21342 test di, di 21343 setne al 21344 neg eax 21345 test di, di 21346 cmovg eax, r8d 21347 mov word ptr [rcx + 2*rsi], ax 21348 movzx eax, word ptr [rdx + 2*rsi + 2] 21349 xor edi, edi 21350 test ax, ax 21351 setne dil 21352 neg edi 21353 test ax, ax 21354 cmovg edi, r8d 21355 mov word ptr [rcx + 2*rsi + 2], di 21356 add rsi, 2 21357 cmp r9, rsi 21358 jne .LBB3_768 21359 jmp .LBB3_923 21360 .LBB3_149: 21361 test r8d, r8d 21362 jle .LBB3_923 21363 # %bb.150: 21364 mov r9d, r8d 21365 cmp r8d, 16 21366 jb .LBB3_151 21367 # %bb.296: 21368 lea rax, [rdx + 2*r9] 21369 cmp rax, rcx 21370 jbe .LBB3_453 21371 # %bb.297: 21372 lea rax, [rcx + 2*r9] 21373 cmp rax, rdx 21374 jbe .LBB3_453 21375 .LBB3_151: 21376 xor esi, esi 21377 .LBB3_576: 21378 mov r8, rsi 21379 not r8 21380 add r8, r9 21381 mov rdi, r9 21382 and rdi, 3 21383 je .LBB3_578 21384 .LBB3_577: # =>This Inner Loop Header: Depth=1 21385 movzx eax, word ptr [rdx + 2*rsi] 21386 mov word ptr [rcx + 2*rsi], ax 21387 add rsi, 1 21388 add rdi, -1 21389 jne .LBB3_577 21390 .LBB3_578: 21391 cmp r8, 3 21392 jb .LBB3_923 21393 .LBB3_579: # =>This Inner Loop Header: Depth=1 21394 movzx eax, word ptr [rdx + 2*rsi] 21395 mov word ptr [rcx + 2*rsi], ax 
21396 movzx eax, word ptr [rdx + 2*rsi + 2] 21397 mov word ptr [rcx + 2*rsi + 2], ax 21398 movzx eax, word ptr [rdx + 2*rsi + 4] 21399 mov word ptr [rcx + 2*rsi + 4], ax 21400 movzx eax, word ptr [rdx + 2*rsi + 6] 21401 mov word ptr [rcx + 2*rsi + 6], ax 21402 add rsi, 4 21403 cmp r9, rsi 21404 jne .LBB3_579 21405 jmp .LBB3_923 21406 .LBB3_152: 21407 test r8d, r8d 21408 jle .LBB3_923 21409 # %bb.153: 21410 mov r9d, r8d 21411 cmp r8d, 8 21412 jb .LBB3_154 21413 # %bb.299: 21414 lea rax, [rdx + 2*r9] 21415 cmp rax, rcx 21416 jbe .LBB3_455 21417 # %bb.300: 21418 lea rax, [rcx + 2*r9] 21419 cmp rax, rdx 21420 jbe .LBB3_455 21421 .LBB3_154: 21422 xor esi, esi 21423 .LBB3_773: 21424 mov rax, rsi 21425 not rax 21426 test r9b, 1 21427 je .LBB3_775 21428 # %bb.774: 21429 movsx edi, word ptr [rdx + 2*rsi] 21430 mov r8d, edi 21431 sar r8d, 15 21432 add edi, r8d 21433 xor edi, r8d 21434 mov word ptr [rcx + 2*rsi], di 21435 or rsi, 1 21436 .LBB3_775: 21437 add rax, r9 21438 je .LBB3_923 21439 .LBB3_776: # =>This Inner Loop Header: Depth=1 21440 movsx eax, word ptr [rdx + 2*rsi] 21441 mov edi, eax 21442 sar edi, 15 21443 add eax, edi 21444 xor eax, edi 21445 mov word ptr [rcx + 2*rsi], ax 21446 movsx eax, word ptr [rdx + 2*rsi + 2] 21447 mov edi, eax 21448 sar edi, 15 21449 add eax, edi 21450 xor eax, edi 21451 mov word ptr [rcx + 2*rsi + 2], ax 21452 add rsi, 2 21453 cmp r9, rsi 21454 jne .LBB3_776 21455 jmp .LBB3_923 21456 .LBB3_155: 21457 test r8d, r8d 21458 jle .LBB3_923 21459 # %bb.156: 21460 mov r9d, r8d 21461 cmp r8d, 16 21462 jb .LBB3_157 21463 # %bb.302: 21464 lea rax, [rdx + 2*r9] 21465 cmp rax, rcx 21466 jbe .LBB3_458 21467 # %bb.303: 21468 lea rax, [rcx + 2*r9] 21469 cmp rax, rdx 21470 jbe .LBB3_458 21471 .LBB3_157: 21472 xor esi, esi 21473 .LBB3_586: 21474 mov r8, rsi 21475 not r8 21476 add r8, r9 21477 mov rdi, r9 21478 and rdi, 3 21479 je .LBB3_588 21480 .LBB3_587: # =>This Inner Loop Header: Depth=1 21481 movzx eax, word ptr [rdx + 2*rsi] 21482 mov word ptr [rcx 
+ 2*rsi], ax 21483 add rsi, 1 21484 add rdi, -1 21485 jne .LBB3_587 21486 .LBB3_588: 21487 cmp r8, 3 21488 jb .LBB3_923 21489 .LBB3_589: # =>This Inner Loop Header: Depth=1 21490 movzx eax, word ptr [rdx + 2*rsi] 21491 mov word ptr [rcx + 2*rsi], ax 21492 movzx eax, word ptr [rdx + 2*rsi + 2] 21493 mov word ptr [rcx + 2*rsi + 2], ax 21494 movzx eax, word ptr [rdx + 2*rsi + 4] 21495 mov word ptr [rcx + 2*rsi + 4], ax 21496 movzx eax, word ptr [rdx + 2*rsi + 6] 21497 mov word ptr [rcx + 2*rsi + 6], ax 21498 add rsi, 4 21499 cmp r9, rsi 21500 jne .LBB3_589 21501 jmp .LBB3_923 21502 .LBB3_158: 21503 test r8d, r8d 21504 jle .LBB3_923 21505 # %bb.159: 21506 mov r9d, r8d 21507 cmp r8d, 8 21508 jb .LBB3_160 21509 # %bb.305: 21510 lea rax, [rdx + 2*r9] 21511 cmp rax, rcx 21512 jbe .LBB3_460 21513 # %bb.306: 21514 lea rax, [rcx + 2*r9] 21515 cmp rax, rdx 21516 jbe .LBB3_460 21517 .LBB3_160: 21518 xor esi, esi 21519 .LBB3_781: 21520 mov rax, rsi 21521 not rax 21522 test r9b, 1 21523 je .LBB3_783 21524 # %bb.782: 21525 movsx edi, word ptr [rdx + 2*rsi] 21526 mov r8d, edi 21527 sar r8d, 15 21528 add edi, r8d 21529 xor edi, r8d 21530 mov word ptr [rcx + 2*rsi], di 21531 or rsi, 1 21532 .LBB3_783: 21533 add rax, r9 21534 je .LBB3_923 21535 .LBB3_784: # =>This Inner Loop Header: Depth=1 21536 movsx eax, word ptr [rdx + 2*rsi] 21537 mov edi, eax 21538 sar edi, 15 21539 add eax, edi 21540 xor eax, edi 21541 mov word ptr [rcx + 2*rsi], ax 21542 movsx eax, word ptr [rdx + 2*rsi + 2] 21543 mov edi, eax 21544 sar edi, 15 21545 add eax, edi 21546 xor eax, edi 21547 mov word ptr [rcx + 2*rsi + 2], ax 21548 add rsi, 2 21549 cmp r9, rsi 21550 jne .LBB3_784 21551 jmp .LBB3_923 21552 .LBB3_161: 21553 test r8d, r8d 21554 jle .LBB3_923 21555 # %bb.162: 21556 mov r9d, r8d 21557 cmp r8d, 4 21558 jb .LBB3_163 21559 # %bb.308: 21560 lea rax, [rdx + 8*r9] 21561 cmp rax, rcx 21562 jbe .LBB3_463 21563 # %bb.309: 21564 lea rax, [rcx + 8*r9] 21565 cmp rax, rdx 21566 jbe .LBB3_463 21567 .LBB3_163: 21568 
xor esi, esi 21569 .LBB3_789: 21570 mov r8, rsi 21571 not r8 21572 add r8, r9 21573 mov rdi, r9 21574 and rdi, 3 21575 je .LBB3_791 21576 .LBB3_790: # =>This Inner Loop Header: Depth=1 21577 xor eax, eax 21578 sub rax, qword ptr [rdx + 8*rsi] 21579 mov qword ptr [rcx + 8*rsi], rax 21580 add rsi, 1 21581 add rdi, -1 21582 jne .LBB3_790 21583 .LBB3_791: 21584 cmp r8, 3 21585 jb .LBB3_923 21586 .LBB3_792: # =>This Inner Loop Header: Depth=1 21587 xor eax, eax 21588 sub rax, qword ptr [rdx + 8*rsi] 21589 mov qword ptr [rcx + 8*rsi], rax 21590 xor eax, eax 21591 sub rax, qword ptr [rdx + 8*rsi + 8] 21592 mov qword ptr [rcx + 8*rsi + 8], rax 21593 xor eax, eax 21594 sub rax, qword ptr [rdx + 8*rsi + 16] 21595 mov qword ptr [rcx + 8*rsi + 16], rax 21596 xor eax, eax 21597 sub rax, qword ptr [rdx + 8*rsi + 24] 21598 mov qword ptr [rcx + 8*rsi + 24], rax 21599 add rsi, 4 21600 cmp r9, rsi 21601 jne .LBB3_792 21602 jmp .LBB3_923 21603 .LBB3_164: 21604 test r8d, r8d 21605 jle .LBB3_923 21606 # %bb.165: 21607 mov r9d, r8d 21608 cmp r8d, 8 21609 jb .LBB3_166 21610 # %bb.311: 21611 lea rax, [rdx + 4*r9] 21612 cmp rax, rcx 21613 jbe .LBB3_466 21614 # %bb.312: 21615 lea rax, [rcx + 4*r9] 21616 cmp rax, rdx 21617 jbe .LBB3_466 21618 .LBB3_166: 21619 xor esi, esi 21620 .LBB3_797: 21621 mov rax, rsi 21622 not rax 21623 add rax, r9 21624 mov rdi, r9 21625 and rdi, 3 21626 je .LBB3_800 21627 # %bb.798: 21628 movapd xmm0, xmmword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 21629 .LBB3_799: # =>This Inner Loop Header: Depth=1 21630 movss xmm1, dword ptr [rdx + 4*rsi] # xmm1 = mem[0],zero,zero,zero 21631 xorpd xmm1, xmm0 21632 movss dword ptr [rcx + 4*rsi], xmm1 21633 add rsi, 1 21634 add rdi, -1 21635 jne .LBB3_799 21636 .LBB3_800: 21637 cmp rax, 3 21638 jb .LBB3_923 21639 # %bb.801: 21640 movapd xmm0, xmmword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 21641 .LBB3_802: # =>This Inner Loop Header: Depth=1 21642 movss xmm1, dword ptr [rdx + 4*rsi] # 
xmm1 = mem[0],zero,zero,zero 21643 xorpd xmm1, xmm0 21644 movss dword ptr [rcx + 4*rsi], xmm1 21645 movss xmm1, dword ptr [rdx + 4*rsi + 4] # xmm1 = mem[0],zero,zero,zero 21646 xorpd xmm1, xmm0 21647 movss dword ptr [rcx + 4*rsi + 4], xmm1 21648 movss xmm1, dword ptr [rdx + 4*rsi + 8] # xmm1 = mem[0],zero,zero,zero 21649 xorpd xmm1, xmm0 21650 movss dword ptr [rcx + 4*rsi + 8], xmm1 21651 movss xmm1, dword ptr [rdx + 4*rsi + 12] # xmm1 = mem[0],zero,zero,zero 21652 xorpd xmm1, xmm0 21653 movss dword ptr [rcx + 4*rsi + 12], xmm1 21654 add rsi, 4 21655 cmp r9, rsi 21656 jne .LBB3_802 21657 jmp .LBB3_923 21658 .LBB3_167: 21659 test r8d, r8d 21660 jle .LBB3_923 21661 # %bb.168: 21662 mov r9d, r8d 21663 cmp r8d, 4 21664 jb .LBB3_169 21665 # %bb.314: 21666 lea rax, [rdx + 8*r9] 21667 cmp rax, rcx 21668 jbe .LBB3_469 21669 # %bb.315: 21670 lea rax, [rcx + 8*r9] 21671 cmp rax, rdx 21672 jbe .LBB3_469 21673 .LBB3_169: 21674 xor esi, esi 21675 .LBB3_807: 21676 mov r8, rsi 21677 not r8 21678 add r8, r9 21679 mov rdi, r9 21680 and rdi, 3 21681 je .LBB3_809 21682 .LBB3_808: # =>This Inner Loop Header: Depth=1 21683 xor eax, eax 21684 sub rax, qword ptr [rdx + 8*rsi] 21685 mov qword ptr [rcx + 8*rsi], rax 21686 add rsi, 1 21687 add rdi, -1 21688 jne .LBB3_808 21689 .LBB3_809: 21690 cmp r8, 3 21691 jb .LBB3_923 21692 .LBB3_810: # =>This Inner Loop Header: Depth=1 21693 xor eax, eax 21694 sub rax, qword ptr [rdx + 8*rsi] 21695 mov qword ptr [rcx + 8*rsi], rax 21696 xor eax, eax 21697 sub rax, qword ptr [rdx + 8*rsi + 8] 21698 mov qword ptr [rcx + 8*rsi + 8], rax 21699 xor eax, eax 21700 sub rax, qword ptr [rdx + 8*rsi + 16] 21701 mov qword ptr [rcx + 8*rsi + 16], rax 21702 xor eax, eax 21703 sub rax, qword ptr [rdx + 8*rsi + 24] 21704 mov qword ptr [rcx + 8*rsi + 24], rax 21705 add rsi, 4 21706 cmp r9, rsi 21707 jne .LBB3_810 21708 jmp .LBB3_923 21709 .LBB3_170: 21710 test r8d, r8d 21711 jle .LBB3_923 21712 # %bb.171: 21713 mov r9d, r8d 21714 cmp r8d, 8 21715 jb .LBB3_172 21716 # 
%bb.317: 21717 lea rax, [rdx + 4*r9] 21718 cmp rax, rcx 21719 jbe .LBB3_472 21720 # %bb.318: 21721 lea rax, [rcx + 4*r9] 21722 cmp rax, rdx 21723 jbe .LBB3_472 21724 .LBB3_172: 21725 xor esi, esi 21726 .LBB3_815: 21727 mov rax, rsi 21728 not rax 21729 add rax, r9 21730 mov rdi, r9 21731 and rdi, 3 21732 je .LBB3_818 21733 # %bb.816: 21734 movapd xmm0, xmmword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 21735 .LBB3_817: # =>This Inner Loop Header: Depth=1 21736 movss xmm1, dword ptr [rdx + 4*rsi] # xmm1 = mem[0],zero,zero,zero 21737 xorpd xmm1, xmm0 21738 movss dword ptr [rcx + 4*rsi], xmm1 21739 add rsi, 1 21740 add rdi, -1 21741 jne .LBB3_817 21742 .LBB3_818: 21743 cmp rax, 3 21744 jb .LBB3_923 21745 # %bb.819: 21746 movapd xmm0, xmmword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 21747 .LBB3_820: # =>This Inner Loop Header: Depth=1 21748 movss xmm1, dword ptr [rdx + 4*rsi] # xmm1 = mem[0],zero,zero,zero 21749 xorpd xmm1, xmm0 21750 movss dword ptr [rcx + 4*rsi], xmm1 21751 movss xmm1, dword ptr [rdx + 4*rsi + 4] # xmm1 = mem[0],zero,zero,zero 21752 xorpd xmm1, xmm0 21753 movss dword ptr [rcx + 4*rsi + 4], xmm1 21754 movss xmm1, dword ptr [rdx + 4*rsi + 8] # xmm1 = mem[0],zero,zero,zero 21755 xorpd xmm1, xmm0 21756 movss dword ptr [rcx + 4*rsi + 8], xmm1 21757 movss xmm1, dword ptr [rdx + 4*rsi + 12] # xmm1 = mem[0],zero,zero,zero 21758 xorpd xmm1, xmm0 21759 movss dword ptr [rcx + 4*rsi + 12], xmm1 21760 add rsi, 4 21761 cmp r9, rsi 21762 jne .LBB3_820 21763 jmp .LBB3_923 21764 .LBB3_173: 21765 test r8d, r8d 21766 jle .LBB3_923 21767 # %bb.174: 21768 mov r9d, r8d 21769 cmp r8d, 4 21770 jb .LBB3_175 21771 # %bb.320: 21772 lea rax, [rdx + 8*r9] 21773 cmp rax, rcx 21774 jbe .LBB3_475 21775 # %bb.321: 21776 lea rax, [rcx + 8*r9] 21777 cmp rax, rdx 21778 jbe .LBB3_475 21779 .LBB3_175: 21780 xor esi, esi 21781 .LBB3_825: 21782 mov rax, rsi 21783 not rax 21784 test r9b, 1 21785 je .LBB3_827 21786 # %bb.826: 21787 mov r8, qword ptr 
[rdx + 8*rsi] 21788 xor r10d, r10d 21789 test r8, r8 21790 setne r10b 21791 neg r10 21792 test r8, r8 21793 mov edi, 1 21794 cmovle rdi, r10 21795 mov qword ptr [rcx + 8*rsi], rdi 21796 or rsi, 1 21797 .LBB3_827: 21798 add rax, r9 21799 je .LBB3_923 21800 # %bb.828: 21801 mov r8d, 1 21802 .LBB3_829: # =>This Inner Loop Header: Depth=1 21803 mov rdi, qword ptr [rdx + 8*rsi] 21804 xor eax, eax 21805 test rdi, rdi 21806 setne al 21807 neg rax 21808 test rdi, rdi 21809 cmovg rax, r8 21810 mov qword ptr [rcx + 8*rsi], rax 21811 mov rax, qword ptr [rdx + 8*rsi + 8] 21812 xor edi, edi 21813 test rax, rax 21814 setne dil 21815 neg rdi 21816 test rax, rax 21817 cmovg rdi, r8 21818 mov qword ptr [rcx + 8*rsi + 8], rdi 21819 add rsi, 2 21820 cmp r9, rsi 21821 jne .LBB3_829 21822 jmp .LBB3_923 21823 .LBB3_176: 21824 test r8d, r8d 21825 jle .LBB3_923 21826 # %bb.177: 21827 mov eax, r8d 21828 cmp r8d, 8 21829 jb .LBB3_178 21830 # %bb.323: 21831 lea rsi, [rdx + 4*rax] 21832 cmp rsi, rcx 21833 jbe .LBB3_478 21834 # %bb.324: 21835 lea rsi, [rcx + 4*rax] 21836 cmp rsi, rdx 21837 jbe .LBB3_478 21838 .LBB3_178: 21839 xor esi, esi 21840 .LBB3_481: 21841 mov r8, rsi 21842 not r8 21843 test al, 1 21844 je .LBB3_483 21845 # %bb.482: 21846 movss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero 21847 movmskps edi, xmm0 21848 and edi, 1 21849 neg edi 21850 or edi, 1 21851 xorps xmm1, xmm1 21852 cvtsi2ss xmm1, edi 21853 xorps xmm2, xmm2 21854 cmpeqss xmm2, xmm0 21855 andnps xmm2, xmm1 21856 movss dword ptr [rcx + 4*rsi], xmm2 21857 or rsi, 1 21858 .LBB3_483: 21859 add r8, rax 21860 je .LBB3_923 21861 # %bb.484: 21862 xorps xmm0, xmm0 21863 .LBB3_485: # =>This Inner Loop Header: Depth=1 21864 movss xmm1, dword ptr [rdx + 4*rsi] # xmm1 = mem[0],zero,zero,zero 21865 movmskps edi, xmm1 21866 and edi, 1 21867 neg edi 21868 or edi, 1 21869 xorps xmm2, xmm2 21870 cvtsi2ss xmm2, edi 21871 cmpeqss xmm1, xmm0 21872 andnps xmm1, xmm2 21873 movss dword ptr [rcx + 4*rsi], xmm1 21874 movss 
xmm1, dword ptr [rdx + 4*rsi + 4] # xmm1 = mem[0],zero,zero,zero 21875 movmskps edi, xmm1 21876 and edi, 1 21877 neg edi 21878 or edi, 1 21879 xorps xmm2, xmm2 21880 cvtsi2ss xmm2, edi 21881 cmpeqss xmm1, xmm0 21882 andnps xmm1, xmm2 21883 movss dword ptr [rcx + 4*rsi + 4], xmm1 21884 add rsi, 2 21885 cmp rax, rsi 21886 jne .LBB3_485 21887 jmp .LBB3_923 21888 .LBB3_179: 21889 test r8d, r8d 21890 jle .LBB3_923 21891 # %bb.180: 21892 mov r9d, r8d 21893 cmp r8d, 4 21894 jb .LBB3_181 21895 # %bb.326: 21896 lea rax, [rdx + 8*r9] 21897 cmp rax, rcx 21898 jbe .LBB3_486 21899 # %bb.327: 21900 lea rax, [rcx + 8*r9] 21901 cmp rax, rdx 21902 jbe .LBB3_486 21903 .LBB3_181: 21904 xor esi, esi 21905 .LBB3_834: 21906 mov rax, rsi 21907 not rax 21908 test r9b, 1 21909 je .LBB3_836 21910 # %bb.835: 21911 mov r8, qword ptr [rdx + 8*rsi] 21912 mov rdi, r8 21913 neg rdi 21914 cmovl rdi, r8 21915 mov qword ptr [rcx + 8*rsi], rdi 21916 or rsi, 1 21917 .LBB3_836: 21918 add rax, r9 21919 je .LBB3_923 21920 .LBB3_837: # =>This Inner Loop Header: Depth=1 21921 mov rax, qword ptr [rdx + 8*rsi] 21922 mov rdi, rax 21923 neg rdi 21924 cmovl rdi, rax 21925 mov qword ptr [rcx + 8*rsi], rdi 21926 mov rax, qword ptr [rdx + 8*rsi + 8] 21927 mov rdi, rax 21928 neg rdi 21929 cmovl rdi, rax 21930 mov qword ptr [rcx + 8*rsi + 8], rdi 21931 add rsi, 2 21932 cmp r9, rsi 21933 jne .LBB3_837 21934 jmp .LBB3_923 21935 .LBB3_182: 21936 test r8d, r8d 21937 jle .LBB3_923 21938 # %bb.183: 21939 mov r9d, r8d 21940 cmp r8d, 8 21941 jb .LBB3_184 21942 # %bb.329: 21943 lea rax, [rdx + 4*r9] 21944 cmp rax, rcx 21945 jbe .LBB3_489 21946 # %bb.330: 21947 lea rax, [rcx + 4*r9] 21948 cmp rax, rdx 21949 jbe .LBB3_489 21950 .LBB3_184: 21951 xor esi, esi 21952 .LBB3_842: 21953 mov r8, rsi 21954 not r8 21955 add r8, r9 21956 mov rdi, r9 21957 and rdi, 3 21958 je .LBB3_845 21959 # %bb.843: 21960 mov r10d, 2147483647 21961 .LBB3_844: # =>This Inner Loop Header: Depth=1 21962 mov eax, dword ptr [rdx + 4*rsi] 21963 and eax, r10d 
21964 mov dword ptr [rcx + 4*rsi], eax 21965 add rsi, 1 21966 add rdi, -1 21967 jne .LBB3_844 21968 .LBB3_845: 21969 cmp r8, 3 21970 jb .LBB3_923 21971 # %bb.846: 21972 mov eax, 2147483647 21973 .LBB3_847: # =>This Inner Loop Header: Depth=1 21974 mov edi, dword ptr [rdx + 4*rsi] 21975 and edi, eax 21976 mov dword ptr [rcx + 4*rsi], edi 21977 mov edi, dword ptr [rdx + 4*rsi + 4] 21978 and edi, eax 21979 mov dword ptr [rcx + 4*rsi + 4], edi 21980 mov edi, dword ptr [rdx + 4*rsi + 8] 21981 and edi, eax 21982 mov dword ptr [rcx + 4*rsi + 8], edi 21983 mov edi, dword ptr [rdx + 4*rsi + 12] 21984 and edi, eax 21985 mov dword ptr [rcx + 4*rsi + 12], edi 21986 add rsi, 4 21987 cmp r9, rsi 21988 jne .LBB3_847 21989 jmp .LBB3_923 21990 .LBB3_185: 21991 test r8d, r8d 21992 jle .LBB3_923 21993 # %bb.186: 21994 mov r9d, r8d 21995 cmp r8d, 4 21996 jb .LBB3_187 21997 # %bb.332: 21998 lea rax, [rdx + 8*r9] 21999 cmp rax, rcx 22000 jbe .LBB3_492 22001 # %bb.333: 22002 lea rax, [rcx + 8*r9] 22003 cmp rax, rdx 22004 jbe .LBB3_492 22005 .LBB3_187: 22006 xor esi, esi 22007 .LBB3_852: 22008 mov rax, rsi 22009 not rax 22010 test r9b, 1 22011 je .LBB3_854 22012 # %bb.853: 22013 mov r8, qword ptr [rdx + 8*rsi] 22014 mov rdi, r8 22015 neg rdi 22016 cmovl rdi, r8 22017 mov qword ptr [rcx + 8*rsi], rdi 22018 or rsi, 1 22019 .LBB3_854: 22020 add rax, r9 22021 je .LBB3_923 22022 .LBB3_855: # =>This Inner Loop Header: Depth=1 22023 mov rax, qword ptr [rdx + 8*rsi] 22024 mov rdi, rax 22025 neg rdi 22026 cmovl rdi, rax 22027 mov qword ptr [rcx + 8*rsi], rdi 22028 mov rax, qword ptr [rdx + 8*rsi + 8] 22029 mov rdi, rax 22030 neg rdi 22031 cmovl rdi, rax 22032 mov qword ptr [rcx + 8*rsi + 8], rdi 22033 add rsi, 2 22034 cmp r9, rsi 22035 jne .LBB3_855 22036 jmp .LBB3_923 22037 .LBB3_188: 22038 test r8d, r8d 22039 jle .LBB3_923 22040 # %bb.189: 22041 mov r9d, r8d 22042 cmp r8d, 8 22043 jb .LBB3_190 22044 # %bb.335: 22045 lea rax, [rdx + 4*r9] 22046 cmp rax, rcx 22047 jbe .LBB3_495 22048 # %bb.336: 
22049 lea rax, [rcx + 4*r9] 22050 cmp rax, rdx 22051 jbe .LBB3_495 22052 .LBB3_190: 22053 xor esi, esi 22054 .LBB3_860: 22055 mov r8, rsi 22056 not r8 22057 add r8, r9 22058 mov rdi, r9 22059 and rdi, 3 22060 je .LBB3_863 22061 # %bb.861: 22062 mov r10d, 2147483647 22063 .LBB3_862: # =>This Inner Loop Header: Depth=1 22064 mov eax, dword ptr [rdx + 4*rsi] 22065 and eax, r10d 22066 mov dword ptr [rcx + 4*rsi], eax 22067 add rsi, 1 22068 add rdi, -1 22069 jne .LBB3_862 22070 .LBB3_863: 22071 cmp r8, 3 22072 jb .LBB3_923 22073 # %bb.864: 22074 mov eax, 2147483647 22075 .LBB3_865: # =>This Inner Loop Header: Depth=1 22076 mov edi, dword ptr [rdx + 4*rsi] 22077 and edi, eax 22078 mov dword ptr [rcx + 4*rsi], edi 22079 mov edi, dword ptr [rdx + 4*rsi + 4] 22080 and edi, eax 22081 mov dword ptr [rcx + 4*rsi + 4], edi 22082 mov edi, dword ptr [rdx + 4*rsi + 8] 22083 and edi, eax 22084 mov dword ptr [rcx + 4*rsi + 8], edi 22085 mov edi, dword ptr [rdx + 4*rsi + 12] 22086 and edi, eax 22087 mov dword ptr [rcx + 4*rsi + 12], edi 22088 add rsi, 4 22089 cmp r9, rsi 22090 jne .LBB3_865 22091 jmp .LBB3_923 22092 .LBB3_191: 22093 test r8d, r8d 22094 jle .LBB3_923 22095 # %bb.192: 22096 mov r9d, r8d 22097 cmp r8d, 32 22098 jae .LBB3_338 22099 # %bb.193: 22100 xor edx, edx 22101 jmp .LBB3_504 22102 .LBB3_194: 22103 test r8d, r8d 22104 jle .LBB3_923 22105 # %bb.195: 22106 mov r9d, r8d 22107 cmp r8d, 32 22108 jb .LBB3_196 22109 # %bb.340: 22110 lea rax, [rdx + r9] 22111 cmp rax, rcx 22112 jbe .LBB3_505 22113 # %bb.341: 22114 lea rax, [rcx + r9] 22115 cmp rax, rdx 22116 jbe .LBB3_505 22117 .LBB3_196: 22118 xor esi, esi 22119 .LBB3_870: 22120 mov r8, rsi 22121 not r8 22122 add r8, r9 22123 mov rdi, r9 22124 and rdi, 3 22125 je .LBB3_872 22126 .LBB3_871: # =>This Inner Loop Header: Depth=1 22127 movzx r10d, byte ptr [rdx + rsi] 22128 xor eax, eax 22129 sub al, r10b 22130 mov byte ptr [rcx + rsi], al 22131 add rsi, 1 22132 add rdi, -1 22133 jne .LBB3_871 22134 .LBB3_872: 22135 cmp r8, 3 
22136 jb .LBB3_923 22137 .LBB3_873: # =>This Inner Loop Header: Depth=1 22138 xor eax, eax 22139 sub al, byte ptr [rdx + rsi] 22140 mov byte ptr [rcx + rsi], al 22141 xor eax, eax 22142 sub al, byte ptr [rdx + rsi + 1] 22143 mov byte ptr [rcx + rsi + 1], al 22144 xor eax, eax 22145 sub al, byte ptr [rdx + rsi + 2] 22146 mov byte ptr [rcx + rsi + 2], al 22147 movzx eax, byte ptr [rdx + rsi + 3] 22148 xor edi, edi 22149 sub dil, al 22150 mov byte ptr [rcx + rsi + 3], dil 22151 add rsi, 4 22152 cmp r9, rsi 22153 jne .LBB3_873 22154 jmp .LBB3_923 22155 .LBB3_197: 22156 test r8d, r8d 22157 jle .LBB3_923 22158 # %bb.198: 22159 mov r9d, r8d 22160 cmp r8d, 32 22161 jb .LBB3_199 22162 # %bb.343: 22163 lea rax, [rdx + r9] 22164 cmp rax, rcx 22165 jbe .LBB3_508 22166 # %bb.344: 22167 lea rax, [rcx + r9] 22168 cmp rax, rdx 22169 jbe .LBB3_508 22170 .LBB3_199: 22171 xor esi, esi 22172 .LBB3_878: 22173 mov rax, rsi 22174 not rax 22175 add rax, r9 22176 mov rdi, r9 22177 and rdi, 3 22178 je .LBB3_880 22179 .LBB3_879: # =>This Inner Loop Header: Depth=1 22180 cmp byte ptr [rdx + rsi], 0 22181 setne byte ptr [rcx + rsi] 22182 add rsi, 1 22183 add rdi, -1 22184 jne .LBB3_879 22185 .LBB3_880: 22186 cmp rax, 3 22187 jb .LBB3_923 22188 .LBB3_881: # =>This Inner Loop Header: Depth=1 22189 cmp byte ptr [rdx + rsi], 0 22190 setne byte ptr [rcx + rsi] 22191 cmp byte ptr [rdx + rsi + 1], 0 22192 setne byte ptr [rcx + rsi + 1] 22193 cmp byte ptr [rdx + rsi + 2], 0 22194 setne byte ptr [rcx + rsi + 2] 22195 cmp byte ptr [rdx + rsi + 3], 0 22196 setne byte ptr [rcx + rsi + 3] 22197 add rsi, 4 22198 cmp r9, rsi 22199 jne .LBB3_881 22200 jmp .LBB3_923 22201 .LBB3_200: 22202 test r8d, r8d 22203 jle .LBB3_923 22204 # %bb.201: 22205 mov r9d, r8d 22206 cmp r8d, 32 22207 jb .LBB3_202 22208 # %bb.346: 22209 lea rax, [rdx + r9] 22210 cmp rax, rcx 22211 jbe .LBB3_511 22212 # %bb.347: 22213 lea rax, [rcx + r9] 22214 cmp rax, rdx 22215 jbe .LBB3_511 22216 .LBB3_202: 22217 xor esi, esi 22218 .LBB3_596: 
22219 mov r8, rsi 22220 not r8 22221 add r8, r9 22222 mov rdi, r9 22223 and rdi, 3 22224 je .LBB3_598 22225 .LBB3_597: # =>This Inner Loop Header: Depth=1 22226 movzx eax, byte ptr [rdx + rsi] 22227 mov byte ptr [rcx + rsi], al 22228 add rsi, 1 22229 add rdi, -1 22230 jne .LBB3_597 22231 .LBB3_598: 22232 cmp r8, 3 22233 jb .LBB3_923 22234 .LBB3_599: # =>This Inner Loop Header: Depth=1 22235 movzx eax, byte ptr [rdx + rsi] 22236 mov byte ptr [rcx + rsi], al 22237 movzx eax, byte ptr [rdx + rsi + 1] 22238 mov byte ptr [rcx + rsi + 1], al 22239 movzx eax, byte ptr [rdx + rsi + 2] 22240 mov byte ptr [rcx + rsi + 2], al 22241 movzx eax, byte ptr [rdx + rsi + 3] 22242 mov byte ptr [rcx + rsi + 3], al 22243 add rsi, 4 22244 cmp r9, rsi 22245 jne .LBB3_599 22246 jmp .LBB3_923 22247 .LBB3_203: 22248 test r8d, r8d 22249 jle .LBB3_923 22250 # %bb.204: 22251 mov r9d, r8d 22252 cmp r8d, 32 22253 jb .LBB3_205 22254 # %bb.349: 22255 lea rax, [rdx + r9] 22256 cmp rax, rcx 22257 jbe .LBB3_513 22258 # %bb.350: 22259 lea rax, [rcx + r9] 22260 cmp rax, rdx 22261 jbe .LBB3_513 22262 .LBB3_205: 22263 xor esi, esi 22264 .LBB3_606: 22265 mov r8, rsi 22266 not r8 22267 add r8, r9 22268 mov rdi, r9 22269 and rdi, 3 22270 je .LBB3_608 22271 .LBB3_607: # =>This Inner Loop Header: Depth=1 22272 movzx eax, byte ptr [rdx + rsi] 22273 mov byte ptr [rcx + rsi], al 22274 add rsi, 1 22275 add rdi, -1 22276 jne .LBB3_607 22277 .LBB3_608: 22278 cmp r8, 3 22279 jb .LBB3_923 22280 .LBB3_609: # =>This Inner Loop Header: Depth=1 22281 movzx eax, byte ptr [rdx + rsi] 22282 mov byte ptr [rcx + rsi], al 22283 movzx eax, byte ptr [rdx + rsi + 1] 22284 mov byte ptr [rcx + rsi + 1], al 22285 movzx eax, byte ptr [rdx + rsi + 2] 22286 mov byte ptr [rcx + rsi + 2], al 22287 movzx eax, byte ptr [rdx + rsi + 3] 22288 mov byte ptr [rcx + rsi + 3], al 22289 add rsi, 4 22290 cmp r9, rsi 22291 jne .LBB3_609 22292 jmp .LBB3_923 22293 .LBB3_206: 22294 test r8d, r8d 22295 jle .LBB3_923 22296 # %bb.207: 22297 mov r9d, r8d 
22298 cmp r8d, 8 22299 jb .LBB3_208 22300 # %bb.352: 22301 lea rax, [rdx + 4*r9] 22302 cmp rax, rcx 22303 jbe .LBB3_515 22304 # %bb.353: 22305 lea rax, [rcx + 4*r9] 22306 cmp rax, rdx 22307 jbe .LBB3_515 22308 .LBB3_208: 22309 xor esi, esi 22310 .LBB3_886: 22311 mov r8, rsi 22312 not r8 22313 add r8, r9 22314 mov rdi, r9 22315 and rdi, 3 22316 je .LBB3_888 22317 .LBB3_887: # =>This Inner Loop Header: Depth=1 22318 xor eax, eax 22319 sub eax, dword ptr [rdx + 4*rsi] 22320 mov dword ptr [rcx + 4*rsi], eax 22321 add rsi, 1 22322 add rdi, -1 22323 jne .LBB3_887 22324 .LBB3_888: 22325 cmp r8, 3 22326 jb .LBB3_923 22327 .LBB3_889: # =>This Inner Loop Header: Depth=1 22328 xor eax, eax 22329 sub eax, dword ptr [rdx + 4*rsi] 22330 mov dword ptr [rcx + 4*rsi], eax 22331 xor eax, eax 22332 sub eax, dword ptr [rdx + 4*rsi + 4] 22333 mov dword ptr [rcx + 4*rsi + 4], eax 22334 xor eax, eax 22335 sub eax, dword ptr [rdx + 4*rsi + 8] 22336 mov dword ptr [rcx + 4*rsi + 8], eax 22337 xor eax, eax 22338 sub eax, dword ptr [rdx + 4*rsi + 12] 22339 mov dword ptr [rcx + 4*rsi + 12], eax 22340 add rsi, 4 22341 cmp r9, rsi 22342 jne .LBB3_889 22343 jmp .LBB3_923 22344 .LBB3_209: 22345 test r8d, r8d 22346 jle .LBB3_923 22347 # %bb.210: 22348 mov r9d, r8d 22349 cmp r8d, 8 22350 jb .LBB3_211 22351 # %bb.355: 22352 lea rax, [rdx + 4*r9] 22353 cmp rax, rcx 22354 jbe .LBB3_518 22355 # %bb.356: 22356 lea rax, [rcx + 4*r9] 22357 cmp rax, rdx 22358 jbe .LBB3_518 22359 .LBB3_211: 22360 xor esi, esi 22361 .LBB3_894: 22362 mov r8, rsi 22363 not r8 22364 add r8, r9 22365 mov rdi, r9 22366 and rdi, 3 22367 je .LBB3_896 22368 .LBB3_895: # =>This Inner Loop Header: Depth=1 22369 xor eax, eax 22370 sub eax, dword ptr [rdx + 4*rsi] 22371 mov dword ptr [rcx + 4*rsi], eax 22372 add rsi, 1 22373 add rdi, -1 22374 jne .LBB3_895 22375 .LBB3_896: 22376 cmp r8, 3 22377 jb .LBB3_923 22378 .LBB3_897: # =>This Inner Loop Header: Depth=1 22379 xor eax, eax 22380 sub eax, dword ptr [rdx + 4*rsi] 22381 mov dword ptr 
[rcx + 4*rsi], eax 22382 xor eax, eax 22383 sub eax, dword ptr [rdx + 4*rsi + 4] 22384 mov dword ptr [rcx + 4*rsi + 4], eax 22385 xor eax, eax 22386 sub eax, dword ptr [rdx + 4*rsi + 8] 22387 mov dword ptr [rcx + 4*rsi + 8], eax 22388 xor eax, eax 22389 sub eax, dword ptr [rdx + 4*rsi + 12] 22390 mov dword ptr [rcx + 4*rsi + 12], eax 22391 add rsi, 4 22392 cmp r9, rsi 22393 jne .LBB3_897 22394 jmp .LBB3_923 22395 .LBB3_212: 22396 test r8d, r8d 22397 jle .LBB3_923 22398 # %bb.213: 22399 mov r9d, r8d 22400 cmp r8d, 8 22401 jb .LBB3_214 22402 # %bb.358: 22403 lea rax, [rdx + 4*r9] 22404 cmp rax, rcx 22405 jbe .LBB3_521 22406 # %bb.359: 22407 lea rax, [rcx + 4*r9] 22408 cmp rax, rdx 22409 jbe .LBB3_521 22410 .LBB3_214: 22411 xor esi, esi 22412 .LBB3_902: 22413 mov rax, rsi 22414 not rax 22415 test r9b, 1 22416 je .LBB3_904 22417 # %bb.903: 22418 mov r8d, dword ptr [rdx + 4*rsi] 22419 xor r10d, r10d 22420 test r8d, r8d 22421 setne r10b 22422 neg r10d 22423 test r8d, r8d 22424 mov edi, 1 22425 cmovle edi, r10d 22426 mov dword ptr [rcx + 4*rsi], edi 22427 or rsi, 1 22428 .LBB3_904: 22429 add rax, r9 22430 je .LBB3_923 22431 # %bb.905: 22432 mov r8d, 1 22433 .LBB3_906: # =>This Inner Loop Header: Depth=1 22434 mov edi, dword ptr [rdx + 4*rsi] 22435 xor eax, eax 22436 test edi, edi 22437 setne al 22438 neg eax 22439 test edi, edi 22440 cmovg eax, r8d 22441 mov dword ptr [rcx + 4*rsi], eax 22442 mov eax, dword ptr [rdx + 4*rsi + 4] 22443 xor edi, edi 22444 test eax, eax 22445 setne dil 22446 neg edi 22447 test eax, eax 22448 cmovg edi, r8d 22449 mov dword ptr [rcx + 4*rsi + 4], edi 22450 add rsi, 2 22451 cmp r9, rsi 22452 jne .LBB3_906 22453 jmp .LBB3_923 22454 .LBB3_215: 22455 test r8d, r8d 22456 jle .LBB3_923 22457 # %bb.216: 22458 mov r9d, r8d 22459 cmp r8d, 8 22460 jb .LBB3_217 22461 # %bb.361: 22462 lea rax, [rdx + 4*r9] 22463 cmp rax, rcx 22464 jbe .LBB3_524 22465 # %bb.362: 22466 lea rax, [rcx + 4*r9] 22467 cmp rax, rdx 22468 jbe .LBB3_524 22469 .LBB3_217: 22470 xor 
esi, esi 22471 .LBB3_911: 22472 mov rax, rsi 22473 not rax 22474 test r9b, 1 22475 je .LBB3_913 22476 # %bb.912: 22477 mov r8d, dword ptr [rdx + 4*rsi] 22478 mov edi, r8d 22479 neg edi 22480 cmovl edi, r8d 22481 mov dword ptr [rcx + 4*rsi], edi 22482 or rsi, 1 22483 .LBB3_913: 22484 add rax, r9 22485 je .LBB3_923 22486 .LBB3_914: # =>This Inner Loop Header: Depth=1 22487 mov eax, dword ptr [rdx + 4*rsi] 22488 mov edi, eax 22489 neg edi 22490 cmovl edi, eax 22491 mov dword ptr [rcx + 4*rsi], edi 22492 mov eax, dword ptr [rdx + 4*rsi + 4] 22493 mov edi, eax 22494 neg edi 22495 cmovl edi, eax 22496 mov dword ptr [rcx + 4*rsi + 4], edi 22497 add rsi, 2 22498 cmp r9, rsi 22499 jne .LBB3_914 22500 jmp .LBB3_923 22501 .LBB3_218: 22502 test r8d, r8d 22503 jle .LBB3_923 22504 # %bb.219: 22505 mov r9d, r8d 22506 cmp r8d, 8 22507 jb .LBB3_220 22508 # %bb.364: 22509 lea rax, [rdx + 4*r9] 22510 cmp rax, rcx 22511 jbe .LBB3_527 22512 # %bb.365: 22513 lea rax, [rcx + 4*r9] 22514 cmp rax, rdx 22515 jbe .LBB3_527 22516 .LBB3_220: 22517 xor esi, esi 22518 .LBB3_919: 22519 mov rax, rsi 22520 not rax 22521 test r9b, 1 22522 je .LBB3_921 22523 # %bb.920: 22524 mov r8d, dword ptr [rdx + 4*rsi] 22525 mov edi, r8d 22526 neg edi 22527 cmovl edi, r8d 22528 mov dword ptr [rcx + 4*rsi], edi 22529 or rsi, 1 22530 .LBB3_921: 22531 add rax, r9 22532 je .LBB3_923 22533 .LBB3_922: # =>This Inner Loop Header: Depth=1 22534 mov eax, dword ptr [rdx + 4*rsi] 22535 mov edi, eax 22536 neg edi 22537 cmovl edi, eax 22538 mov dword ptr [rcx + 4*rsi], edi 22539 mov eax, dword ptr [rdx + 4*rsi + 4] 22540 mov edi, eax 22541 neg edi 22542 cmovl edi, eax 22543 mov dword ptr [rcx + 4*rsi + 4], edi 22544 add rsi, 2 22545 cmp r9, rsi 22546 jne .LBB3_922 22547 jmp .LBB3_923 22548 .LBB3_221: 22549 mov edx, r9d 22550 and edx, -8 22551 lea rax, [rdx - 8] 22552 mov rdi, rax 22553 shr rdi, 3 22554 add rdi, 1 22555 mov esi, edi 22556 and esi, 7 22557 cmp rax, 56 22558 jae .LBB3_367 22559 # %bb.222: 22560 xor eax, eax 
22561 jmp .LBB3_369 22562 .LBB3_265: 22563 mov edx, r9d 22564 and edx, -4 22565 lea rax, [rdx - 4] 22566 mov rdi, rax 22567 shr rdi, 2 22568 add rdi, 1 22569 mov esi, edi 22570 and esi, 7 22571 cmp rax, 28 22572 jae .LBB3_414 22573 # %bb.266: 22574 xor eax, eax 22575 jmp .LBB3_416 22576 .LBB3_279: 22577 mov edx, r9d 22578 and edx, -16 22579 lea rax, [rdx - 16] 22580 mov rdi, rax 22581 shr rdi, 4 22582 add rdi, 1 22583 mov esi, edi 22584 and esi, 7 22585 cmp rax, 112 22586 jae .LBB3_431 22587 # %bb.280: 22588 xor eax, eax 22589 jmp .LBB3_433 22590 .LBB3_338: 22591 mov edx, r9d 22592 and edx, -32 22593 lea rax, [rdx - 32] 22594 mov rdi, rax 22595 shr rdi, 5 22596 add rdi, 1 22597 mov esi, edi 22598 and esi, 7 22599 cmp rax, 224 22600 jae .LBB3_498 22601 # %bb.339: 22602 xor eax, eax 22603 jmp .LBB3_500 22604 .LBB3_374: 22605 mov esi, r9d 22606 and esi, -8 22607 lea rax, [rsi - 8] 22608 mov r8, rax 22609 shr r8, 3 22610 add r8, 1 22611 test rax, rax 22612 je .LBB3_610 22613 # %bb.375: 22614 mov rax, r8 22615 and rax, -2 22616 neg rax 22617 xor edi, edi 22618 .LBB3_376: # =>This Inner Loop Header: Depth=1 22619 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 22620 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 22621 pxor xmm2, xmm2 22622 psubd xmm2, xmm0 22623 pxor xmm0, xmm0 22624 psubd xmm0, xmm1 22625 movdqu xmmword ptr [rcx + 4*rdi], xmm2 22626 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm0 22627 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 22628 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 22629 pxor xmm2, xmm2 22630 psubd xmm2, xmm0 22631 pxor xmm0, xmm0 22632 psubd xmm0, xmm1 22633 movdqu xmmword ptr [rcx + 4*rdi + 32], xmm2 22634 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm0 22635 add rdi, 16 22636 add rax, 2 22637 jne .LBB3_376 22638 jmp .LBB3_611 22639 .LBB3_377: 22640 mov esi, r9d 22641 and esi, -8 22642 lea rax, [rsi - 8] 22643 mov r8, rax 22644 shr r8, 3 22645 add r8, 1 22646 test rax, rax 22647 je .LBB3_618 22648 # %bb.378: 22649 mov rax, r8 22650 and rax, -2 22651 neg rax 
22652 xor edi, edi 22653 pxor xmm0, xmm0 22654 movdqa xmm1, xmmword ptr [rip + .LCPI3_3] # xmm1 = [1,1,1,1] 22655 .LBB3_379: # =>This Inner Loop Header: Depth=1 22656 movdqu xmm2, xmmword ptr [rdx + 4*rdi] 22657 movdqu xmm3, xmmword ptr [rdx + 4*rdi + 16] 22658 pcmpeqd xmm2, xmm0 22659 pandn xmm2, xmm1 22660 pcmpeqd xmm3, xmm0 22661 pandn xmm3, xmm1 22662 movdqu xmmword ptr [rcx + 4*rdi], xmm2 22663 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm3 22664 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 32] 22665 movdqu xmm3, xmmword ptr [rdx + 4*rdi + 48] 22666 pcmpeqd xmm2, xmm0 22667 pandn xmm2, xmm1 22668 pcmpeqd xmm3, xmm0 22669 pandn xmm3, xmm1 22670 movdqu xmmword ptr [rcx + 4*rdi + 32], xmm2 22671 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm3 22672 add rdi, 16 22673 add rax, 2 22674 jne .LBB3_379 22675 jmp .LBB3_619 22676 .LBB3_380: 22677 mov esi, r9d 22678 and esi, -8 22679 lea rax, [rsi - 8] 22680 mov rdi, rax 22681 shr rdi, 3 22682 add rdi, 1 22683 mov r8d, edi 22684 and r8d, 3 22685 cmp rax, 24 22686 jae .LBB3_530 22687 # %bb.381: 22688 xor eax, eax 22689 jmp .LBB3_532 22690 .LBB3_382: 22691 mov esi, r9d 22692 and esi, -8 22693 lea rax, [rsi - 8] 22694 mov rdi, rax 22695 shr rdi, 3 22696 add rdi, 1 22697 mov r8d, edi 22698 and r8d, 3 22699 cmp rax, 24 22700 jae .LBB3_540 22701 # %bb.383: 22702 xor eax, eax 22703 jmp .LBB3_542 22704 .LBB3_384: 22705 mov esi, r9d 22706 and esi, -4 22707 lea rax, [rsi - 4] 22708 mov r8, rax 22709 shr r8, 2 22710 add r8, 1 22711 test rax, rax 22712 je .LBB3_626 22713 # %bb.385: 22714 mov rax, r8 22715 and rax, -2 22716 neg rax 22717 xor edi, edi 22718 movapd xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0] 22719 .LBB3_386: # =>This Inner Loop Header: Depth=1 22720 movupd xmm1, xmmword ptr [rdx + 8*rdi] 22721 movupd xmm2, xmmword ptr [rdx + 8*rdi + 16] 22722 xorpd xmm1, xmm0 22723 xorpd xmm2, xmm0 22724 movupd xmmword ptr [rcx + 8*rdi], xmm1 22725 movupd xmmword ptr [rcx + 8*rdi + 16], xmm2 22726 movupd xmm1, xmmword ptr [rdx + 
8*rdi + 32] 22727 movupd xmm2, xmmword ptr [rdx + 8*rdi + 48] 22728 xorpd xmm1, xmm0 22729 xorpd xmm2, xmm0 22730 movupd xmmword ptr [rcx + 8*rdi + 32], xmm1 22731 movupd xmmword ptr [rcx + 8*rdi + 48], xmm2 22732 add rdi, 8 22733 add rax, 2 22734 jne .LBB3_386 22735 jmp .LBB3_627 22736 .LBB3_387: 22737 mov esi, r9d 22738 and esi, -4 22739 lea rax, [rsi - 4] 22740 mov r8, rax 22741 shr r8, 2 22742 add r8, 1 22743 test rax, rax 22744 je .LBB3_636 22745 # %bb.388: 22746 mov rax, r8 22747 and rax, -2 22748 neg rax 22749 xor edi, edi 22750 movapd xmm0, xmmword ptr [rip + .LCPI3_0] # xmm0 = [-0.0E+0,-0.0E+0] 22751 .LBB3_389: # =>This Inner Loop Header: Depth=1 22752 movupd xmm1, xmmword ptr [rdx + 8*rdi] 22753 movupd xmm2, xmmword ptr [rdx + 8*rdi + 16] 22754 xorpd xmm1, xmm0 22755 xorpd xmm2, xmm0 22756 movupd xmmword ptr [rcx + 8*rdi], xmm1 22757 movupd xmmword ptr [rcx + 8*rdi + 16], xmm2 22758 movupd xmm1, xmmword ptr [rdx + 8*rdi + 32] 22759 movupd xmm2, xmmword ptr [rdx + 8*rdi + 48] 22760 xorpd xmm1, xmm0 22761 xorpd xmm2, xmm0 22762 movupd xmmword ptr [rcx + 8*rdi + 32], xmm1 22763 movupd xmmword ptr [rcx + 8*rdi + 48], xmm2 22764 add rdi, 8 22765 add rax, 2 22766 jne .LBB3_389 22767 jmp .LBB3_637 22768 .LBB3_390: 22769 mov esi, r9d 22770 and esi, -4 22771 lea rax, [rsi - 4] 22772 mov r8, rax 22773 shr r8, 2 22774 add r8, 1 22775 test rax, rax 22776 je .LBB3_646 22777 # %bb.391: 22778 mov rax, r8 22779 and rax, -2 22780 neg rax 22781 xor edi, edi 22782 xorpd xmm0, xmm0 22783 movapd xmm1, xmmword ptr [rip + .LCPI3_0] # xmm1 = [-0.0E+0,-0.0E+0] 22784 movapd xmm2, xmmword ptr [rip + .LCPI3_1] # xmm2 = [1.0E+0,1.0E+0] 22785 .LBB3_392: # =>This Inner Loop Header: Depth=1 22786 movupd xmm3, xmmword ptr [rdx + 8*rdi] 22787 movupd xmm4, xmmword ptr [rdx + 8*rdi + 16] 22788 movapd xmm5, xmm3 22789 andpd xmm5, xmm1 22790 orpd xmm5, xmm2 22791 movapd xmm6, xmm4 22792 andpd xmm6, xmm1 22793 orpd xmm6, xmm2 22794 cmpneqpd xmm3, xmm0 22795 andpd xmm3, xmm5 22796 cmpneqpd 
xmm4, xmm0 22797 andpd xmm4, xmm6 22798 movupd xmmword ptr [rcx + 8*rdi], xmm3 22799 movupd xmmword ptr [rcx + 8*rdi + 16], xmm4 22800 movupd xmm3, xmmword ptr [rdx + 8*rdi + 32] 22801 movupd xmm4, xmmword ptr [rdx + 8*rdi + 48] 22802 movapd xmm5, xmm3 22803 andpd xmm5, xmm1 22804 orpd xmm5, xmm2 22805 movapd xmm6, xmm4 22806 andpd xmm6, xmm1 22807 orpd xmm6, xmm2 22808 cmpneqpd xmm3, xmm0 22809 andpd xmm3, xmm5 22810 cmpneqpd xmm4, xmm0 22811 andpd xmm4, xmm6 22812 movupd xmmword ptr [rcx + 8*rdi + 32], xmm3 22813 movupd xmmword ptr [rcx + 8*rdi + 48], xmm4 22814 add rdi, 8 22815 add rax, 2 22816 jne .LBB3_392 22817 jmp .LBB3_647 22818 .LBB3_393: 22819 mov esi, r9d 22820 and esi, -4 22821 lea rax, [rsi - 4] 22822 mov r8, rax 22823 shr r8, 2 22824 add r8, 1 22825 test rax, rax 22826 je .LBB3_655 22827 # %bb.394: 22828 mov rax, r8 22829 and rax, -2 22830 neg rax 22831 xor edi, edi 22832 movapd xmm0, xmmword ptr [rip + .LCPI3_8] # xmm0 = [9223372036854775807,9223372036854775807] 22833 .LBB3_395: # =>This Inner Loop Header: Depth=1 22834 movupd xmm1, xmmword ptr [rdx + 8*rdi] 22835 movupd xmm2, xmmword ptr [rdx + 8*rdi + 16] 22836 andpd xmm1, xmm0 22837 andpd xmm2, xmm0 22838 movupd xmmword ptr [rcx + 8*rdi], xmm1 22839 movupd xmmword ptr [rcx + 8*rdi + 16], xmm2 22840 movupd xmm1, xmmword ptr [rdx + 8*rdi + 32] 22841 movupd xmm2, xmmword ptr [rdx + 8*rdi + 48] 22842 andpd xmm1, xmm0 22843 andpd xmm2, xmm0 22844 movupd xmmword ptr [rcx + 8*rdi + 32], xmm1 22845 movupd xmmword ptr [rcx + 8*rdi + 48], xmm2 22846 add rdi, 8 22847 add rax, 2 22848 jne .LBB3_395 22849 jmp .LBB3_656 22850 .LBB3_396: 22851 mov esi, r9d 22852 and esi, -4 22853 lea rax, [rsi - 4] 22854 mov r8, rax 22855 shr r8, 2 22856 add r8, 1 22857 test rax, rax 22858 je .LBB3_663 22859 # %bb.397: 22860 mov rax, r8 22861 and rax, -2 22862 neg rax 22863 xor edi, edi 22864 movapd xmm0, xmmword ptr [rip + .LCPI3_8] # xmm0 = [9223372036854775807,9223372036854775807] 22865 .LBB3_398: # =>This Inner Loop Header: 
Depth=1 22866 movupd xmm1, xmmword ptr [rdx + 8*rdi] 22867 movupd xmm2, xmmword ptr [rdx + 8*rdi + 16] 22868 andpd xmm1, xmm0 22869 andpd xmm2, xmm0 22870 movupd xmmword ptr [rcx + 8*rdi], xmm1 22871 movupd xmmword ptr [rcx + 8*rdi + 16], xmm2 22872 movupd xmm1, xmmword ptr [rdx + 8*rdi + 32] 22873 movupd xmm2, xmmword ptr [rdx + 8*rdi + 48] 22874 andpd xmm1, xmm0 22875 andpd xmm2, xmm0 22876 movupd xmmword ptr [rcx + 8*rdi + 32], xmm1 22877 movupd xmmword ptr [rcx + 8*rdi + 48], xmm2 22878 add rdi, 8 22879 add rax, 2 22880 jne .LBB3_398 22881 jmp .LBB3_664 22882 .LBB3_399: 22883 mov esi, r9d 22884 and esi, -32 22885 lea rax, [rsi - 32] 22886 mov r8, rax 22887 shr r8, 5 22888 add r8, 1 22889 test rax, rax 22890 je .LBB3_671 22891 # %bb.400: 22892 mov rax, r8 22893 and rax, -2 22894 neg rax 22895 xor edi, edi 22896 .LBB3_401: # =>This Inner Loop Header: Depth=1 22897 movdqu xmm0, xmmword ptr [rdx + rdi] 22898 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 22899 pxor xmm2, xmm2 22900 psubb xmm2, xmm0 22901 pxor xmm0, xmm0 22902 psubb xmm0, xmm1 22903 movdqu xmmword ptr [rcx + rdi], xmm2 22904 movdqu xmmword ptr [rcx + rdi + 16], xmm0 22905 movdqu xmm0, xmmword ptr [rdx + rdi + 32] 22906 movdqu xmm1, xmmword ptr [rdx + rdi + 48] 22907 pxor xmm2, xmm2 22908 psubb xmm2, xmm0 22909 pxor xmm0, xmm0 22910 psubb xmm0, xmm1 22911 movdqu xmmword ptr [rcx + rdi + 32], xmm2 22912 movdqu xmmword ptr [rcx + rdi + 48], xmm0 22913 add rdi, 64 22914 add rax, 2 22915 jne .LBB3_401 22916 jmp .LBB3_672 22917 .LBB3_402: 22918 mov esi, r9d 22919 and esi, -32 22920 lea rax, [rsi - 32] 22921 mov r8, rax 22922 shr r8, 5 22923 add r8, 1 22924 test rax, rax 22925 je .LBB3_679 22926 # %bb.403: 22927 mov rax, r8 22928 and rax, -2 22929 neg rax 22930 xor edi, edi 22931 .LBB3_404: # =>This Inner Loop Header: Depth=1 22932 movdqu xmm0, xmmword ptr [rdx + rdi] 22933 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 22934 pxor xmm2, xmm2 22935 psubb xmm2, xmm0 22936 pxor xmm0, xmm0 22937 psubb xmm0, xmm1 22938 
movdqu xmmword ptr [rcx + rdi], xmm2 22939 movdqu xmmword ptr [rcx + rdi + 16], xmm0 22940 movdqu xmm0, xmmword ptr [rdx + rdi + 32] 22941 movdqu xmm1, xmmword ptr [rdx + rdi + 48] 22942 pxor xmm2, xmm2 22943 psubb xmm2, xmm0 22944 pxor xmm0, xmm0 22945 psubb xmm0, xmm1 22946 movdqu xmmword ptr [rcx + rdi + 32], xmm2 22947 movdqu xmmword ptr [rcx + rdi + 48], xmm0 22948 add rdi, 64 22949 add rax, 2 22950 jne .LBB3_404 22951 jmp .LBB3_680 22952 .LBB3_405: 22953 mov esi, r9d 22954 and esi, -32 22955 lea rax, [rsi - 32] 22956 mov r8, rax 22957 shr r8, 5 22958 add r8, 1 22959 test rax, rax 22960 je .LBB3_687 22961 # %bb.406: 22962 mov rax, r8 22963 and rax, -2 22964 neg rax 22965 xor edi, edi 22966 pxor xmm2, xmm2 22967 pcmpeqd xmm3, xmm3 22968 movdqa xmm4, xmmword ptr [rip + .LCPI3_6] # xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 22969 .LBB3_407: # =>This Inner Loop Header: Depth=1 22970 movdqu xmm5, xmmword ptr [rdx + rdi] 22971 movdqu xmm6, xmmword ptr [rdx + rdi + 16] 22972 movdqa xmm0, xmm4 22973 pcmpgtb xmm0, xmm5 22974 pcmpeqb xmm5, xmm2 22975 pxor xmm5, xmm3 22976 movdqa xmm1, xmm4 22977 pcmpgtb xmm1, xmm6 22978 pcmpeqb xmm6, xmm2 22979 pxor xmm6, xmm3 22980 movdqa xmm7, xmm4 22981 pblendvb xmm7, xmm5, xmm0 22982 movdqa xmm5, xmm4 22983 movdqa xmm0, xmm1 22984 pblendvb xmm5, xmm6, xmm0 22985 movdqu xmmword ptr [rcx + rdi], xmm7 22986 movdqu xmmword ptr [rcx + rdi + 16], xmm5 22987 movdqu xmm5, xmmword ptr [rdx + rdi + 32] 22988 movdqu xmm6, xmmword ptr [rdx + rdi + 48] 22989 movdqa xmm0, xmm4 22990 pcmpgtb xmm0, xmm5 22991 pcmpeqb xmm5, xmm2 22992 pxor xmm5, xmm3 22993 movdqa xmm1, xmm4 22994 pcmpgtb xmm1, xmm6 22995 pcmpeqb xmm6, xmm2 22996 pxor xmm6, xmm3 22997 movdqa xmm7, xmm4 22998 pblendvb xmm7, xmm5, xmm0 22999 movdqa xmm5, xmm4 23000 movdqa xmm0, xmm1 23001 pblendvb xmm5, xmm6, xmm0 23002 movdqu xmmword ptr [rcx + rdi + 32], xmm7 23003 movdqu xmmword ptr [rcx + rdi + 48], xmm5 23004 add rdi, 64 23005 add rax, 2 23006 jne .LBB3_407 23007 jmp .LBB3_688 23008 
.LBB3_408: 23009 mov esi, r9d 23010 and esi, -16 23011 lea rax, [rsi - 16] 23012 mov r8, rax 23013 shr r8, 4 23014 add r8, 1 23015 test rax, rax 23016 je .LBB3_696 23017 # %bb.409: 23018 mov rax, r8 23019 and rax, -2 23020 neg rax 23021 xor edi, edi 23022 movdqa xmm8, xmmword ptr [rip + .LCPI3_10] # xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 23023 .LBB3_410: # =>This Inner Loop Header: Depth=1 23024 pmovsxbd xmm4, dword ptr [rdx + rdi + 12] 23025 pmovsxbd xmm1, dword ptr [rdx + rdi + 8] 23026 pmovsxbd xmm3, dword ptr [rdx + rdi + 4] 23027 pmovsxbd xmm2, dword ptr [rdx + rdi] 23028 movdqa xmm5, xmm2 23029 psrad xmm5, 7 23030 movdqa xmm6, xmm3 23031 psrad xmm6, 7 23032 movdqa xmm7, xmm1 23033 psrad xmm7, 7 23034 movdqa xmm0, xmm4 23035 psrad xmm0, 7 23036 paddd xmm4, xmm0 23037 paddd xmm1, xmm7 23038 paddd xmm3, xmm6 23039 paddd xmm2, xmm5 23040 pxor xmm2, xmm5 23041 pxor xmm3, xmm6 23042 pxor xmm1, xmm7 23043 pxor xmm4, xmm0 23044 pand xmm4, xmm8 23045 pand xmm1, xmm8 23046 packusdw xmm1, xmm4 23047 pand xmm3, xmm8 23048 pand xmm2, xmm8 23049 packusdw xmm2, xmm3 23050 packuswb xmm2, xmm1 23051 movdqu xmmword ptr [rcx + rdi], xmm2 23052 pmovsxbd xmm4, dword ptr [rdx + rdi + 28] 23053 pmovsxbd xmm1, dword ptr [rdx + rdi + 24] 23054 pmovsxbd xmm3, dword ptr [rdx + rdi + 20] 23055 pmovsxbd xmm2, dword ptr [rdx + rdi + 16] 23056 movdqa xmm0, xmm2 23057 psrad xmm0, 7 23058 movdqa xmm5, xmm3 23059 psrad xmm5, 7 23060 movdqa xmm6, xmm1 23061 psrad xmm6, 7 23062 movdqa xmm7, xmm4 23063 psrad xmm7, 7 23064 paddd xmm4, xmm7 23065 paddd xmm1, xmm6 23066 paddd xmm3, xmm5 23067 paddd xmm2, xmm0 23068 pxor xmm2, xmm0 23069 pxor xmm3, xmm5 23070 pxor xmm1, xmm6 23071 pxor xmm4, xmm7 23072 pand xmm4, xmm8 23073 pand xmm1, xmm8 23074 packusdw xmm1, xmm4 23075 pand xmm3, xmm8 23076 pand xmm2, xmm8 23077 packusdw xmm2, xmm3 23078 packuswb xmm2, xmm1 23079 movdqu xmmword ptr [rcx + rdi + 16], xmm2 23080 add rdi, 32 23081 add rax, 2 23082 jne .LBB3_410 23083 jmp .LBB3_697 23084 
.LBB3_411: 23085 mov esi, r9d 23086 and esi, -16 23087 lea rax, [rsi - 16] 23088 mov r8, rax 23089 shr r8, 4 23090 add r8, 1 23091 test rax, rax 23092 je .LBB3_704 23093 # %bb.412: 23094 mov rax, r8 23095 and rax, -2 23096 neg rax 23097 xor edi, edi 23098 movdqa xmm8, xmmword ptr [rip + .LCPI3_10] # xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 23099 .LBB3_413: # =>This Inner Loop Header: Depth=1 23100 pmovsxbd xmm4, dword ptr [rdx + rdi + 12] 23101 pmovsxbd xmm1, dword ptr [rdx + rdi + 8] 23102 pmovsxbd xmm3, dword ptr [rdx + rdi + 4] 23103 pmovsxbd xmm2, dword ptr [rdx + rdi] 23104 movdqa xmm5, xmm2 23105 psrad xmm5, 7 23106 movdqa xmm6, xmm3 23107 psrad xmm6, 7 23108 movdqa xmm7, xmm1 23109 psrad xmm7, 7 23110 movdqa xmm0, xmm4 23111 psrad xmm0, 7 23112 paddd xmm4, xmm0 23113 paddd xmm1, xmm7 23114 paddd xmm3, xmm6 23115 paddd xmm2, xmm5 23116 pxor xmm2, xmm5 23117 pxor xmm3, xmm6 23118 pxor xmm1, xmm7 23119 pxor xmm4, xmm0 23120 pand xmm4, xmm8 23121 pand xmm1, xmm8 23122 packusdw xmm1, xmm4 23123 pand xmm3, xmm8 23124 pand xmm2, xmm8 23125 packusdw xmm2, xmm3 23126 packuswb xmm2, xmm1 23127 movdqu xmmword ptr [rcx + rdi], xmm2 23128 pmovsxbd xmm4, dword ptr [rdx + rdi + 28] 23129 pmovsxbd xmm1, dword ptr [rdx + rdi + 24] 23130 pmovsxbd xmm3, dword ptr [rdx + rdi + 20] 23131 pmovsxbd xmm2, dword ptr [rdx + rdi + 16] 23132 movdqa xmm0, xmm2 23133 psrad xmm0, 7 23134 movdqa xmm5, xmm3 23135 psrad xmm5, 7 23136 movdqa xmm6, xmm1 23137 psrad xmm6, 7 23138 movdqa xmm7, xmm4 23139 psrad xmm7, 7 23140 paddd xmm4, xmm7 23141 paddd xmm1, xmm6 23142 paddd xmm3, xmm5 23143 paddd xmm2, xmm0 23144 pxor xmm2, xmm0 23145 pxor xmm3, xmm5 23146 pxor xmm1, xmm6 23147 pxor xmm4, xmm7 23148 pand xmm4, xmm8 23149 pand xmm1, xmm8 23150 packusdw xmm1, xmm4 23151 pand xmm3, xmm8 23152 pand xmm2, xmm8 23153 packusdw xmm2, xmm3 23154 packuswb xmm2, xmm1 23155 movdqu xmmword ptr [rcx + rdi + 16], xmm2 23156 add rdi, 32 23157 add rax, 2 23158 jne .LBB3_413 23159 jmp .LBB3_705 23160 
.LBB3_421: 23161 mov esi, r9d 23162 and esi, -4 23163 lea rax, [rsi - 4] 23164 mov r8, rax 23165 shr r8, 2 23166 add r8, 1 23167 test rax, rax 23168 je .LBB3_712 23169 # %bb.422: 23170 mov rax, r8 23171 and rax, -2 23172 neg rax 23173 xor edi, edi 23174 .LBB3_423: # =>This Inner Loop Header: Depth=1 23175 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 23176 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 23177 pxor xmm2, xmm2 23178 psubq xmm2, xmm0 23179 pxor xmm0, xmm0 23180 psubq xmm0, xmm1 23181 movdqu xmmword ptr [rcx + 8*rdi], xmm2 23182 movdqu xmmword ptr [rcx + 8*rdi + 16], xmm0 23183 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] 23184 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] 23185 pxor xmm2, xmm2 23186 psubq xmm2, xmm0 23187 pxor xmm0, xmm0 23188 psubq xmm0, xmm1 23189 movdqu xmmword ptr [rcx + 8*rdi + 32], xmm2 23190 movdqu xmmword ptr [rcx + 8*rdi + 48], xmm0 23191 add rdi, 8 23192 add rax, 2 23193 jne .LBB3_423 23194 jmp .LBB3_713 23195 .LBB3_424: 23196 mov esi, r9d 23197 and esi, -4 23198 lea rax, [rsi - 4] 23199 mov r8, rax 23200 shr r8, 2 23201 add r8, 1 23202 test rax, rax 23203 je .LBB3_720 23204 # %bb.425: 23205 mov rax, r8 23206 and rax, -2 23207 neg rax 23208 xor edi, edi 23209 pxor xmm0, xmm0 23210 movdqa xmm1, xmmword ptr [rip + .LCPI3_4] # xmm1 = [1,1] 23211 .LBB3_426: # =>This Inner Loop Header: Depth=1 23212 movdqu xmm2, xmmword ptr [rdx + 8*rdi] 23213 movdqu xmm3, xmmword ptr [rdx + 8*rdi + 16] 23214 pcmpeqq xmm2, xmm0 23215 pandn xmm2, xmm1 23216 pcmpeqq xmm3, xmm0 23217 pandn xmm3, xmm1 23218 movdqu xmmword ptr [rcx + 8*rdi], xmm2 23219 movdqu xmmword ptr [rcx + 8*rdi + 16], xmm3 23220 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 32] 23221 movdqu xmm3, xmmword ptr [rdx + 8*rdi + 48] 23222 pcmpeqq xmm2, xmm0 23223 pandn xmm2, xmm1 23224 pcmpeqq xmm3, xmm0 23225 pandn xmm3, xmm1 23226 movdqu xmmword ptr [rcx + 8*rdi + 32], xmm2 23227 movdqu xmmword ptr [rcx + 8*rdi + 48], xmm3 23228 add rdi, 8 23229 add rax, 2 23230 jne .LBB3_426 23231 jmp .LBB3_721 23232 
.LBB3_427: 23233 mov esi, r9d 23234 and esi, -4 23235 lea rax, [rsi - 4] 23236 mov rdi, rax 23237 shr rdi, 2 23238 add rdi, 1 23239 mov r8d, edi 23240 and r8d, 3 23241 cmp rax, 12 23242 jae .LBB3_550 23243 # %bb.428: 23244 xor eax, eax 23245 jmp .LBB3_552 23246 .LBB3_429: 23247 mov esi, r9d 23248 and esi, -4 23249 lea rax, [rsi - 4] 23250 mov rdi, rax 23251 shr rdi, 2 23252 add rdi, 1 23253 mov r8d, edi 23254 and r8d, 3 23255 cmp rax, 12 23256 jae .LBB3_560 23257 # %bb.430: 23258 xor eax, eax 23259 jmp .LBB3_562 23260 .LBB3_438: 23261 mov esi, r9d 23262 and esi, -16 23263 lea rax, [rsi - 16] 23264 mov r8, rax 23265 shr r8, 4 23266 add r8, 1 23267 test rax, rax 23268 je .LBB3_728 23269 # %bb.439: 23270 mov rax, r8 23271 and rax, -2 23272 neg rax 23273 xor edi, edi 23274 .LBB3_440: # =>This Inner Loop Header: Depth=1 23275 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 23276 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 23277 pxor xmm2, xmm2 23278 psubw xmm2, xmm0 23279 pxor xmm0, xmm0 23280 psubw xmm0, xmm1 23281 movdqu xmmword ptr [rcx + 2*rdi], xmm2 23282 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm0 23283 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 23284 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 23285 pxor xmm2, xmm2 23286 psubw xmm2, xmm0 23287 pxor xmm0, xmm0 23288 psubw xmm0, xmm1 23289 movdqu xmmword ptr [rcx + 2*rdi + 32], xmm2 23290 movdqu xmmword ptr [rcx + 2*rdi + 48], xmm0 23291 add rdi, 32 23292 add rax, 2 23293 jne .LBB3_440 23294 jmp .LBB3_729 23295 .LBB3_441: 23296 mov esi, r9d 23297 and esi, -16 23298 lea rax, [rsi - 16] 23299 mov r8, rax 23300 shr r8, 4 23301 add r8, 1 23302 test rax, rax 23303 je .LBB3_736 23304 # %bb.442: 23305 mov rax, r8 23306 and rax, -2 23307 neg rax 23308 xor edi, edi 23309 .LBB3_443: # =>This Inner Loop Header: Depth=1 23310 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 23311 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 23312 pxor xmm2, xmm2 23313 psubw xmm2, xmm0 23314 pxor xmm0, xmm0 23315 psubw xmm0, xmm1 23316 movdqu xmmword ptr [rcx + 
2*rdi], xmm2 23317 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm0 23318 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 23319 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 23320 pxor xmm2, xmm2 23321 psubw xmm2, xmm0 23322 pxor xmm0, xmm0 23323 psubw xmm0, xmm1 23324 movdqu xmmword ptr [rcx + 2*rdi + 32], xmm2 23325 movdqu xmmword ptr [rcx + 2*rdi + 48], xmm0 23326 add rdi, 32 23327 add rax, 2 23328 jne .LBB3_443 23329 jmp .LBB3_737 23330 .LBB3_444: 23331 mov esi, r9d 23332 and esi, -16 23333 lea rax, [rsi - 16] 23334 mov r8, rax 23335 shr r8, 4 23336 add r8, 1 23337 test rax, rax 23338 je .LBB3_744 23339 # %bb.445: 23340 mov rax, r8 23341 and rax, -2 23342 neg rax 23343 xor edi, edi 23344 .LBB3_446: # =>This Inner Loop Header: Depth=1 23345 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 23346 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 23347 pxor xmm2, xmm2 23348 psubw xmm2, xmm0 23349 pxor xmm0, xmm0 23350 psubw xmm0, xmm1 23351 movdqu xmmword ptr [rcx + 2*rdi], xmm2 23352 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm0 23353 movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] 23354 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48] 23355 pxor xmm2, xmm2 23356 psubw xmm2, xmm0 23357 pxor xmm0, xmm0 23358 psubw xmm0, xmm1 23359 movdqu xmmword ptr [rcx + 2*rdi + 32], xmm2 23360 movdqu xmmword ptr [rcx + 2*rdi + 48], xmm0 23361 add rdi, 32 23362 add rax, 2 23363 jne .LBB3_446 23364 jmp .LBB3_745 23365 .LBB3_447: 23366 mov esi, r9d 23367 and esi, -16 23368 lea rax, [rsi - 16] 23369 mov r8, rax 23370 shr r8, 4 23371 add r8, 1 23372 test rax, rax 23373 je .LBB3_752 23374 # %bb.448: 23375 mov rax, r8 23376 and rax, -2 23377 neg rax 23378 xor edi, edi 23379 pxor xmm0, xmm0 23380 movdqa xmm1, xmmword ptr [rip + .LCPI3_5] # xmm1 = [1,1,1,1,1,1,1,1] 23381 .LBB3_449: # =>This Inner Loop Header: Depth=1 23382 movdqu xmm2, xmmword ptr [rdx + 2*rdi] 23383 movdqu xmm3, xmmword ptr [rdx + 2*rdi + 16] 23384 pcmpeqw xmm2, xmm0 23385 pandn xmm2, xmm1 23386 pcmpeqw xmm3, xmm0 23387 pandn xmm3, xmm1 23388 movdqu xmmword 
ptr [rcx + 2*rdi], xmm2 23389 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm3 23390 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 32] 23391 movdqu xmm3, xmmword ptr [rdx + 2*rdi + 48] 23392 pcmpeqw xmm2, xmm0 23393 pandn xmm2, xmm1 23394 pcmpeqw xmm3, xmm0 23395 pandn xmm3, xmm1 23396 movdqu xmmword ptr [rcx + 2*rdi + 32], xmm2 23397 movdqu xmmword ptr [rcx + 2*rdi + 48], xmm3 23398 add rdi, 32 23399 add rax, 2 23400 jne .LBB3_449 23401 jmp .LBB3_753 23402 .LBB3_450: 23403 mov esi, r9d 23404 and esi, -16 23405 lea rax, [rsi - 16] 23406 mov r8, rax 23407 shr r8, 4 23408 add r8, 1 23409 test rax, rax 23410 je .LBB3_760 23411 # %bb.451: 23412 mov rax, r8 23413 and rax, -2 23414 neg rax 23415 xor edi, edi 23416 pxor xmm2, xmm2 23417 pcmpeqd xmm3, xmm3 23418 movdqa xmm4, xmmword ptr [rip + .LCPI3_5] # xmm4 = [1,1,1,1,1,1,1,1] 23419 .LBB3_452: # =>This Inner Loop Header: Depth=1 23420 movdqu xmm5, xmmword ptr [rdx + 2*rdi] 23421 movdqu xmm6, xmmword ptr [rdx + 2*rdi + 16] 23422 movdqa xmm0, xmm4 23423 pcmpgtw xmm0, xmm5 23424 pcmpeqw xmm5, xmm2 23425 pxor xmm5, xmm3 23426 movdqa xmm1, xmm4 23427 pcmpgtw xmm1, xmm6 23428 pcmpeqw xmm6, xmm2 23429 pxor xmm6, xmm3 23430 movdqa xmm7, xmm4 23431 pblendvb xmm7, xmm5, xmm0 23432 movdqa xmm5, xmm4 23433 movdqa xmm0, xmm1 23434 pblendvb xmm5, xmm6, xmm0 23435 movdqu xmmword ptr [rcx + 2*rdi], xmm7 23436 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm5 23437 movdqu xmm5, xmmword ptr [rdx + 2*rdi + 32] 23438 movdqu xmm6, xmmword ptr [rdx + 2*rdi + 48] 23439 movdqa xmm0, xmm4 23440 pcmpgtw xmm0, xmm5 23441 pcmpeqw xmm5, xmm2 23442 pxor xmm5, xmm3 23443 movdqa xmm1, xmm4 23444 pcmpgtw xmm1, xmm6 23445 pcmpeqw xmm6, xmm2 23446 pxor xmm6, xmm3 23447 movdqa xmm7, xmm4 23448 pblendvb xmm7, xmm5, xmm0 23449 movdqa xmm5, xmm4 23450 movdqa xmm0, xmm1 23451 pblendvb xmm5, xmm6, xmm0 23452 movdqu xmmword ptr [rcx + 2*rdi + 32], xmm7 23453 movdqu xmmword ptr [rcx + 2*rdi + 48], xmm5 23454 add rdi, 32 23455 add rax, 2 23456 jne .LBB3_452 23457 jmp .LBB3_761 
23458 .LBB3_453: 23459 mov esi, r9d 23460 and esi, -16 23461 lea rax, [rsi - 16] 23462 mov rdi, rax 23463 shr rdi, 4 23464 add rdi, 1 23465 mov r8d, edi 23466 and r8d, 3 23467 cmp rax, 48 23468 jae .LBB3_570 23469 # %bb.454: 23470 xor eax, eax 23471 jmp .LBB3_572 23472 .LBB3_455: 23473 mov esi, r9d 23474 and esi, -8 23475 lea rax, [rsi - 8] 23476 mov r8, rax 23477 shr r8, 3 23478 add r8, 1 23479 test rax, rax 23480 je .LBB3_769 23481 # %bb.456: 23482 mov rax, r8 23483 and rax, -2 23484 neg rax 23485 xor edi, edi 23486 pxor xmm0, xmm0 23487 .LBB3_457: # =>This Inner Loop Header: Depth=1 23488 pmovsxwd xmm1, qword ptr [rdx + 2*rdi + 8] 23489 pmovsxwd xmm2, qword ptr [rdx + 2*rdi] 23490 movdqa xmm3, xmm2 23491 psrad xmm3, 15 23492 movdqa xmm4, xmm1 23493 psrad xmm4, 15 23494 paddd xmm1, xmm4 23495 paddd xmm2, xmm3 23496 pxor xmm2, xmm3 23497 pxor xmm1, xmm4 23498 pblendw xmm1, xmm0, 170 # xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] 23499 pblendw xmm2, xmm0, 170 # xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] 23500 packusdw xmm2, xmm1 23501 movdqu xmmword ptr [rcx + 2*rdi], xmm2 23502 pmovsxwd xmm1, qword ptr [rdx + 2*rdi + 24] 23503 pmovsxwd xmm2, qword ptr [rdx + 2*rdi + 16] 23504 movdqa xmm3, xmm2 23505 psrad xmm3, 15 23506 movdqa xmm4, xmm1 23507 psrad xmm4, 15 23508 paddd xmm1, xmm4 23509 paddd xmm2, xmm3 23510 pxor xmm2, xmm3 23511 pxor xmm1, xmm4 23512 pblendw xmm1, xmm0, 170 # xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] 23513 pblendw xmm2, xmm0, 170 # xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] 23514 packusdw xmm2, xmm1 23515 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm2 23516 add rdi, 16 23517 add rax, 2 23518 jne .LBB3_457 23519 jmp .LBB3_770 23520 .LBB3_458: 23521 mov esi, r9d 23522 and esi, -16 23523 lea rax, [rsi - 16] 23524 mov rdi, rax 23525 shr rdi, 4 23526 add rdi, 1 23527 mov r8d, edi 23528 and r8d, 3 23529 cmp rax, 48 23530 jae .LBB3_580 23531 # 
%bb.459: 23532 xor eax, eax 23533 jmp .LBB3_582 23534 .LBB3_460: 23535 mov esi, r9d 23536 and esi, -8 23537 lea rax, [rsi - 8] 23538 mov r8, rax 23539 shr r8, 3 23540 add r8, 1 23541 test rax, rax 23542 je .LBB3_777 23543 # %bb.461: 23544 mov rax, r8 23545 and rax, -2 23546 neg rax 23547 xor edi, edi 23548 pxor xmm0, xmm0 23549 .LBB3_462: # =>This Inner Loop Header: Depth=1 23550 pmovsxwd xmm1, qword ptr [rdx + 2*rdi + 8] 23551 pmovsxwd xmm2, qword ptr [rdx + 2*rdi] 23552 movdqa xmm3, xmm2 23553 psrad xmm3, 15 23554 movdqa xmm4, xmm1 23555 psrad xmm4, 15 23556 paddd xmm1, xmm4 23557 paddd xmm2, xmm3 23558 pxor xmm2, xmm3 23559 pxor xmm1, xmm4 23560 pblendw xmm1, xmm0, 170 # xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] 23561 pblendw xmm2, xmm0, 170 # xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] 23562 packusdw xmm2, xmm1 23563 movdqu xmmword ptr [rcx + 2*rdi], xmm2 23564 pmovsxwd xmm1, qword ptr [rdx + 2*rdi + 24] 23565 pmovsxwd xmm2, qword ptr [rdx + 2*rdi + 16] 23566 movdqa xmm3, xmm2 23567 psrad xmm3, 15 23568 movdqa xmm4, xmm1 23569 psrad xmm4, 15 23570 paddd xmm1, xmm4 23571 paddd xmm2, xmm3 23572 pxor xmm2, xmm3 23573 pxor xmm1, xmm4 23574 pblendw xmm1, xmm0, 170 # xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] 23575 pblendw xmm2, xmm0, 170 # xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] 23576 packusdw xmm2, xmm1 23577 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm2 23578 add rdi, 16 23579 add rax, 2 23580 jne .LBB3_462 23581 jmp .LBB3_778 23582 .LBB3_463: 23583 mov esi, r9d 23584 and esi, -4 23585 lea rax, [rsi - 4] 23586 mov r8, rax 23587 shr r8, 2 23588 add r8, 1 23589 test rax, rax 23590 je .LBB3_785 23591 # %bb.464: 23592 mov rax, r8 23593 and rax, -2 23594 neg rax 23595 xor edi, edi 23596 .LBB3_465: # =>This Inner Loop Header: Depth=1 23597 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 23598 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 23599 pxor xmm2, xmm2 23600 
psubq xmm2, xmm0 23601 pxor xmm0, xmm0 23602 psubq xmm0, xmm1 23603 movdqu xmmword ptr [rcx + 8*rdi], xmm2 23604 movdqu xmmword ptr [rcx + 8*rdi + 16], xmm0 23605 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] 23606 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] 23607 pxor xmm2, xmm2 23608 psubq xmm2, xmm0 23609 pxor xmm0, xmm0 23610 psubq xmm0, xmm1 23611 movdqu xmmword ptr [rcx + 8*rdi + 32], xmm2 23612 movdqu xmmword ptr [rcx + 8*rdi + 48], xmm0 23613 add rdi, 8 23614 add rax, 2 23615 jne .LBB3_465 23616 jmp .LBB3_786 23617 .LBB3_466: 23618 mov esi, r9d 23619 and esi, -8 23620 lea rax, [rsi - 8] 23621 mov r8, rax 23622 shr r8, 3 23623 add r8, 1 23624 test rax, rax 23625 je .LBB3_793 23626 # %bb.467: 23627 mov rax, r8 23628 and rax, -2 23629 neg rax 23630 xor edi, edi 23631 movapd xmm0, xmmword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 23632 .LBB3_468: # =>This Inner Loop Header: Depth=1 23633 movupd xmm1, xmmword ptr [rdx + 4*rdi] 23634 movupd xmm2, xmmword ptr [rdx + 4*rdi + 16] 23635 xorpd xmm1, xmm0 23636 xorpd xmm2, xmm0 23637 movupd xmmword ptr [rcx + 4*rdi], xmm1 23638 movupd xmmword ptr [rcx + 4*rdi + 16], xmm2 23639 movupd xmm1, xmmword ptr [rdx + 4*rdi + 32] 23640 movupd xmm2, xmmword ptr [rdx + 4*rdi + 48] 23641 xorpd xmm1, xmm0 23642 xorpd xmm2, xmm0 23643 movupd xmmword ptr [rcx + 4*rdi + 32], xmm1 23644 movupd xmmword ptr [rcx + 4*rdi + 48], xmm2 23645 add rdi, 16 23646 add rax, 2 23647 jne .LBB3_468 23648 jmp .LBB3_794 23649 .LBB3_469: 23650 mov esi, r9d 23651 and esi, -4 23652 lea rax, [rsi - 4] 23653 mov r8, rax 23654 shr r8, 2 23655 add r8, 1 23656 test rax, rax 23657 je .LBB3_803 23658 # %bb.470: 23659 mov rax, r8 23660 and rax, -2 23661 neg rax 23662 xor edi, edi 23663 .LBB3_471: # =>This Inner Loop Header: Depth=1 23664 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 23665 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 23666 pxor xmm2, xmm2 23667 psubq xmm2, xmm0 23668 pxor xmm0, xmm0 23669 psubq xmm0, xmm1 23670 movdqu xmmword ptr [rcx + 
8*rdi], xmm2 23671 movdqu xmmword ptr [rcx + 8*rdi + 16], xmm0 23672 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] 23673 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] 23674 pxor xmm2, xmm2 23675 psubq xmm2, xmm0 23676 pxor xmm0, xmm0 23677 psubq xmm0, xmm1 23678 movdqu xmmword ptr [rcx + 8*rdi + 32], xmm2 23679 movdqu xmmword ptr [rcx + 8*rdi + 48], xmm0 23680 add rdi, 8 23681 add rax, 2 23682 jne .LBB3_471 23683 jmp .LBB3_804 23684 .LBB3_472: 23685 mov esi, r9d 23686 and esi, -8 23687 lea rax, [rsi - 8] 23688 mov r8, rax 23689 shr r8, 3 23690 add r8, 1 23691 test rax, rax 23692 je .LBB3_811 23693 # %bb.473: 23694 mov rax, r8 23695 and rax, -2 23696 neg rax 23697 xor edi, edi 23698 movapd xmm0, xmmword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 23699 .LBB3_474: # =>This Inner Loop Header: Depth=1 23700 movupd xmm1, xmmword ptr [rdx + 4*rdi] 23701 movupd xmm2, xmmword ptr [rdx + 4*rdi + 16] 23702 xorpd xmm1, xmm0 23703 xorpd xmm2, xmm0 23704 movupd xmmword ptr [rcx + 4*rdi], xmm1 23705 movupd xmmword ptr [rcx + 4*rdi + 16], xmm2 23706 movupd xmm1, xmmword ptr [rdx + 4*rdi + 32] 23707 movupd xmm2, xmmword ptr [rdx + 4*rdi + 48] 23708 xorpd xmm1, xmm0 23709 xorpd xmm2, xmm0 23710 movupd xmmword ptr [rcx + 4*rdi + 32], xmm1 23711 movupd xmmword ptr [rcx + 4*rdi + 48], xmm2 23712 add rdi, 16 23713 add rax, 2 23714 jne .LBB3_474 23715 jmp .LBB3_812 23716 .LBB3_475: 23717 mov esi, r9d 23718 and esi, -4 23719 lea rax, [rsi - 4] 23720 mov r8, rax 23721 shr r8, 2 23722 add r8, 1 23723 test rax, rax 23724 je .LBB3_821 23725 # %bb.476: 23726 mov rax, r8 23727 and rax, -2 23728 neg rax 23729 xor edi, edi 23730 pxor xmm2, xmm2 23731 pcmpeqd xmm3, xmm3 23732 movdqa xmm4, xmmword ptr [rip + .LCPI3_4] # xmm4 = [1,1] 23733 .LBB3_477: # =>This Inner Loop Header: Depth=1 23734 movdqu xmm5, xmmword ptr [rdx + 8*rdi] 23735 movdqu xmm6, xmmword ptr [rdx + 8*rdi + 16] 23736 movdqa xmm0, xmm4 23737 pcmpgtq xmm0, xmm5 23738 pcmpeqq xmm5, xmm2 23739 pxor xmm5, xmm3 23740 
movdqa xmm1, xmm4 23741 pcmpgtq xmm1, xmm6 23742 pcmpeqq xmm6, xmm2 23743 pxor xmm6, xmm3 23744 movdqa xmm7, xmm4 23745 blendvpd xmm7, xmm5, xmm0 23746 movdqa xmm5, xmm4 23747 movdqa xmm0, xmm1 23748 blendvpd xmm5, xmm6, xmm0 23749 movupd xmmword ptr [rcx + 8*rdi], xmm7 23750 movupd xmmword ptr [rcx + 8*rdi + 16], xmm5 23751 movdqu xmm5, xmmword ptr [rdx + 8*rdi + 32] 23752 movdqu xmm6, xmmword ptr [rdx + 8*rdi + 48] 23753 movdqa xmm0, xmm4 23754 pcmpgtq xmm0, xmm5 23755 pcmpeqq xmm5, xmm2 23756 pxor xmm5, xmm3 23757 movdqa xmm1, xmm4 23758 pcmpgtq xmm1, xmm6 23759 pcmpeqq xmm6, xmm2 23760 pxor xmm6, xmm3 23761 movdqa xmm7, xmm4 23762 blendvpd xmm7, xmm5, xmm0 23763 movdqa xmm5, xmm4 23764 movdqa xmm0, xmm1 23765 blendvpd xmm5, xmm6, xmm0 23766 movupd xmmword ptr [rcx + 8*rdi + 32], xmm7 23767 movupd xmmword ptr [rcx + 8*rdi + 48], xmm5 23768 add rdi, 8 23769 add rax, 2 23770 jne .LBB3_477 23771 jmp .LBB3_822 23772 .LBB3_478: 23773 mov esi, eax 23774 and esi, -8 23775 xor edi, edi 23776 xorps xmm0, xmm0 23777 movdqa xmm1, xmmword ptr [rip + .LCPI3_3] # xmm1 = [1,1,1,1] 23778 .LBB3_479: # =>This Inner Loop Header: Depth=1 23779 movdqu xmm2, xmmword ptr [rdx + 4*rdi] 23780 movdqu xmm3, xmmword ptr [rdx + 4*rdi + 16] 23781 movdqa xmm4, xmm2 23782 psrad xmm4, 31 23783 por xmm4, xmm1 23784 movdqa xmm5, xmm3 23785 psrad xmm5, 31 23786 por xmm5, xmm1 23787 cvtdq2ps xmm4, xmm4 23788 cvtdq2ps xmm5, xmm5 23789 cmpneqps xmm2, xmm0 23790 andps xmm2, xmm4 23791 cmpneqps xmm3, xmm0 23792 andps xmm3, xmm5 23793 movups xmmword ptr [rcx + 4*rdi], xmm2 23794 movups xmmword ptr [rcx + 4*rdi + 16], xmm3 23795 add rdi, 8 23796 cmp rsi, rdi 23797 jne .LBB3_479 23798 # %bb.480: 23799 cmp rsi, rax 23800 je .LBB3_923 23801 jmp .LBB3_481 23802 .LBB3_486: 23803 mov esi, r9d 23804 and esi, -4 23805 lea rax, [rsi - 4] 23806 mov r8, rax 23807 shr r8, 2 23808 add r8, 1 23809 test rax, rax 23810 je .LBB3_830 23811 # %bb.487: 23812 mov rax, r8 23813 and rax, -2 23814 neg rax 23815 xor edi, edi 
23816 .LBB3_488: # =>This Inner Loop Header: Depth=1 23817 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 23818 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 23819 pxor xmm3, xmm3 23820 psubq xmm3, xmm1 23821 movdqa xmm0, xmm1 23822 blendvpd xmm1, xmm3, xmm0 23823 pxor xmm3, xmm3 23824 psubq xmm3, xmm2 23825 movdqa xmm0, xmm2 23826 blendvpd xmm2, xmm3, xmm0 23827 movupd xmmword ptr [rcx + 8*rdi], xmm1 23828 movupd xmmword ptr [rcx + 8*rdi + 16], xmm2 23829 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] 23830 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] 23831 pxor xmm3, xmm3 23832 psubq xmm3, xmm1 23833 movdqa xmm0, xmm1 23834 blendvpd xmm1, xmm3, xmm0 23835 pxor xmm3, xmm3 23836 psubq xmm3, xmm2 23837 movdqa xmm0, xmm2 23838 blendvpd xmm2, xmm3, xmm0 23839 movupd xmmword ptr [rcx + 8*rdi + 32], xmm1 23840 movupd xmmword ptr [rcx + 8*rdi + 48], xmm2 23841 add rdi, 8 23842 add rax, 2 23843 jne .LBB3_488 23844 jmp .LBB3_831 23845 .LBB3_489: 23846 mov esi, r9d 23847 and esi, -8 23848 lea rax, [rsi - 8] 23849 mov r8, rax 23850 shr r8, 3 23851 add r8, 1 23852 test rax, rax 23853 je .LBB3_838 23854 # %bb.490: 23855 mov rax, r8 23856 and rax, -2 23857 neg rax 23858 xor edi, edi 23859 movapd xmm0, xmmword ptr [rip + .LCPI3_9] # xmm0 = [2147483647,2147483647,2147483647,2147483647] 23860 .LBB3_491: # =>This Inner Loop Header: Depth=1 23861 movupd xmm1, xmmword ptr [rdx + 4*rdi] 23862 movupd xmm2, xmmword ptr [rdx + 4*rdi + 16] 23863 andpd xmm1, xmm0 23864 andpd xmm2, xmm0 23865 movupd xmmword ptr [rcx + 4*rdi], xmm1 23866 movupd xmmword ptr [rcx + 4*rdi + 16], xmm2 23867 movupd xmm1, xmmword ptr [rdx + 4*rdi + 32] 23868 movupd xmm2, xmmword ptr [rdx + 4*rdi + 48] 23869 andpd xmm1, xmm0 23870 andpd xmm2, xmm0 23871 movupd xmmword ptr [rcx + 4*rdi + 32], xmm1 23872 movupd xmmword ptr [rcx + 4*rdi + 48], xmm2 23873 add rdi, 16 23874 add rax, 2 23875 jne .LBB3_491 23876 jmp .LBB3_839 23877 .LBB3_492: 23878 mov esi, r9d 23879 and esi, -4 23880 lea rax, [rsi - 4] 23881 mov r8, rax 23882 shr r8, 
2 23883 add r8, 1 23884 test rax, rax 23885 je .LBB3_848 23886 # %bb.493: 23887 mov rax, r8 23888 and rax, -2 23889 neg rax 23890 xor edi, edi 23891 .LBB3_494: # =>This Inner Loop Header: Depth=1 23892 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 23893 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 23894 pxor xmm3, xmm3 23895 psubq xmm3, xmm1 23896 movdqa xmm0, xmm1 23897 blendvpd xmm1, xmm3, xmm0 23898 pxor xmm3, xmm3 23899 psubq xmm3, xmm2 23900 movdqa xmm0, xmm2 23901 blendvpd xmm2, xmm3, xmm0 23902 movupd xmmword ptr [rcx + 8*rdi], xmm1 23903 movupd xmmword ptr [rcx + 8*rdi + 16], xmm2 23904 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] 23905 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] 23906 pxor xmm3, xmm3 23907 psubq xmm3, xmm1 23908 movdqa xmm0, xmm1 23909 blendvpd xmm1, xmm3, xmm0 23910 pxor xmm3, xmm3 23911 psubq xmm3, xmm2 23912 movdqa xmm0, xmm2 23913 blendvpd xmm2, xmm3, xmm0 23914 movupd xmmword ptr [rcx + 8*rdi + 32], xmm1 23915 movupd xmmword ptr [rcx + 8*rdi + 48], xmm2 23916 add rdi, 8 23917 add rax, 2 23918 jne .LBB3_494 23919 jmp .LBB3_849 23920 .LBB3_495: 23921 mov esi, r9d 23922 and esi, -8 23923 lea rax, [rsi - 8] 23924 mov r8, rax 23925 shr r8, 3 23926 add r8, 1 23927 test rax, rax 23928 je .LBB3_856 23929 # %bb.496: 23930 mov rax, r8 23931 and rax, -2 23932 neg rax 23933 xor edi, edi 23934 movapd xmm0, xmmword ptr [rip + .LCPI3_9] # xmm0 = [2147483647,2147483647,2147483647,2147483647] 23935 .LBB3_497: # =>This Inner Loop Header: Depth=1 23936 movupd xmm1, xmmword ptr [rdx + 4*rdi] 23937 movupd xmm2, xmmword ptr [rdx + 4*rdi + 16] 23938 andpd xmm1, xmm0 23939 andpd xmm2, xmm0 23940 movupd xmmword ptr [rcx + 4*rdi], xmm1 23941 movupd xmmword ptr [rcx + 4*rdi + 16], xmm2 23942 movupd xmm1, xmmword ptr [rdx + 4*rdi + 32] 23943 movupd xmm2, xmmword ptr [rdx + 4*rdi + 48] 23944 andpd xmm1, xmm0 23945 andpd xmm2, xmm0 23946 movupd xmmword ptr [rcx + 4*rdi + 32], xmm1 23947 movupd xmmword ptr [rcx + 4*rdi + 48], xmm2 23948 add rdi, 16 23949 add rax, 2 23950 
jne .LBB3_497 23951 jmp .LBB3_857 23952 .LBB3_505: 23953 mov esi, r9d 23954 and esi, -32 23955 lea rax, [rsi - 32] 23956 mov r8, rax 23957 shr r8, 5 23958 add r8, 1 23959 test rax, rax 23960 je .LBB3_866 23961 # %bb.506: 23962 mov rax, r8 23963 and rax, -2 23964 neg rax 23965 xor edi, edi 23966 .LBB3_507: # =>This Inner Loop Header: Depth=1 23967 movdqu xmm0, xmmword ptr [rdx + rdi] 23968 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 23969 pxor xmm2, xmm2 23970 psubb xmm2, xmm0 23971 pxor xmm0, xmm0 23972 psubb xmm0, xmm1 23973 movdqu xmmword ptr [rcx + rdi], xmm2 23974 movdqu xmmword ptr [rcx + rdi + 16], xmm0 23975 movdqu xmm0, xmmword ptr [rdx + rdi + 32] 23976 movdqu xmm1, xmmword ptr [rdx + rdi + 48] 23977 pxor xmm2, xmm2 23978 psubb xmm2, xmm0 23979 pxor xmm0, xmm0 23980 psubb xmm0, xmm1 23981 movdqu xmmword ptr [rcx + rdi + 32], xmm2 23982 movdqu xmmword ptr [rcx + rdi + 48], xmm0 23983 add rdi, 64 23984 add rax, 2 23985 jne .LBB3_507 23986 jmp .LBB3_867 23987 .LBB3_508: 23988 mov esi, r9d 23989 and esi, -32 23990 lea rax, [rsi - 32] 23991 mov r8, rax 23992 shr r8, 5 23993 add r8, 1 23994 test rax, rax 23995 je .LBB3_874 23996 # %bb.509: 23997 mov rax, r8 23998 and rax, -2 23999 neg rax 24000 xor edi, edi 24001 pxor xmm0, xmm0 24002 movdqa xmm1, xmmword ptr [rip + .LCPI3_6] # xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 24003 .LBB3_510: # =>This Inner Loop Header: Depth=1 24004 movdqu xmm2, xmmword ptr [rdx + rdi] 24005 movdqu xmm3, xmmword ptr [rdx + rdi + 16] 24006 pcmpeqb xmm2, xmm0 24007 pandn xmm2, xmm1 24008 pcmpeqb xmm3, xmm0 24009 pandn xmm3, xmm1 24010 movdqu xmmword ptr [rcx + rdi], xmm2 24011 movdqu xmmword ptr [rcx + rdi + 16], xmm3 24012 movdqu xmm2, xmmword ptr [rdx + rdi + 32] 24013 movdqu xmm3, xmmword ptr [rdx + rdi + 48] 24014 pcmpeqb xmm2, xmm0 24015 pandn xmm2, xmm1 24016 pcmpeqb xmm3, xmm0 24017 pandn xmm3, xmm1 24018 movdqu xmmword ptr [rcx + rdi + 32], xmm2 24019 movdqu xmmword ptr [rcx + rdi + 48], xmm3 24020 add rdi, 64 24021 add rax, 2 
24022 jne .LBB3_510 24023 jmp .LBB3_875 24024 .LBB3_511: 24025 mov esi, r9d 24026 and esi, -32 24027 lea rax, [rsi - 32] 24028 mov rdi, rax 24029 shr rdi, 5 24030 add rdi, 1 24031 mov r8d, edi 24032 and r8d, 3 24033 cmp rax, 96 24034 jae .LBB3_590 24035 # %bb.512: 24036 xor eax, eax 24037 jmp .LBB3_592 24038 .LBB3_513: 24039 mov esi, r9d 24040 and esi, -32 24041 lea rax, [rsi - 32] 24042 mov rdi, rax 24043 shr rdi, 5 24044 add rdi, 1 24045 mov r8d, edi 24046 and r8d, 3 24047 cmp rax, 96 24048 jae .LBB3_600 24049 # %bb.514: 24050 xor eax, eax 24051 jmp .LBB3_602 24052 .LBB3_515: 24053 mov esi, r9d 24054 and esi, -8 24055 lea rax, [rsi - 8] 24056 mov r8, rax 24057 shr r8, 3 24058 add r8, 1 24059 test rax, rax 24060 je .LBB3_882 24061 # %bb.516: 24062 mov rax, r8 24063 and rax, -2 24064 neg rax 24065 xor edi, edi 24066 .LBB3_517: # =>This Inner Loop Header: Depth=1 24067 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 24068 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 24069 pxor xmm2, xmm2 24070 psubd xmm2, xmm0 24071 pxor xmm0, xmm0 24072 psubd xmm0, xmm1 24073 movdqu xmmword ptr [rcx + 4*rdi], xmm2 24074 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm0 24075 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 24076 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 24077 pxor xmm2, xmm2 24078 psubd xmm2, xmm0 24079 pxor xmm0, xmm0 24080 psubd xmm0, xmm1 24081 movdqu xmmword ptr [rcx + 4*rdi + 32], xmm2 24082 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm0 24083 add rdi, 16 24084 add rax, 2 24085 jne .LBB3_517 24086 jmp .LBB3_883 24087 .LBB3_518: 24088 mov esi, r9d 24089 and esi, -8 24090 lea rax, [rsi - 8] 24091 mov r8, rax 24092 shr r8, 3 24093 add r8, 1 24094 test rax, rax 24095 je .LBB3_890 24096 # %bb.519: 24097 mov rax, r8 24098 and rax, -2 24099 neg rax 24100 xor edi, edi 24101 .LBB3_520: # =>This Inner Loop Header: Depth=1 24102 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 24103 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 24104 pxor xmm2, xmm2 24105 psubd xmm2, xmm0 24106 pxor xmm0, xmm0 24107 
psubd xmm0, xmm1 24108 movdqu xmmword ptr [rcx + 4*rdi], xmm2 24109 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm0 24110 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 24111 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 24112 pxor xmm2, xmm2 24113 psubd xmm2, xmm0 24114 pxor xmm0, xmm0 24115 psubd xmm0, xmm1 24116 movdqu xmmword ptr [rcx + 4*rdi + 32], xmm2 24117 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm0 24118 add rdi, 16 24119 add rax, 2 24120 jne .LBB3_520 24121 jmp .LBB3_891 24122 .LBB3_521: 24123 mov esi, r9d 24124 and esi, -8 24125 lea rax, [rsi - 8] 24126 mov r8, rax 24127 shr r8, 3 24128 add r8, 1 24129 test rax, rax 24130 je .LBB3_898 24131 # %bb.522: 24132 mov rax, r8 24133 and rax, -2 24134 neg rax 24135 xor edi, edi 24136 pxor xmm2, xmm2 24137 pcmpeqd xmm3, xmm3 24138 movdqa xmm4, xmmword ptr [rip + .LCPI3_3] # xmm4 = [1,1,1,1] 24139 .LBB3_523: # =>This Inner Loop Header: Depth=1 24140 movdqu xmm5, xmmword ptr [rdx + 4*rdi] 24141 movdqu xmm6, xmmword ptr [rdx + 4*rdi + 16] 24142 movdqa xmm0, xmm4 24143 pcmpgtd xmm0, xmm5 24144 pcmpeqd xmm5, xmm2 24145 pxor xmm5, xmm3 24146 movdqa xmm1, xmm4 24147 pcmpgtd xmm1, xmm6 24148 pcmpeqd xmm6, xmm2 24149 pxor xmm6, xmm3 24150 movdqa xmm7, xmm4 24151 blendvps xmm7, xmm5, xmm0 24152 movdqa xmm5, xmm4 24153 movdqa xmm0, xmm1 24154 blendvps xmm5, xmm6, xmm0 24155 movups xmmword ptr [rcx + 4*rdi], xmm7 24156 movups xmmword ptr [rcx + 4*rdi + 16], xmm5 24157 movdqu xmm5, xmmword ptr [rdx + 4*rdi + 32] 24158 movdqu xmm6, xmmword ptr [rdx + 4*rdi + 48] 24159 movdqa xmm0, xmm4 24160 pcmpgtd xmm0, xmm5 24161 pcmpeqd xmm5, xmm2 24162 pxor xmm5, xmm3 24163 movdqa xmm1, xmm4 24164 pcmpgtd xmm1, xmm6 24165 pcmpeqd xmm6, xmm2 24166 pxor xmm6, xmm3 24167 movdqa xmm7, xmm4 24168 blendvps xmm7, xmm5, xmm0 24169 movdqa xmm5, xmm4 24170 movdqa xmm0, xmm1 24171 blendvps xmm5, xmm6, xmm0 24172 movups xmmword ptr [rcx + 4*rdi + 32], xmm7 24173 movups xmmword ptr [rcx + 4*rdi + 48], xmm5 24174 add rdi, 16 24175 add rax, 2 24176 jne .LBB3_523 
24177 jmp .LBB3_899 24178 .LBB3_524: 24179 mov esi, r9d 24180 and esi, -8 24181 lea rax, [rsi - 8] 24182 mov r8, rax 24183 shr r8, 3 24184 add r8, 1 24185 test rax, rax 24186 je .LBB3_907 24187 # %bb.525: 24188 mov rax, r8 24189 and rax, -2 24190 neg rax 24191 xor edi, edi 24192 .LBB3_526: # =>This Inner Loop Header: Depth=1 24193 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 24194 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 24195 pabsd xmm0, xmm0 24196 pabsd xmm1, xmm1 24197 movdqu xmmword ptr [rcx + 4*rdi], xmm0 24198 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 24199 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 24200 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 24201 pabsd xmm0, xmm0 24202 pabsd xmm1, xmm1 24203 movdqu xmmword ptr [rcx + 4*rdi + 32], xmm0 24204 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm1 24205 add rdi, 16 24206 add rax, 2 24207 jne .LBB3_526 24208 jmp .LBB3_908 24209 .LBB3_527: 24210 mov esi, r9d 24211 and esi, -8 24212 lea rax, [rsi - 8] 24213 mov r8, rax 24214 shr r8, 3 24215 add r8, 1 24216 test rax, rax 24217 je .LBB3_915 24218 # %bb.528: 24219 mov rax, r8 24220 and rax, -2 24221 neg rax 24222 xor edi, edi 24223 .LBB3_529: # =>This Inner Loop Header: Depth=1 24224 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 24225 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 24226 pabsd xmm0, xmm0 24227 pabsd xmm1, xmm1 24228 movdqu xmmword ptr [rcx + 4*rdi], xmm0 24229 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 24230 movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] 24231 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48] 24232 pabsd xmm0, xmm0 24233 pabsd xmm1, xmm1 24234 movdqu xmmword ptr [rcx + 4*rdi + 32], xmm0 24235 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm1 24236 add rdi, 16 24237 add rax, 2 24238 jne .LBB3_529 24239 jmp .LBB3_916 24240 .LBB3_367: 24241 and rdi, -8 24242 neg rdi 24243 xor eax, eax 24244 xorpd xmm0, xmm0 24245 .LBB3_368: # =>This Inner Loop Header: Depth=1 24246 movupd xmmword ptr [rcx + 4*rax], xmm0 24247 movupd xmmword ptr [rcx + 4*rax + 16], xmm0 24248 movupd 
xmmword ptr [rcx + 4*rax + 32], xmm0 24249 movupd xmmword ptr [rcx + 4*rax + 48], xmm0 24250 movupd xmmword ptr [rcx + 4*rax + 64], xmm0 24251 movupd xmmword ptr [rcx + 4*rax + 80], xmm0 24252 movupd xmmword ptr [rcx + 4*rax + 96], xmm0 24253 movupd xmmword ptr [rcx + 4*rax + 112], xmm0 24254 movupd xmmword ptr [rcx + 4*rax + 128], xmm0 24255 movupd xmmword ptr [rcx + 4*rax + 144], xmm0 24256 movupd xmmword ptr [rcx + 4*rax + 160], xmm0 24257 movupd xmmword ptr [rcx + 4*rax + 176], xmm0 24258 movupd xmmword ptr [rcx + 4*rax + 192], xmm0 24259 movupd xmmword ptr [rcx + 4*rax + 208], xmm0 24260 movupd xmmword ptr [rcx + 4*rax + 224], xmm0 24261 movupd xmmword ptr [rcx + 4*rax + 240], xmm0 24262 add rax, 64 24263 add rdi, 8 24264 jne .LBB3_368 24265 .LBB3_369: 24266 test rsi, rsi 24267 je .LBB3_372 24268 # %bb.370: 24269 lea rax, [rcx + 4*rax] 24270 add rax, 16 24271 neg rsi 24272 xorpd xmm0, xmm0 24273 .LBB3_371: # =>This Inner Loop Header: Depth=1 24274 movupd xmmword ptr [rax - 16], xmm0 24275 movupd xmmword ptr [rax], xmm0 24276 add rax, 32 24277 inc rsi 24278 jne .LBB3_371 24279 .LBB3_372: 24280 cmp rdx, r9 24281 je .LBB3_923 24282 .p2align 4, 0x90 24283 .LBB3_373: # =>This Inner Loop Header: Depth=1 24284 mov dword ptr [rcx + 4*rdx], 0 24285 add rdx, 1 24286 cmp r9, rdx 24287 jne .LBB3_373 24288 jmp .LBB3_923 24289 .LBB3_414: 24290 and rdi, -8 24291 neg rdi 24292 xor eax, eax 24293 xorpd xmm0, xmm0 24294 .LBB3_415: # =>This Inner Loop Header: Depth=1 24295 movupd xmmword ptr [rcx + 8*rax], xmm0 24296 movupd xmmword ptr [rcx + 8*rax + 16], xmm0 24297 movupd xmmword ptr [rcx + 8*rax + 32], xmm0 24298 movupd xmmword ptr [rcx + 8*rax + 48], xmm0 24299 movupd xmmword ptr [rcx + 8*rax + 64], xmm0 24300 movupd xmmword ptr [rcx + 8*rax + 80], xmm0 24301 movupd xmmword ptr [rcx + 8*rax + 96], xmm0 24302 movupd xmmword ptr [rcx + 8*rax + 112], xmm0 24303 movupd xmmword ptr [rcx + 8*rax + 128], xmm0 24304 movupd xmmword ptr [rcx + 8*rax + 144], xmm0 24305 movupd xmmword 
ptr [rcx + 8*rax + 160], xmm0 24306 movupd xmmword ptr [rcx + 8*rax + 176], xmm0 24307 movupd xmmword ptr [rcx + 8*rax + 192], xmm0 24308 movupd xmmword ptr [rcx + 8*rax + 208], xmm0 24309 movupd xmmword ptr [rcx + 8*rax + 224], xmm0 24310 movupd xmmword ptr [rcx + 8*rax + 240], xmm0 24311 add rax, 32 24312 add rdi, 8 24313 jne .LBB3_415 24314 .LBB3_416: 24315 test rsi, rsi 24316 je .LBB3_419 24317 # %bb.417: 24318 lea rax, [rcx + 8*rax] 24319 add rax, 16 24320 neg rsi 24321 xorpd xmm0, xmm0 24322 .LBB3_418: # =>This Inner Loop Header: Depth=1 24323 movupd xmmword ptr [rax - 16], xmm0 24324 movupd xmmword ptr [rax], xmm0 24325 add rax, 32 24326 inc rsi 24327 jne .LBB3_418 24328 .LBB3_419: 24329 cmp rdx, r9 24330 je .LBB3_923 24331 .p2align 4, 0x90 24332 .LBB3_420: # =>This Inner Loop Header: Depth=1 24333 mov qword ptr [rcx + 8*rdx], 0 24334 add rdx, 1 24335 cmp r9, rdx 24336 jne .LBB3_420 24337 jmp .LBB3_923 24338 .LBB3_431: 24339 and rdi, -8 24340 neg rdi 24341 xor eax, eax 24342 xorpd xmm0, xmm0 24343 .LBB3_432: # =>This Inner Loop Header: Depth=1 24344 movupd xmmword ptr [rcx + 2*rax], xmm0 24345 movupd xmmword ptr [rcx + 2*rax + 16], xmm0 24346 movupd xmmword ptr [rcx + 2*rax + 32], xmm0 24347 movupd xmmword ptr [rcx + 2*rax + 48], xmm0 24348 movupd xmmword ptr [rcx + 2*rax + 64], xmm0 24349 movupd xmmword ptr [rcx + 2*rax + 80], xmm0 24350 movupd xmmword ptr [rcx + 2*rax + 96], xmm0 24351 movupd xmmword ptr [rcx + 2*rax + 112], xmm0 24352 movupd xmmword ptr [rcx + 2*rax + 128], xmm0 24353 movupd xmmword ptr [rcx + 2*rax + 144], xmm0 24354 movupd xmmword ptr [rcx + 2*rax + 160], xmm0 24355 movupd xmmword ptr [rcx + 2*rax + 176], xmm0 24356 movupd xmmword ptr [rcx + 2*rax + 192], xmm0 24357 movupd xmmword ptr [rcx + 2*rax + 208], xmm0 24358 movupd xmmword ptr [rcx + 2*rax + 224], xmm0 24359 movupd xmmword ptr [rcx + 2*rax + 240], xmm0 24360 sub rax, -128 24361 add rdi, 8 24362 jne .LBB3_432 24363 .LBB3_433: 24364 test rsi, rsi 24365 je .LBB3_436 24366 # 
%bb.434: 24367 lea rax, [rcx + 2*rax] 24368 add rax, 16 24369 neg rsi 24370 xorpd xmm0, xmm0 24371 .LBB3_435: # =>This Inner Loop Header: Depth=1 24372 movupd xmmword ptr [rax - 16], xmm0 24373 movupd xmmword ptr [rax], xmm0 24374 add rax, 32 24375 inc rsi 24376 jne .LBB3_435 24377 .LBB3_436: 24378 cmp rdx, r9 24379 je .LBB3_923 24380 .p2align 4, 0x90 24381 .LBB3_437: # =>This Inner Loop Header: Depth=1 24382 mov word ptr [rcx + 2*rdx], 0 24383 add rdx, 1 24384 cmp r9, rdx 24385 jne .LBB3_437 24386 jmp .LBB3_923 24387 .LBB3_498: 24388 and rdi, -8 24389 neg rdi 24390 xor eax, eax 24391 xorpd xmm0, xmm0 24392 .LBB3_499: # =>This Inner Loop Header: Depth=1 24393 movupd xmmword ptr [rcx + rax], xmm0 24394 movupd xmmword ptr [rcx + rax + 16], xmm0 24395 movupd xmmword ptr [rcx + rax + 32], xmm0 24396 movupd xmmword ptr [rcx + rax + 48], xmm0 24397 movupd xmmword ptr [rcx + rax + 64], xmm0 24398 movupd xmmword ptr [rcx + rax + 80], xmm0 24399 movupd xmmword ptr [rcx + rax + 96], xmm0 24400 movupd xmmword ptr [rcx + rax + 112], xmm0 24401 movupd xmmword ptr [rcx + rax + 128], xmm0 24402 movupd xmmword ptr [rcx + rax + 144], xmm0 24403 movupd xmmword ptr [rcx + rax + 160], xmm0 24404 movupd xmmword ptr [rcx + rax + 176], xmm0 24405 movupd xmmword ptr [rcx + rax + 192], xmm0 24406 movupd xmmword ptr [rcx + rax + 208], xmm0 24407 movupd xmmword ptr [rcx + rax + 224], xmm0 24408 movupd xmmword ptr [rcx + rax + 240], xmm0 24409 add rax, 256 24410 add rdi, 8 24411 jne .LBB3_499 24412 .LBB3_500: 24413 test rsi, rsi 24414 je .LBB3_503 24415 # %bb.501: 24416 add rax, rcx 24417 add rax, 16 24418 neg rsi 24419 xorpd xmm0, xmm0 24420 .LBB3_502: # =>This Inner Loop Header: Depth=1 24421 movupd xmmword ptr [rax - 16], xmm0 24422 movupd xmmword ptr [rax], xmm0 24423 add rax, 32 24424 inc rsi 24425 jne .LBB3_502 24426 .LBB3_503: 24427 cmp rdx, r9 24428 je .LBB3_923 24429 .p2align 4, 0x90 24430 .LBB3_504: # =>This Inner Loop Header: Depth=1 24431 mov byte ptr [rcx + rdx], 0 24432 add rdx, 
1 24433 cmp r9, rdx 24434 jne .LBB3_504 24435 .LBB3_923: 24436 mov rsp, rbp 24437 pop rbp 24438 ret 24439 .LBB3_530: 24440 and rdi, -4 24441 neg rdi 24442 xor eax, eax 24443 .LBB3_531: # =>This Inner Loop Header: Depth=1 24444 movups xmm0, xmmword ptr [rdx + 4*rax] 24445 movups xmm1, xmmword ptr [rdx + 4*rax + 16] 24446 movups xmmword ptr [rcx + 4*rax], xmm0 24447 movups xmmword ptr [rcx + 4*rax + 16], xmm1 24448 movups xmm0, xmmword ptr [rdx + 4*rax + 32] 24449 movups xmm1, xmmword ptr [rdx + 4*rax + 48] 24450 movups xmmword ptr [rcx + 4*rax + 32], xmm0 24451 movups xmmword ptr [rcx + 4*rax + 48], xmm1 24452 movups xmm0, xmmword ptr [rdx + 4*rax + 64] 24453 movups xmm1, xmmword ptr [rdx + 4*rax + 80] 24454 movups xmmword ptr [rcx + 4*rax + 64], xmm0 24455 movups xmmword ptr [rcx + 4*rax + 80], xmm1 24456 movupd xmm0, xmmword ptr [rdx + 4*rax + 96] 24457 movupd xmm1, xmmword ptr [rdx + 4*rax + 112] 24458 movupd xmmword ptr [rcx + 4*rax + 96], xmm0 24459 movupd xmmword ptr [rcx + 4*rax + 112], xmm1 24460 add rax, 32 24461 add rdi, 4 24462 jne .LBB3_531 24463 .LBB3_532: 24464 test r8, r8 24465 je .LBB3_535 24466 # %bb.533: 24467 lea rax, [4*rax + 16] 24468 neg r8 24469 .LBB3_534: # =>This Inner Loop Header: Depth=1 24470 movupd xmm0, xmmword ptr [rdx + rax - 16] 24471 movupd xmm1, xmmword ptr [rdx + rax] 24472 movupd xmmword ptr [rcx + rax - 16], xmm0 24473 movupd xmmword ptr [rcx + rax], xmm1 24474 add rax, 32 24475 inc r8 24476 jne .LBB3_534 24477 .LBB3_535: 24478 cmp rsi, r9 24479 je .LBB3_923 24480 jmp .LBB3_536 24481 .LBB3_540: 24482 and rdi, -4 24483 neg rdi 24484 xor eax, eax 24485 .LBB3_541: # =>This Inner Loop Header: Depth=1 24486 movups xmm0, xmmword ptr [rdx + 4*rax] 24487 movups xmm1, xmmword ptr [rdx + 4*rax + 16] 24488 movups xmmword ptr [rcx + 4*rax], xmm0 24489 movups xmmword ptr [rcx + 4*rax + 16], xmm1 24490 movups xmm0, xmmword ptr [rdx + 4*rax + 32] 24491 movups xmm1, xmmword ptr [rdx + 4*rax + 48] 24492 movups xmmword ptr [rcx + 4*rax + 32], 
xmm0 24493 movups xmmword ptr [rcx + 4*rax + 48], xmm1 24494 movups xmm0, xmmword ptr [rdx + 4*rax + 64] 24495 movups xmm1, xmmword ptr [rdx + 4*rax + 80] 24496 movups xmmword ptr [rcx + 4*rax + 64], xmm0 24497 movups xmmword ptr [rcx + 4*rax + 80], xmm1 24498 movupd xmm0, xmmword ptr [rdx + 4*rax + 96] 24499 movupd xmm1, xmmword ptr [rdx + 4*rax + 112] 24500 movupd xmmword ptr [rcx + 4*rax + 96], xmm0 24501 movupd xmmword ptr [rcx + 4*rax + 112], xmm1 24502 add rax, 32 24503 add rdi, 4 24504 jne .LBB3_541 24505 .LBB3_542: 24506 test r8, r8 24507 je .LBB3_545 24508 # %bb.543: 24509 lea rax, [4*rax + 16] 24510 neg r8 24511 .LBB3_544: # =>This Inner Loop Header: Depth=1 24512 movupd xmm0, xmmword ptr [rdx + rax - 16] 24513 movupd xmm1, xmmword ptr [rdx + rax] 24514 movupd xmmword ptr [rcx + rax - 16], xmm0 24515 movupd xmmword ptr [rcx + rax], xmm1 24516 add rax, 32 24517 inc r8 24518 jne .LBB3_544 24519 .LBB3_545: 24520 cmp rsi, r9 24521 je .LBB3_923 24522 jmp .LBB3_546 24523 .LBB3_550: 24524 and rdi, -4 24525 neg rdi 24526 xor eax, eax 24527 .LBB3_551: # =>This Inner Loop Header: Depth=1 24528 movups xmm0, xmmword ptr [rdx + 8*rax] 24529 movups xmm1, xmmword ptr [rdx + 8*rax + 16] 24530 movups xmmword ptr [rcx + 8*rax], xmm0 24531 movups xmmword ptr [rcx + 8*rax + 16], xmm1 24532 movups xmm0, xmmword ptr [rdx + 8*rax + 32] 24533 movups xmm1, xmmword ptr [rdx + 8*rax + 48] 24534 movups xmmword ptr [rcx + 8*rax + 32], xmm0 24535 movups xmmword ptr [rcx + 8*rax + 48], xmm1 24536 movups xmm0, xmmword ptr [rdx + 8*rax + 64] 24537 movups xmm1, xmmword ptr [rdx + 8*rax + 80] 24538 movups xmmword ptr [rcx + 8*rax + 64], xmm0 24539 movups xmmword ptr [rcx + 8*rax + 80], xmm1 24540 movupd xmm0, xmmword ptr [rdx + 8*rax + 96] 24541 movupd xmm1, xmmword ptr [rdx + 8*rax + 112] 24542 movupd xmmword ptr [rcx + 8*rax + 96], xmm0 24543 movupd xmmword ptr [rcx + 8*rax + 112], xmm1 24544 add rax, 16 24545 add rdi, 4 24546 jne .LBB3_551 24547 .LBB3_552: 24548 test r8, r8 24549 je 
.LBB3_555 24550 # %bb.553: 24551 lea rax, [8*rax + 16] 24552 neg r8 24553 .LBB3_554: # =>This Inner Loop Header: Depth=1 24554 movupd xmm0, xmmword ptr [rdx + rax - 16] 24555 movupd xmm1, xmmword ptr [rdx + rax] 24556 movupd xmmword ptr [rcx + rax - 16], xmm0 24557 movupd xmmword ptr [rcx + rax], xmm1 24558 add rax, 32 24559 inc r8 24560 jne .LBB3_554 24561 .LBB3_555: 24562 cmp rsi, r9 24563 je .LBB3_923 24564 jmp .LBB3_556 24565 .LBB3_560: 24566 and rdi, -4 24567 neg rdi 24568 xor eax, eax 24569 .LBB3_561: # =>This Inner Loop Header: Depth=1 24570 movups xmm0, xmmword ptr [rdx + 8*rax] 24571 movups xmm1, xmmword ptr [rdx + 8*rax + 16] 24572 movups xmmword ptr [rcx + 8*rax], xmm0 24573 movups xmmword ptr [rcx + 8*rax + 16], xmm1 24574 movups xmm0, xmmword ptr [rdx + 8*rax + 32] 24575 movups xmm1, xmmword ptr [rdx + 8*rax + 48] 24576 movups xmmword ptr [rcx + 8*rax + 32], xmm0 24577 movups xmmword ptr [rcx + 8*rax + 48], xmm1 24578 movups xmm0, xmmword ptr [rdx + 8*rax + 64] 24579 movups xmm1, xmmword ptr [rdx + 8*rax + 80] 24580 movups xmmword ptr [rcx + 8*rax + 64], xmm0 24581 movups xmmword ptr [rcx + 8*rax + 80], xmm1 24582 movupd xmm0, xmmword ptr [rdx + 8*rax + 96] 24583 movupd xmm1, xmmword ptr [rdx + 8*rax + 112] 24584 movupd xmmword ptr [rcx + 8*rax + 96], xmm0 24585 movupd xmmword ptr [rcx + 8*rax + 112], xmm1 24586 add rax, 16 24587 add rdi, 4 24588 jne .LBB3_561 24589 .LBB3_562: 24590 test r8, r8 24591 je .LBB3_565 24592 # %bb.563: 24593 lea rax, [8*rax + 16] 24594 neg r8 24595 .LBB3_564: # =>This Inner Loop Header: Depth=1 24596 movupd xmm0, xmmword ptr [rdx + rax - 16] 24597 movupd xmm1, xmmword ptr [rdx + rax] 24598 movupd xmmword ptr [rcx + rax - 16], xmm0 24599 movupd xmmword ptr [rcx + rax], xmm1 24600 add rax, 32 24601 inc r8 24602 jne .LBB3_564 24603 .LBB3_565: 24604 cmp rsi, r9 24605 je .LBB3_923 24606 jmp .LBB3_566 24607 .LBB3_570: 24608 and rdi, -4 24609 neg rdi 24610 xor eax, eax 24611 .LBB3_571: # =>This Inner Loop Header: Depth=1 24612 
movups xmm0, xmmword ptr [rdx + 2*rax] 24613 movups xmm1, xmmword ptr [rdx + 2*rax + 16] 24614 movups xmmword ptr [rcx + 2*rax], xmm0 24615 movups xmmword ptr [rcx + 2*rax + 16], xmm1 24616 movups xmm0, xmmword ptr [rdx + 2*rax + 32] 24617 movups xmm1, xmmword ptr [rdx + 2*rax + 48] 24618 movups xmmword ptr [rcx + 2*rax + 32], xmm0 24619 movups xmmword ptr [rcx + 2*rax + 48], xmm1 24620 movups xmm0, xmmword ptr [rdx + 2*rax + 64] 24621 movups xmm1, xmmword ptr [rdx + 2*rax + 80] 24622 movups xmmword ptr [rcx + 2*rax + 64], xmm0 24623 movups xmmword ptr [rcx + 2*rax + 80], xmm1 24624 movupd xmm0, xmmword ptr [rdx + 2*rax + 96] 24625 movupd xmm1, xmmword ptr [rdx + 2*rax + 112] 24626 movupd xmmword ptr [rcx + 2*rax + 96], xmm0 24627 movupd xmmword ptr [rcx + 2*rax + 112], xmm1 24628 add rax, 64 24629 add rdi, 4 24630 jne .LBB3_571 24631 .LBB3_572: 24632 test r8, r8 24633 je .LBB3_575 24634 # %bb.573: 24635 add rax, rax 24636 add rax, 16 24637 neg r8 24638 .LBB3_574: # =>This Inner Loop Header: Depth=1 24639 movupd xmm0, xmmword ptr [rdx + rax - 16] 24640 movupd xmm1, xmmword ptr [rdx + rax] 24641 movupd xmmword ptr [rcx + rax - 16], xmm0 24642 movupd xmmword ptr [rcx + rax], xmm1 24643 add rax, 32 24644 inc r8 24645 jne .LBB3_574 24646 .LBB3_575: 24647 cmp rsi, r9 24648 je .LBB3_923 24649 jmp .LBB3_576 24650 .LBB3_580: 24651 and rdi, -4 24652 neg rdi 24653 xor eax, eax 24654 .LBB3_581: # =>This Inner Loop Header: Depth=1 24655 movups xmm0, xmmword ptr [rdx + 2*rax] 24656 movups xmm1, xmmword ptr [rdx + 2*rax + 16] 24657 movups xmmword ptr [rcx + 2*rax], xmm0 24658 movups xmmword ptr [rcx + 2*rax + 16], xmm1 24659 movups xmm0, xmmword ptr [rdx + 2*rax + 32] 24660 movups xmm1, xmmword ptr [rdx + 2*rax + 48] 24661 movups xmmword ptr [rcx + 2*rax + 32], xmm0 24662 movups xmmword ptr [rcx + 2*rax + 48], xmm1 24663 movups xmm0, xmmword ptr [rdx + 2*rax + 64] 24664 movups xmm1, xmmword ptr [rdx + 2*rax + 80] 24665 movups xmmword ptr [rcx + 2*rax + 64], xmm0 24666 movups 
xmmword ptr [rcx + 2*rax + 80], xmm1 24667 movupd xmm0, xmmword ptr [rdx + 2*rax + 96] 24668 movupd xmm1, xmmword ptr [rdx + 2*rax + 112] 24669 movupd xmmword ptr [rcx + 2*rax + 96], xmm0 24670 movupd xmmword ptr [rcx + 2*rax + 112], xmm1 24671 add rax, 64 24672 add rdi, 4 24673 jne .LBB3_581 24674 .LBB3_582: 24675 test r8, r8 24676 je .LBB3_585 24677 # %bb.583: 24678 add rax, rax 24679 add rax, 16 24680 neg r8 24681 .LBB3_584: # =>This Inner Loop Header: Depth=1 24682 movupd xmm0, xmmword ptr [rdx + rax - 16] 24683 movupd xmm1, xmmword ptr [rdx + rax] 24684 movupd xmmword ptr [rcx + rax - 16], xmm0 24685 movupd xmmword ptr [rcx + rax], xmm1 24686 add rax, 32 24687 inc r8 24688 jne .LBB3_584 24689 .LBB3_585: 24690 cmp rsi, r9 24691 je .LBB3_923 24692 jmp .LBB3_586 24693 .LBB3_590: 24694 and rdi, -4 24695 neg rdi 24696 xor eax, eax 24697 .LBB3_591: # =>This Inner Loop Header: Depth=1 24698 movups xmm0, xmmword ptr [rdx + rax] 24699 movups xmm1, xmmword ptr [rdx + rax + 16] 24700 movups xmmword ptr [rcx + rax], xmm0 24701 movups xmmword ptr [rcx + rax + 16], xmm1 24702 movups xmm0, xmmword ptr [rdx + rax + 32] 24703 movups xmm1, xmmword ptr [rdx + rax + 48] 24704 movups xmmword ptr [rcx + rax + 32], xmm0 24705 movups xmmword ptr [rcx + rax + 48], xmm1 24706 movups xmm0, xmmword ptr [rdx + rax + 64] 24707 movups xmm1, xmmword ptr [rdx + rax + 80] 24708 movups xmmword ptr [rcx + rax + 64], xmm0 24709 movups xmmword ptr [rcx + rax + 80], xmm1 24710 movupd xmm0, xmmword ptr [rdx + rax + 96] 24711 movupd xmm1, xmmword ptr [rdx + rax + 112] 24712 movupd xmmword ptr [rcx + rax + 96], xmm0 24713 movupd xmmword ptr [rcx + rax + 112], xmm1 24714 sub rax, -128 24715 add rdi, 4 24716 jne .LBB3_591 24717 .LBB3_592: 24718 test r8, r8 24719 je .LBB3_595 24720 # %bb.593: 24721 add rax, 16 24722 neg r8 24723 .LBB3_594: # =>This Inner Loop Header: Depth=1 24724 movupd xmm0, xmmword ptr [rdx + rax - 16] 24725 movupd xmm1, xmmword ptr [rdx + rax] 24726 movupd xmmword ptr [rcx + rax - 
16], xmm0 24727 movupd xmmword ptr [rcx + rax], xmm1 24728 add rax, 32 24729 inc r8 24730 jne .LBB3_594 24731 .LBB3_595: 24732 cmp rsi, r9 24733 je .LBB3_923 24734 jmp .LBB3_596 24735 .LBB3_600: 24736 and rdi, -4 24737 neg rdi 24738 xor eax, eax 24739 .LBB3_601: # =>This Inner Loop Header: Depth=1 24740 movups xmm0, xmmword ptr [rdx + rax] 24741 movups xmm1, xmmword ptr [rdx + rax + 16] 24742 movups xmmword ptr [rcx + rax], xmm0 24743 movups xmmword ptr [rcx + rax + 16], xmm1 24744 movups xmm0, xmmword ptr [rdx + rax + 32] 24745 movups xmm1, xmmword ptr [rdx + rax + 48] 24746 movups xmmword ptr [rcx + rax + 32], xmm0 24747 movups xmmword ptr [rcx + rax + 48], xmm1 24748 movups xmm0, xmmword ptr [rdx + rax + 64] 24749 movups xmm1, xmmword ptr [rdx + rax + 80] 24750 movups xmmword ptr [rcx + rax + 64], xmm0 24751 movups xmmword ptr [rcx + rax + 80], xmm1 24752 movupd xmm0, xmmword ptr [rdx + rax + 96] 24753 movupd xmm1, xmmword ptr [rdx + rax + 112] 24754 movupd xmmword ptr [rcx + rax + 96], xmm0 24755 movupd xmmword ptr [rcx + rax + 112], xmm1 24756 sub rax, -128 24757 add rdi, 4 24758 jne .LBB3_601 24759 .LBB3_602: 24760 test r8, r8 24761 je .LBB3_605 24762 # %bb.603: 24763 add rax, 16 24764 neg r8 24765 .LBB3_604: # =>This Inner Loop Header: Depth=1 24766 movupd xmm0, xmmword ptr [rdx + rax - 16] 24767 movupd xmm1, xmmword ptr [rdx + rax] 24768 movupd xmmword ptr [rcx + rax - 16], xmm0 24769 movupd xmmword ptr [rcx + rax], xmm1 24770 add rax, 32 24771 inc r8 24772 jne .LBB3_604 24773 .LBB3_605: 24774 cmp rsi, r9 24775 je .LBB3_923 24776 jmp .LBB3_606 24777 .LBB3_610: 24778 xor edi, edi 24779 .LBB3_611: 24780 test r8b, 1 24781 je .LBB3_613 24782 # %bb.612: 24783 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 24784 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 24785 pxor xmm2, xmm2 24786 pxor xmm3, xmm3 24787 psubd xmm3, xmm0 24788 psubd xmm2, xmm1 24789 movdqu xmmword ptr [rcx + 4*rdi], xmm3 24790 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm2 24791 .LBB3_613: 24792 cmp rsi, r9 
24793 je .LBB3_923 24794 jmp .LBB3_614 24795 .LBB3_618: 24796 xor edi, edi 24797 .LBB3_619: 24798 test r8b, 1 24799 je .LBB3_621 24800 # %bb.620: 24801 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 24802 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 24803 pxor xmm2, xmm2 24804 pcmpeqd xmm0, xmm2 24805 movdqa xmm3, xmmword ptr [rip + .LCPI3_3] # xmm3 = [1,1,1,1] 24806 pandn xmm0, xmm3 24807 pcmpeqd xmm1, xmm2 24808 pandn xmm1, xmm3 24809 movdqu xmmword ptr [rcx + 4*rdi], xmm0 24810 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 24811 .LBB3_621: 24812 cmp rsi, r9 24813 je .LBB3_923 24814 jmp .LBB3_622 24815 .LBB3_626: 24816 xor edi, edi 24817 .LBB3_627: 24818 test r8b, 1 24819 je .LBB3_629 24820 # %bb.628: 24821 movupd xmm0, xmmword ptr [rdx + 8*rdi] 24822 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 24823 movapd xmm2, xmmword ptr [rip + .LCPI3_0] # xmm2 = [-0.0E+0,-0.0E+0] 24824 xorpd xmm0, xmm2 24825 xorpd xmm1, xmm2 24826 movupd xmmword ptr [rcx + 8*rdi], xmm0 24827 movupd xmmword ptr [rcx + 8*rdi + 16], xmm1 24828 .LBB3_629: 24829 cmp rsi, r9 24830 je .LBB3_923 24831 jmp .LBB3_630 24832 .LBB3_636: 24833 xor edi, edi 24834 .LBB3_637: 24835 test r8b, 1 24836 je .LBB3_639 24837 # %bb.638: 24838 movupd xmm0, xmmword ptr [rdx + 8*rdi] 24839 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 24840 movapd xmm2, xmmword ptr [rip + .LCPI3_0] # xmm2 = [-0.0E+0,-0.0E+0] 24841 xorpd xmm0, xmm2 24842 xorpd xmm1, xmm2 24843 movupd xmmword ptr [rcx + 8*rdi], xmm0 24844 movupd xmmword ptr [rcx + 8*rdi + 16], xmm1 24845 .LBB3_639: 24846 cmp rsi, r9 24847 je .LBB3_923 24848 jmp .LBB3_640 24849 .LBB3_646: 24850 xor edi, edi 24851 .LBB3_647: 24852 test r8b, 1 24853 je .LBB3_649 24854 # %bb.648: 24855 movupd xmm0, xmmword ptr [rdx + 8*rdi] 24856 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 24857 xorpd xmm2, xmm2 24858 movapd xmm3, xmmword ptr [rip + .LCPI3_0] # xmm3 = [-0.0E+0,-0.0E+0] 24859 movapd xmm4, xmm0 24860 andpd xmm4, xmm3 24861 movapd xmm5, xmmword ptr [rip + .LCPI3_1] # xmm5 = [1.0E+0,1.0E+0] 
24862 orpd xmm4, xmm5 24863 andpd xmm3, xmm1 24864 orpd xmm3, xmm5 24865 cmpneqpd xmm0, xmm2 24866 andpd xmm0, xmm4 24867 cmpneqpd xmm1, xmm2 24868 andpd xmm1, xmm3 24869 movupd xmmword ptr [rcx + 8*rdi], xmm0 24870 movupd xmmword ptr [rcx + 8*rdi + 16], xmm1 24871 .LBB3_649: 24872 cmp rsi, r9 24873 je .LBB3_923 24874 jmp .LBB3_650 24875 .LBB3_655: 24876 xor edi, edi 24877 .LBB3_656: 24878 test r8b, 1 24879 je .LBB3_658 24880 # %bb.657: 24881 movupd xmm0, xmmword ptr [rdx + 8*rdi] 24882 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 24883 movapd xmm2, xmmword ptr [rip + .LCPI3_8] # xmm2 = [9223372036854775807,9223372036854775807] 24884 andpd xmm0, xmm2 24885 andpd xmm1, xmm2 24886 movupd xmmword ptr [rcx + 8*rdi], xmm0 24887 movupd xmmword ptr [rcx + 8*rdi + 16], xmm1 24888 .LBB3_658: 24889 cmp rsi, r9 24890 je .LBB3_923 24891 jmp .LBB3_659 24892 .LBB3_663: 24893 xor edi, edi 24894 .LBB3_664: 24895 test r8b, 1 24896 je .LBB3_666 24897 # %bb.665: 24898 movupd xmm0, xmmword ptr [rdx + 8*rdi] 24899 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] 24900 movapd xmm2, xmmword ptr [rip + .LCPI3_8] # xmm2 = [9223372036854775807,9223372036854775807] 24901 andpd xmm0, xmm2 24902 andpd xmm1, xmm2 24903 movupd xmmword ptr [rcx + 8*rdi], xmm0 24904 movupd xmmword ptr [rcx + 8*rdi + 16], xmm1 24905 .LBB3_666: 24906 cmp rsi, r9 24907 je .LBB3_923 24908 jmp .LBB3_667 24909 .LBB3_671: 24910 xor edi, edi 24911 .LBB3_672: 24912 test r8b, 1 24913 je .LBB3_674 24914 # %bb.673: 24915 movdqu xmm0, xmmword ptr [rdx + rdi] 24916 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 24917 pxor xmm2, xmm2 24918 pxor xmm3, xmm3 24919 psubb xmm3, xmm0 24920 psubb xmm2, xmm1 24921 movdqu xmmword ptr [rcx + rdi], xmm3 24922 movdqu xmmword ptr [rcx + rdi + 16], xmm2 24923 .LBB3_674: 24924 cmp rsi, r9 24925 je .LBB3_923 24926 jmp .LBB3_675 24927 .LBB3_679: 24928 xor edi, edi 24929 .LBB3_680: 24930 test r8b, 1 24931 je .LBB3_682 24932 # %bb.681: 24933 movdqu xmm0, xmmword ptr [rdx + rdi] 24934 movdqu xmm1, xmmword ptr 
[rdx + rdi + 16] 24935 pxor xmm2, xmm2 24936 pxor xmm3, xmm3 24937 psubb xmm3, xmm0 24938 psubb xmm2, xmm1 24939 movdqu xmmword ptr [rcx + rdi], xmm3 24940 movdqu xmmword ptr [rcx + rdi + 16], xmm2 24941 .LBB3_682: 24942 cmp rsi, r9 24943 je .LBB3_923 24944 jmp .LBB3_683 24945 .LBB3_687: 24946 xor edi, edi 24947 .LBB3_688: 24948 test r8b, 1 24949 je .LBB3_690 24950 # %bb.689: 24951 movdqu xmm1, xmmword ptr [rdx + rdi] 24952 movdqu xmm2, xmmword ptr [rdx + rdi + 16] 24953 pxor xmm3, xmm3 24954 movdqa xmm4, xmmword ptr [rip + .LCPI3_6] # xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 24955 movdqa xmm0, xmm4 24956 pcmpgtb xmm0, xmm1 24957 movdqa xmm5, xmm1 24958 pcmpeqb xmm5, xmm3 24959 pcmpeqd xmm1, xmm1 24960 pxor xmm5, xmm1 24961 pcmpeqb xmm3, xmm2 24962 pxor xmm3, xmm1 24963 movdqa xmm1, xmm4 24964 pcmpgtb xmm1, xmm2 24965 movdqa xmm2, xmm4 24966 pblendvb xmm2, xmm5, xmm0 24967 movdqa xmm0, xmm1 24968 pblendvb xmm4, xmm3, xmm0 24969 movdqu xmmword ptr [rcx + rdi], xmm2 24970 movdqu xmmword ptr [rcx + rdi + 16], xmm4 24971 .LBB3_690: 24972 cmp rsi, r9 24973 je .LBB3_923 24974 jmp .LBB3_691 24975 .LBB3_696: 24976 xor edi, edi 24977 .LBB3_697: 24978 test r8b, 1 24979 je .LBB3_699 24980 # %bb.698: 24981 pmovsxbd xmm3, dword ptr [rdx + rdi + 12] 24982 pmovsxbd xmm0, dword ptr [rdx + rdi + 8] 24983 pmovsxbd xmm2, dword ptr [rdx + rdi + 4] 24984 pmovsxbd xmm1, dword ptr [rdx + rdi] 24985 movdqa xmm4, xmm1 24986 psrad xmm4, 7 24987 movdqa xmm5, xmm2 24988 psrad xmm5, 7 24989 movdqa xmm6, xmm0 24990 psrad xmm6, 7 24991 movdqa xmm7, xmm3 24992 psrad xmm7, 7 24993 paddd xmm3, xmm7 24994 paddd xmm0, xmm6 24995 paddd xmm2, xmm5 24996 paddd xmm1, xmm4 24997 pxor xmm1, xmm4 24998 pxor xmm2, xmm5 24999 pxor xmm0, xmm6 25000 pxor xmm3, xmm7 25001 movdqa xmm4, xmmword ptr [rip + .LCPI3_10] # xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 25002 pand xmm3, xmm4 25003 pand xmm0, xmm4 25004 packusdw xmm0, xmm3 25005 pand xmm2, xmm4 25006 pand xmm1, xmm4 25007 packusdw xmm1, xmm2 25008 
packuswb xmm1, xmm0 25009 movdqu xmmword ptr [rcx + rdi], xmm1 25010 .LBB3_699: 25011 cmp rsi, r9 25012 je .LBB3_923 25013 jmp .LBB3_700 25014 .LBB3_704: 25015 xor edi, edi 25016 .LBB3_705: 25017 test r8b, 1 25018 je .LBB3_707 25019 # %bb.706: 25020 pmovsxbd xmm3, dword ptr [rdx + rdi + 12] 25021 pmovsxbd xmm0, dword ptr [rdx + rdi + 8] 25022 pmovsxbd xmm2, dword ptr [rdx + rdi + 4] 25023 pmovsxbd xmm1, dword ptr [rdx + rdi] 25024 movdqa xmm4, xmm1 25025 psrad xmm4, 7 25026 movdqa xmm5, xmm2 25027 psrad xmm5, 7 25028 movdqa xmm6, xmm0 25029 psrad xmm6, 7 25030 movdqa xmm7, xmm3 25031 psrad xmm7, 7 25032 paddd xmm3, xmm7 25033 paddd xmm0, xmm6 25034 paddd xmm2, xmm5 25035 paddd xmm1, xmm4 25036 pxor xmm1, xmm4 25037 pxor xmm2, xmm5 25038 pxor xmm0, xmm6 25039 pxor xmm3, xmm7 25040 movdqa xmm4, xmmword ptr [rip + .LCPI3_10] # xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 25041 pand xmm3, xmm4 25042 pand xmm0, xmm4 25043 packusdw xmm0, xmm3 25044 pand xmm2, xmm4 25045 pand xmm1, xmm4 25046 packusdw xmm1, xmm2 25047 packuswb xmm1, xmm0 25048 movdqu xmmword ptr [rcx + rdi], xmm1 25049 .LBB3_707: 25050 cmp rsi, r9 25051 je .LBB3_923 25052 jmp .LBB3_708 25053 .LBB3_712: 25054 xor edi, edi 25055 .LBB3_713: 25056 test r8b, 1 25057 je .LBB3_715 25058 # %bb.714: 25059 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 25060 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 25061 pxor xmm2, xmm2 25062 pxor xmm3, xmm3 25063 psubq xmm3, xmm0 25064 psubq xmm2, xmm1 25065 movdqu xmmword ptr [rcx + 8*rdi], xmm3 25066 movdqu xmmword ptr [rcx + 8*rdi + 16], xmm2 25067 .LBB3_715: 25068 cmp rsi, r9 25069 je .LBB3_923 25070 jmp .LBB3_716 25071 .LBB3_720: 25072 xor edi, edi 25073 .LBB3_721: 25074 test r8b, 1 25075 je .LBB3_723 25076 # %bb.722: 25077 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 25078 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 25079 pxor xmm2, xmm2 25080 pcmpeqq xmm0, xmm2 25081 movdqa xmm3, xmmword ptr [rip + .LCPI3_4] # xmm3 = [1,1] 25082 pandn xmm0, xmm3 25083 pcmpeqq xmm1, xmm2 25084 
pandn xmm1, xmm3 25085 movdqu xmmword ptr [rcx + 8*rdi], xmm0 25086 movdqu xmmword ptr [rcx + 8*rdi + 16], xmm1 25087 .LBB3_723: 25088 cmp rsi, r9 25089 je .LBB3_923 25090 jmp .LBB3_724 25091 .LBB3_728: 25092 xor edi, edi 25093 .LBB3_729: 25094 test r8b, 1 25095 je .LBB3_731 25096 # %bb.730: 25097 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 25098 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 25099 pxor xmm2, xmm2 25100 pxor xmm3, xmm3 25101 psubw xmm3, xmm0 25102 psubw xmm2, xmm1 25103 movdqu xmmword ptr [rcx + 2*rdi], xmm3 25104 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm2 25105 .LBB3_731: 25106 cmp rsi, r9 25107 je .LBB3_923 25108 jmp .LBB3_732 25109 .LBB3_736: 25110 xor edi, edi 25111 .LBB3_737: 25112 test r8b, 1 25113 je .LBB3_739 25114 # %bb.738: 25115 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 25116 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 25117 pxor xmm2, xmm2 25118 pxor xmm3, xmm3 25119 psubw xmm3, xmm0 25120 psubw xmm2, xmm1 25121 movdqu xmmword ptr [rcx + 2*rdi], xmm3 25122 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm2 25123 .LBB3_739: 25124 cmp rsi, r9 25125 je .LBB3_923 25126 jmp .LBB3_740 25127 .LBB3_744: 25128 xor edi, edi 25129 .LBB3_745: 25130 test r8b, 1 25131 je .LBB3_747 25132 # %bb.746: 25133 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 25134 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 25135 pxor xmm2, xmm2 25136 pxor xmm3, xmm3 25137 psubw xmm3, xmm0 25138 psubw xmm2, xmm1 25139 movdqu xmmword ptr [rcx + 2*rdi], xmm3 25140 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm2 25141 .LBB3_747: 25142 cmp rsi, r9 25143 je .LBB3_923 25144 jmp .LBB3_748 25145 .LBB3_752: 25146 xor edi, edi 25147 .LBB3_753: 25148 test r8b, 1 25149 je .LBB3_755 25150 # %bb.754: 25151 movdqu xmm0, xmmword ptr [rdx + 2*rdi] 25152 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] 25153 pxor xmm2, xmm2 25154 pcmpeqw xmm0, xmm2 25155 movdqa xmm3, xmmword ptr [rip + .LCPI3_5] # xmm3 = [1,1,1,1,1,1,1,1] 25156 pandn xmm0, xmm3 25157 pcmpeqw xmm1, xmm2 25158 pandn xmm1, xmm3 25159 movdqu xmmword ptr [rcx + 
2*rdi], xmm0 25160 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 25161 .LBB3_755: 25162 cmp rsi, r9 25163 je .LBB3_923 25164 jmp .LBB3_756 25165 .LBB3_760: 25166 xor edi, edi 25167 .LBB3_761: 25168 test r8b, 1 25169 je .LBB3_763 25170 # %bb.762: 25171 movdqu xmm1, xmmword ptr [rdx + 2*rdi] 25172 movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] 25173 pxor xmm3, xmm3 25174 movdqa xmm4, xmmword ptr [rip + .LCPI3_5] # xmm4 = [1,1,1,1,1,1,1,1] 25175 movdqa xmm0, xmm4 25176 pcmpgtw xmm0, xmm1 25177 movdqa xmm5, xmm1 25178 pcmpeqw xmm5, xmm3 25179 pcmpeqd xmm1, xmm1 25180 pxor xmm5, xmm1 25181 pcmpeqw xmm3, xmm2 25182 pxor xmm3, xmm1 25183 movdqa xmm1, xmm4 25184 pcmpgtw xmm1, xmm2 25185 movdqa xmm2, xmm4 25186 pblendvb xmm2, xmm5, xmm0 25187 movdqa xmm0, xmm1 25188 pblendvb xmm4, xmm3, xmm0 25189 movdqu xmmword ptr [rcx + 2*rdi], xmm2 25190 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm4 25191 .LBB3_763: 25192 cmp rsi, r9 25193 je .LBB3_923 25194 jmp .LBB3_764 25195 .LBB3_769: 25196 xor edi, edi 25197 .LBB3_770: 25198 test r8b, 1 25199 je .LBB3_772 25200 # %bb.771: 25201 pmovsxwd xmm0, qword ptr [rdx + 2*rdi + 8] 25202 pmovsxwd xmm1, qword ptr [rdx + 2*rdi] 25203 movdqa xmm2, xmm1 25204 psrad xmm2, 15 25205 movdqa xmm3, xmm0 25206 psrad xmm3, 15 25207 paddd xmm0, xmm3 25208 paddd xmm1, xmm2 25209 pxor xmm1, xmm2 25210 pxor xmm0, xmm3 25211 pxor xmm2, xmm2 25212 pblendw xmm0, xmm2, 170 # xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] 25213 pblendw xmm1, xmm2, 170 # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] 25214 packusdw xmm1, xmm0 25215 movdqu xmmword ptr [rcx + 2*rdi], xmm1 25216 .LBB3_772: 25217 cmp rsi, r9 25218 je .LBB3_923 25219 jmp .LBB3_773 25220 .LBB3_777: 25221 xor edi, edi 25222 .LBB3_778: 25223 test r8b, 1 25224 je .LBB3_780 25225 # %bb.779: 25226 pmovsxwd xmm0, qword ptr [rdx + 2*rdi + 8] 25227 pmovsxwd xmm1, qword ptr [rdx + 2*rdi] 25228 movdqa xmm2, xmm1 25229 psrad xmm2, 15 25230 movdqa xmm3, xmm0 25231 psrad xmm3, 
15 25232 paddd xmm0, xmm3 25233 paddd xmm1, xmm2 25234 pxor xmm1, xmm2 25235 pxor xmm0, xmm3 25236 pxor xmm2, xmm2 25237 pblendw xmm0, xmm2, 170 # xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] 25238 pblendw xmm1, xmm2, 170 # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] 25239 packusdw xmm1, xmm0 25240 movdqu xmmword ptr [rcx + 2*rdi], xmm1 25241 .LBB3_780: 25242 cmp rsi, r9 25243 je .LBB3_923 25244 jmp .LBB3_781 25245 .LBB3_785: 25246 xor edi, edi 25247 .LBB3_786: 25248 test r8b, 1 25249 je .LBB3_788 25250 # %bb.787: 25251 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 25252 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 25253 pxor xmm2, xmm2 25254 pxor xmm3, xmm3 25255 psubq xmm3, xmm0 25256 psubq xmm2, xmm1 25257 movdqu xmmword ptr [rcx + 8*rdi], xmm3 25258 movdqu xmmword ptr [rcx + 8*rdi + 16], xmm2 25259 .LBB3_788: 25260 cmp rsi, r9 25261 je .LBB3_923 25262 jmp .LBB3_789 25263 .LBB3_793: 25264 xor edi, edi 25265 .LBB3_794: 25266 test r8b, 1 25267 je .LBB3_796 25268 # %bb.795: 25269 movupd xmm0, xmmword ptr [rdx + 4*rdi] 25270 movupd xmm1, xmmword ptr [rdx + 4*rdi + 16] 25271 movapd xmm2, xmmword ptr [rip + .LCPI3_7] # xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 25272 xorpd xmm0, xmm2 25273 xorpd xmm1, xmm2 25274 movupd xmmword ptr [rcx + 4*rdi], xmm0 25275 movupd xmmword ptr [rcx + 4*rdi + 16], xmm1 25276 .LBB3_796: 25277 cmp rsi, r9 25278 je .LBB3_923 25279 jmp .LBB3_797 25280 .LBB3_803: 25281 xor edi, edi 25282 .LBB3_804: 25283 test r8b, 1 25284 je .LBB3_806 25285 # %bb.805: 25286 movdqu xmm0, xmmword ptr [rdx + 8*rdi] 25287 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] 25288 pxor xmm2, xmm2 25289 pxor xmm3, xmm3 25290 psubq xmm3, xmm0 25291 psubq xmm2, xmm1 25292 movdqu xmmword ptr [rcx + 8*rdi], xmm3 25293 movdqu xmmword ptr [rcx + 8*rdi + 16], xmm2 25294 .LBB3_806: 25295 cmp rsi, r9 25296 je .LBB3_923 25297 jmp .LBB3_807 25298 .LBB3_811: 25299 xor edi, edi 25300 .LBB3_812: 25301 test r8b, 1 25302 je .LBB3_814 25303 # 
%bb.813: 25304 movupd xmm0, xmmword ptr [rdx + 4*rdi] 25305 movupd xmm1, xmmword ptr [rdx + 4*rdi + 16] 25306 movapd xmm2, xmmword ptr [rip + .LCPI3_7] # xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 25307 xorpd xmm0, xmm2 25308 xorpd xmm1, xmm2 25309 movupd xmmword ptr [rcx + 4*rdi], xmm0 25310 movupd xmmword ptr [rcx + 4*rdi + 16], xmm1 25311 .LBB3_814: 25312 cmp rsi, r9 25313 je .LBB3_923 25314 jmp .LBB3_815 25315 .LBB3_821: 25316 xor edi, edi 25317 .LBB3_822: 25318 test r8b, 1 25319 je .LBB3_824 25320 # %bb.823: 25321 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 25322 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 25323 pxor xmm3, xmm3 25324 movdqa xmm4, xmmword ptr [rip + .LCPI3_4] # xmm4 = [1,1] 25325 movdqa xmm0, xmm4 25326 pcmpgtq xmm0, xmm1 25327 movdqa xmm5, xmm1 25328 pcmpeqq xmm5, xmm3 25329 pcmpeqd xmm1, xmm1 25330 pxor xmm5, xmm1 25331 pcmpeqq xmm3, xmm2 25332 pxor xmm3, xmm1 25333 movdqa xmm1, xmm4 25334 pcmpgtq xmm1, xmm2 25335 movdqa xmm2, xmm4 25336 blendvpd xmm2, xmm5, xmm0 25337 movdqa xmm0, xmm1 25338 blendvpd xmm4, xmm3, xmm0 25339 movupd xmmword ptr [rcx + 8*rdi], xmm2 25340 movupd xmmword ptr [rcx + 8*rdi + 16], xmm4 25341 .LBB3_824: 25342 cmp rsi, r9 25343 je .LBB3_923 25344 jmp .LBB3_825 25345 .LBB3_830: 25346 xor edi, edi 25347 .LBB3_831: 25348 test r8b, 1 25349 je .LBB3_833 25350 # %bb.832: 25351 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 25352 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 25353 pxor xmm3, xmm3 25354 pxor xmm4, xmm4 25355 psubq xmm4, xmm1 25356 movdqa xmm0, xmm1 25357 blendvpd xmm1, xmm4, xmm0 25358 psubq xmm3, xmm2 25359 movdqa xmm0, xmm2 25360 blendvpd xmm2, xmm3, xmm0 25361 movupd xmmword ptr [rcx + 8*rdi], xmm1 25362 movupd xmmword ptr [rcx + 8*rdi + 16], xmm2 25363 .LBB3_833: 25364 cmp rsi, r9 25365 je .LBB3_923 25366 jmp .LBB3_834 25367 .LBB3_838: 25368 xor edi, edi 25369 .LBB3_839: 25370 test r8b, 1 25371 je .LBB3_841 25372 # %bb.840: 25373 movupd xmm0, xmmword ptr [rdx + 4*rdi] 25374 movupd xmm1, xmmword ptr [rdx + 4*rdi + 16] 25375 
movapd xmm2, xmmword ptr [rip + .LCPI3_9] # xmm2 = [2147483647,2147483647,2147483647,2147483647] 25376 andpd xmm0, xmm2 25377 andpd xmm1, xmm2 25378 movupd xmmword ptr [rcx + 4*rdi], xmm0 25379 movupd xmmword ptr [rcx + 4*rdi + 16], xmm1 25380 .LBB3_841: 25381 cmp rsi, r9 25382 je .LBB3_923 25383 jmp .LBB3_842 25384 .LBB3_848: 25385 xor edi, edi 25386 .LBB3_849: 25387 test r8b, 1 25388 je .LBB3_851 25389 # %bb.850: 25390 movdqu xmm1, xmmword ptr [rdx + 8*rdi] 25391 movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] 25392 pxor xmm3, xmm3 25393 pxor xmm4, xmm4 25394 psubq xmm4, xmm1 25395 movdqa xmm0, xmm1 25396 blendvpd xmm1, xmm4, xmm0 25397 psubq xmm3, xmm2 25398 movdqa xmm0, xmm2 25399 blendvpd xmm2, xmm3, xmm0 25400 movupd xmmword ptr [rcx + 8*rdi], xmm1 25401 movupd xmmword ptr [rcx + 8*rdi + 16], xmm2 25402 .LBB3_851: 25403 cmp rsi, r9 25404 je .LBB3_923 25405 jmp .LBB3_852 25406 .LBB3_856: 25407 xor edi, edi 25408 .LBB3_857: 25409 test r8b, 1 25410 je .LBB3_859 25411 # %bb.858: 25412 movupd xmm0, xmmword ptr [rdx + 4*rdi] 25413 movupd xmm1, xmmword ptr [rdx + 4*rdi + 16] 25414 movapd xmm2, xmmword ptr [rip + .LCPI3_9] # xmm2 = [2147483647,2147483647,2147483647,2147483647] 25415 andpd xmm0, xmm2 25416 andpd xmm1, xmm2 25417 movupd xmmword ptr [rcx + 4*rdi], xmm0 25418 movupd xmmword ptr [rcx + 4*rdi + 16], xmm1 25419 .LBB3_859: 25420 cmp rsi, r9 25421 je .LBB3_923 25422 jmp .LBB3_860 25423 .LBB3_866: 25424 xor edi, edi 25425 .LBB3_867: 25426 test r8b, 1 25427 je .LBB3_869 25428 # %bb.868: 25429 movdqu xmm0, xmmword ptr [rdx + rdi] 25430 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 25431 pxor xmm2, xmm2 25432 pxor xmm3, xmm3 25433 psubb xmm3, xmm0 25434 psubb xmm2, xmm1 25435 movdqu xmmword ptr [rcx + rdi], xmm3 25436 movdqu xmmword ptr [rcx + rdi + 16], xmm2 25437 .LBB3_869: 25438 cmp rsi, r9 25439 je .LBB3_923 25440 jmp .LBB3_870 25441 .LBB3_874: 25442 xor edi, edi 25443 .LBB3_875: 25444 test r8b, 1 25445 je .LBB3_877 25446 # %bb.876: 25447 movdqu xmm0, xmmword ptr 
[rdx + rdi] 25448 movdqu xmm1, xmmword ptr [rdx + rdi + 16] 25449 pxor xmm2, xmm2 25450 pcmpeqb xmm0, xmm2 25451 movdqa xmm3, xmmword ptr [rip + .LCPI3_6] # xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 25452 pandn xmm0, xmm3 25453 pcmpeqb xmm1, xmm2 25454 pandn xmm1, xmm3 25455 movdqu xmmword ptr [rcx + rdi], xmm0 25456 movdqu xmmword ptr [rcx + rdi + 16], xmm1 25457 .LBB3_877: 25458 cmp rsi, r9 25459 je .LBB3_923 25460 jmp .LBB3_878 25461 .LBB3_882: 25462 xor edi, edi 25463 .LBB3_883: 25464 test r8b, 1 25465 je .LBB3_885 25466 # %bb.884: 25467 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 25468 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 25469 pxor xmm2, xmm2 25470 pxor xmm3, xmm3 25471 psubd xmm3, xmm0 25472 psubd xmm2, xmm1 25473 movdqu xmmword ptr [rcx + 4*rdi], xmm3 25474 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm2 25475 .LBB3_885: 25476 cmp rsi, r9 25477 je .LBB3_923 25478 jmp .LBB3_886 25479 .LBB3_890: 25480 xor edi, edi 25481 .LBB3_891: 25482 test r8b, 1 25483 je .LBB3_893 25484 # %bb.892: 25485 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 25486 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 25487 pxor xmm2, xmm2 25488 pxor xmm3, xmm3 25489 psubd xmm3, xmm0 25490 psubd xmm2, xmm1 25491 movdqu xmmword ptr [rcx + 4*rdi], xmm3 25492 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm2 25493 .LBB3_893: 25494 cmp rsi, r9 25495 je .LBB3_923 25496 jmp .LBB3_894 25497 .LBB3_898: 25498 xor edi, edi 25499 .LBB3_899: 25500 test r8b, 1 25501 je .LBB3_901 25502 # %bb.900: 25503 movdqu xmm1, xmmword ptr [rdx + 4*rdi] 25504 movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] 25505 pxor xmm3, xmm3 25506 movdqa xmm4, xmmword ptr [rip + .LCPI3_3] # xmm4 = [1,1,1,1] 25507 movdqa xmm0, xmm4 25508 pcmpgtd xmm0, xmm1 25509 movdqa xmm5, xmm1 25510 pcmpeqd xmm5, xmm3 25511 pcmpeqd xmm1, xmm1 25512 pxor xmm5, xmm1 25513 pcmpeqd xmm3, xmm2 25514 pxor xmm3, xmm1 25515 movdqa xmm1, xmm4 25516 pcmpgtd xmm1, xmm2 25517 movdqa xmm2, xmm4 25518 blendvps xmm2, xmm5, xmm0 25519 movdqa xmm0, xmm1 25520 blendvps xmm4, xmm3, xmm0 
25521 movups xmmword ptr [rcx + 4*rdi], xmm2 25522 movups xmmword ptr [rcx + 4*rdi + 16], xmm4 25523 .LBB3_901: 25524 cmp rsi, r9 25525 je .LBB3_923 25526 jmp .LBB3_902 25527 .LBB3_907: 25528 xor edi, edi 25529 .LBB3_908: 25530 test r8b, 1 25531 je .LBB3_910 25532 # %bb.909: 25533 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 25534 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 25535 pabsd xmm0, xmm0 25536 pabsd xmm1, xmm1 25537 movdqu xmmword ptr [rcx + 4*rdi], xmm0 25538 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 25539 .LBB3_910: 25540 cmp rsi, r9 25541 je .LBB3_923 25542 jmp .LBB3_911 25543 .LBB3_915: 25544 xor edi, edi 25545 .LBB3_916: 25546 test r8b, 1 25547 je .LBB3_918 25548 # %bb.917: 25549 movdqu xmm0, xmmword ptr [rdx + 4*rdi] 25550 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] 25551 pabsd xmm0, xmm0 25552 pabsd xmm1, xmm1 25553 movdqu xmmword ptr [rcx + 4*rdi], xmm0 25554 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 25555 .LBB3_918: 25556 cmp rsi, r9 25557 je .LBB3_923 25558 jmp .LBB3_919 25559 .Lfunc_end3: 25560 .size arithmetic_unary_same_types_sse4, .Lfunc_end3-arithmetic_unary_same_types_sse4 25561 # -- End function 25562 .section .rodata.cst16,"aM",@progbits,16 25563 .p2align 4 # -- Begin function arithmetic_unary_diff_type_sse4 25564 .LCPI4_0: 25565 .quad 0x8000000000000000 # double -0 25566 .quad 0x8000000000000000 # double -0 25567 .LCPI4_1: 25568 .quad 0x3ff0000000000000 # double 1 25569 .quad 0x3ff0000000000000 # double 1 25570 .LCPI4_3: 25571 .long 0x7fffffff # float NaN 25572 .long 0x7fffffff # float NaN 25573 .long 0x7fffffff # float NaN 25574 .long 0x7fffffff # float NaN 25575 .LCPI4_4: 25576 .long 0x80000000 # float -0 25577 .long 0x80000000 # float -0 25578 .long 0x80000000 # float -0 25579 .long 0x80000000 # float -0 25580 .LCPI4_7: 25581 .byte 0 # 0x0 25582 .byte 4 # 0x4 25583 .zero 1 25584 .zero 1 25585 .zero 1 25586 .zero 1 25587 .zero 1 25588 .zero 1 25589 .zero 1 25590 .zero 1 25591 .zero 1 25592 .zero 1 25593 .zero 1 25594 .zero 1 25595 .zero 
1 25596 .zero 1 25597 .LCPI4_8: 25598 .long 1 # 0x1 25599 .long 1 # 0x1 25600 .long 1 # 0x1 25601 .long 1 # 0x1 25602 .LCPI4_10: 25603 .long 0x4f000000 # float 2.14748365E+9 25604 .long 0x4f000000 # float 2.14748365E+9 25605 .long 0x4f000000 # float 2.14748365E+9 25606 .long 0x4f000000 # float 2.14748365E+9 25607 .LCPI4_11: 25608 .short 1 # 0x1 25609 .short 1 # 0x1 25610 .short 1 # 0x1 25611 .short 1 # 0x1 25612 .zero 2 25613 .zero 2 25614 .zero 2 25615 .zero 2 25616 .LCPI4_12: 25617 .byte 1 # 0x1 25618 .byte 1 # 0x1 25619 .byte 1 # 0x1 25620 .byte 1 # 0x1 25621 .zero 1 25622 .zero 1 25623 .zero 1 25624 .zero 1 25625 .zero 1 25626 .zero 1 25627 .zero 1 25628 .zero 1 25629 .zero 1 25630 .zero 1 25631 .zero 1 25632 .zero 1 25633 .LCPI4_15: 25634 .quad 1 # 0x1 25635 .quad 1 # 0x1 25636 .LCPI4_16: 25637 .long 1 # 0x1 25638 .long 1 # 0x1 25639 .zero 4 25640 .zero 4 25641 .LCPI4_17: 25642 .short 1 # 0x1 25643 .short 1 # 0x1 25644 .zero 2 25645 .zero 2 25646 .zero 2 25647 .zero 2 25648 .zero 2 25649 .zero 2 25650 .LCPI4_18: 25651 .byte 1 # 0x1 25652 .byte 1 # 0x1 25653 .zero 1 25654 .zero 1 25655 .zero 1 25656 .zero 1 25657 .zero 1 25658 .zero 1 25659 .zero 1 25660 .zero 1 25661 .zero 1 25662 .zero 1 25663 .zero 1 25664 .zero 1 25665 .zero 1 25666 .zero 1 25667 .LCPI4_19: 25668 .long 0x3f800000 # float 1 25669 .long 0x3f800000 # float 1 25670 .long 0x3f800000 # float 1 25671 .long 0x3f800000 # float 1 25672 .LCPI4_20: 25673 .short 1 # 0x1 25674 .short 1 # 0x1 25675 .short 1 # 0x1 25676 .short 1 # 0x1 25677 .short 1 # 0x1 25678 .short 1 # 0x1 25679 .short 1 # 0x1 25680 .short 1 # 0x1 25681 .LCPI4_21: 25682 .byte 1 # 0x1 25683 .byte 1 # 0x1 25684 .byte 1 # 0x1 25685 .byte 1 # 0x1 25686 .byte 1 # 0x1 25687 .byte 1 # 0x1 25688 .byte 1 # 0x1 25689 .byte 1 # 0x1 25690 .zero 1 25691 .zero 1 25692 .zero 1 25693 .zero 1 25694 .zero 1 25695 .zero 1 25696 .zero 1 25697 .zero 1 25698 .LCPI4_22: 25699 .zero 16,1 25700 .section .rodata.cst8,"aM",@progbits,8 25701 .p2align 3 25702 
.LCPI4_2: 25703 .quad 0x3ff0000000000000 # double 1 25704 .LCPI4_6: 25705 .quad 0x43e0000000000000 # double 9.2233720368547758E+18 25706 .LCPI4_13: 25707 .quad 0xbff0000000000000 # double -1 25708 .section .rodata.cst4,"aM",@progbits,4 25709 .p2align 2 25710 .LCPI4_5: 25711 .long 0x3f800000 # float 1 25712 .LCPI4_9: 25713 .long 0x5f000000 # float 9.22337203E+18 25714 .LCPI4_14: 25715 .long 0xbf800000 # float -1 25716 .text 25717 .globl arithmetic_unary_diff_type_sse4 25718 .p2align 4, 0x90 25719 .type arithmetic_unary_diff_type_sse4,@function 25720 arithmetic_unary_diff_type_sse4: # @arithmetic_unary_diff_type_sse4 25721 # %bb.0: 25722 push rbp 25723 mov rbp, rsp 25724 push r14 25725 push rbx 25726 and rsp, -8 25727 cmp dl, 20 25728 jne .LBB4_1655 25729 # %bb.1: 25730 cmp edi, 6 25731 jg .LBB4_14 25732 # %bb.2: 25733 cmp edi, 3 25734 jle .LBB4_26 25735 # %bb.3: 25736 cmp edi, 4 25737 je .LBB4_46 25738 # %bb.4: 25739 cmp edi, 5 25740 je .LBB4_54 25741 # %bb.5: 25742 cmp edi, 6 25743 jne .LBB4_1655 25744 # %bb.6: 25745 cmp esi, 6 25746 jg .LBB4_94 25747 # %bb.7: 25748 cmp esi, 3 25749 jle .LBB4_200 25750 # %bb.8: 25751 cmp esi, 4 25752 je .LBB4_303 25753 # %bb.9: 25754 cmp esi, 5 25755 je .LBB4_306 25756 # %bb.10: 25757 cmp esi, 6 25758 jne .LBB4_1655 25759 # %bb.11: 25760 test r9d, r9d 25761 jle .LBB4_1655 25762 # %bb.12: 25763 mov r10d, r9d 25764 cmp r9d, 8 25765 jb .LBB4_13 25766 # %bb.494: 25767 lea rdx, [rcx + 4*r10] 25768 cmp rdx, r8 25769 jbe .LBB4_496 25770 # %bb.495: 25771 lea rdx, [r8 + 4*r10] 25772 cmp rdx, rcx 25773 jbe .LBB4_496 25774 .LBB4_13: 25775 xor edx, edx 25776 .LBB4_1232: 25777 mov rsi, rdx 25778 not rsi 25779 add rsi, r10 25780 mov rdi, r10 25781 and rdi, 3 25782 je .LBB4_1234 25783 .LBB4_1233: # =>This Inner Loop Header: Depth=1 25784 xor eax, eax 25785 cmp dword ptr [rcx + 4*rdx], 0 25786 setne al 25787 mov dword ptr [r8 + 4*rdx], eax 25788 add rdx, 1 25789 add rdi, -1 25790 jne .LBB4_1233 25791 .LBB4_1234: 25792 cmp rsi, 3 25793 jb 
.LBB4_1655 25794 .LBB4_1235: # =>This Inner Loop Header: Depth=1 25795 xor eax, eax 25796 cmp dword ptr [rcx + 4*rdx], 0 25797 setne al 25798 mov dword ptr [r8 + 4*rdx], eax 25799 xor eax, eax 25800 cmp dword ptr [rcx + 4*rdx + 4], 0 25801 setne al 25802 mov dword ptr [r8 + 4*rdx + 4], eax 25803 xor eax, eax 25804 cmp dword ptr [rcx + 4*rdx + 8], 0 25805 setne al 25806 mov dword ptr [r8 + 4*rdx + 8], eax 25807 xor eax, eax 25808 cmp dword ptr [rcx + 4*rdx + 12], 0 25809 setne al 25810 mov dword ptr [r8 + 4*rdx + 12], eax 25811 add rdx, 4 25812 cmp r10, rdx 25813 jne .LBB4_1235 25814 jmp .LBB4_1655 25815 .LBB4_14: 25816 cmp edi, 8 25817 jle .LBB4_36 25818 # %bb.15: 25819 cmp edi, 9 25820 je .LBB4_62 25821 # %bb.16: 25822 cmp edi, 11 25823 je .LBB4_70 25824 # %bb.17: 25825 cmp edi, 12 25826 jne .LBB4_1655 25827 # %bb.18: 25828 cmp esi, 6 25829 jg .LBB4_106 25830 # %bb.19: 25831 cmp esi, 3 25832 jle .LBB4_205 25833 # %bb.20: 25834 cmp esi, 4 25835 je .LBB4_309 25836 # %bb.21: 25837 cmp esi, 5 25838 je .LBB4_312 25839 # %bb.22: 25840 cmp esi, 6 25841 jne .LBB4_1655 25842 # %bb.23: 25843 test r9d, r9d 25844 jle .LBB4_1655 25845 # %bb.24: 25846 mov r11d, r9d 25847 xor r10d, r10d 25848 cmp r9d, 4 25849 jae .LBB4_499 25850 # %bb.25: 25851 xor esi, esi 25852 jmp .LBB4_1110 25853 .LBB4_26: 25854 cmp edi, 2 25855 je .LBB4_78 25856 # %bb.27: 25857 cmp edi, 3 25858 jne .LBB4_1655 25859 # %bb.28: 25860 cmp esi, 6 25861 jg .LBB4_113 25862 # %bb.29: 25863 cmp esi, 3 25864 jle .LBB4_210 25865 # %bb.30: 25866 cmp esi, 4 25867 je .LBB4_315 25868 # %bb.31: 25869 cmp esi, 5 25870 je .LBB4_318 25871 # %bb.32: 25872 cmp esi, 6 25873 jne .LBB4_1655 25874 # %bb.33: 25875 test r9d, r9d 25876 jle .LBB4_1655 25877 # %bb.34: 25878 mov r10d, r9d 25879 cmp r9d, 8 25880 jb .LBB4_35 25881 # %bb.502: 25882 lea rdx, [rcx + r10] 25883 cmp rdx, r8 25884 jbe .LBB4_504 25885 # %bb.503: 25886 lea rdx, [r8 + 4*r10] 25887 cmp rdx, rcx 25888 jbe .LBB4_504 25889 .LBB4_35: 25890 xor edx, edx 25891 .LBB4_1240: 
25892 mov rsi, rdx 25893 not rsi 25894 test r10b, 1 25895 je .LBB4_1242 25896 # %bb.1241: 25897 mov r9b, byte ptr [rcx + rdx] 25898 xor edi, edi 25899 test r9b, r9b 25900 setne dil 25901 neg edi 25902 test r9b, r9b 25903 mov eax, 1 25904 cmovle eax, edi 25905 mov dword ptr [r8 + 4*rdx], eax 25906 or rdx, 1 25907 .LBB4_1242: 25908 add rsi, r10 25909 je .LBB4_1655 25910 # %bb.1243: 25911 mov esi, 1 25912 .LBB4_1244: # =>This Inner Loop Header: Depth=1 25913 movzx eax, byte ptr [rcx + rdx] 25914 xor edi, edi 25915 test al, al 25916 setne dil 25917 neg edi 25918 test al, al 25919 cmovg edi, esi 25920 mov dword ptr [r8 + 4*rdx], edi 25921 movzx eax, byte ptr [rcx + rdx + 1] 25922 xor edi, edi 25923 test al, al 25924 setne dil 25925 neg edi 25926 test al, al 25927 cmovg edi, esi 25928 mov dword ptr [r8 + 4*rdx + 4], edi 25929 add rdx, 2 25930 cmp r10, rdx 25931 jne .LBB4_1244 25932 jmp .LBB4_1655 25933 .LBB4_36: 25934 cmp edi, 7 25935 je .LBB4_86 25936 # %bb.37: 25937 cmp edi, 8 25938 jne .LBB4_1655 25939 # %bb.38: 25940 cmp esi, 6 25941 jg .LBB4_123 25942 # %bb.39: 25943 cmp esi, 3 25944 jle .LBB4_215 25945 # %bb.40: 25946 cmp esi, 4 25947 je .LBB4_321 25948 # %bb.41: 25949 cmp esi, 5 25950 je .LBB4_324 25951 # %bb.42: 25952 cmp esi, 6 25953 jne .LBB4_1655 25954 # %bb.43: 25955 test r9d, r9d 25956 jle .LBB4_1655 25957 # %bb.44: 25958 mov eax, r9d 25959 cmp r9d, 4 25960 jae .LBB4_507 25961 # %bb.45: 25962 xor edx, edx 25963 jmp .LBB4_998 25964 .LBB4_46: 25965 cmp esi, 6 25966 jg .LBB4_135 25967 # %bb.47: 25968 cmp esi, 3 25969 jle .LBB4_220 25970 # %bb.48: 25971 cmp esi, 4 25972 je .LBB4_327 25973 # %bb.49: 25974 cmp esi, 5 25975 je .LBB4_330 25976 # %bb.50: 25977 cmp esi, 6 25978 jne .LBB4_1655 25979 # %bb.51: 25980 test r9d, r9d 25981 jle .LBB4_1655 25982 # %bb.52: 25983 mov eax, r9d 25984 cmp r9d, 8 25985 jae .LBB4_510 25986 # %bb.53: 25987 xor edx, edx 25988 jmp .LBB4_1116 25989 .LBB4_54: 25990 cmp esi, 6 25991 jg .LBB4_147 25992 # %bb.55: 25993 cmp esi, 3 25994 jle 
.LBB4_225 25995 # %bb.56: 25996 cmp esi, 4 25997 je .LBB4_333 25998 # %bb.57: 25999 cmp esi, 5 26000 je .LBB4_336 26001 # %bb.58: 26002 cmp esi, 6 26003 jne .LBB4_1655 26004 # %bb.59: 26005 test r9d, r9d 26006 jle .LBB4_1655 26007 # %bb.60: 26008 mov r10d, r9d 26009 cmp r9d, 8 26010 jae .LBB4_513 26011 # %bb.61: 26012 xor edx, edx 26013 jmp .LBB4_1121 26014 .LBB4_62: 26015 cmp esi, 6 26016 jg .LBB4_157 26017 # %bb.63: 26018 cmp esi, 3 26019 jle .LBB4_230 26020 # %bb.64: 26021 cmp esi, 4 26022 je .LBB4_339 26023 # %bb.65: 26024 cmp esi, 5 26025 je .LBB4_342 26026 # %bb.66: 26027 cmp esi, 6 26028 jne .LBB4_1655 26029 # %bb.67: 26030 test r9d, r9d 26031 jle .LBB4_1655 26032 # %bb.68: 26033 mov r10d, r9d 26034 cmp r9d, 4 26035 jae .LBB4_516 26036 # %bb.69: 26037 xor edx, edx 26038 jmp .LBB4_1127 26039 .LBB4_70: 26040 cmp esi, 6 26041 jg .LBB4_167 26042 # %bb.71: 26043 cmp esi, 3 26044 jle .LBB4_235 26045 # %bb.72: 26046 cmp esi, 4 26047 je .LBB4_345 26048 # %bb.73: 26049 cmp esi, 5 26050 je .LBB4_348 26051 # %bb.74: 26052 cmp esi, 6 26053 jne .LBB4_1655 26054 # %bb.75: 26055 test r9d, r9d 26056 jle .LBB4_1655 26057 # %bb.76: 26058 mov eax, r9d 26059 cmp r9d, 4 26060 jae .LBB4_519 26061 # %bb.77: 26062 xor edx, edx 26063 jmp .LBB4_1133 26064 .LBB4_78: 26065 cmp esi, 6 26066 jg .LBB4_178 26067 # %bb.79: 26068 cmp esi, 3 26069 jle .LBB4_240 26070 # %bb.80: 26071 cmp esi, 4 26072 je .LBB4_351 26073 # %bb.81: 26074 cmp esi, 5 26075 je .LBB4_354 26076 # %bb.82: 26077 cmp esi, 6 26078 jne .LBB4_1655 26079 # %bb.83: 26080 test r9d, r9d 26081 jle .LBB4_1655 26082 # %bb.84: 26083 mov r10d, r9d 26084 cmp r9d, 8 26085 jb .LBB4_85 26086 # %bb.522: 26087 lea rdx, [rcx + r10] 26088 cmp rdx, r8 26089 jbe .LBB4_524 26090 # %bb.523: 26091 lea rdx, [r8 + 4*r10] 26092 cmp rdx, rcx 26093 jbe .LBB4_524 26094 .LBB4_85: 26095 xor edx, edx 26096 .LBB4_1249: 26097 mov rsi, rdx 26098 not rsi 26099 add rsi, r10 26100 mov rdi, r10 26101 and rdi, 3 26102 je .LBB4_1251 26103 .LBB4_1250: # =>This 
Inner Loop Header: Depth=1 26104 xor eax, eax 26105 cmp byte ptr [rcx + rdx], 0 26106 setne al 26107 mov dword ptr [r8 + 4*rdx], eax 26108 add rdx, 1 26109 add rdi, -1 26110 jne .LBB4_1250 26111 .LBB4_1251: 26112 cmp rsi, 3 26113 jb .LBB4_1655 26114 .LBB4_1252: # =>This Inner Loop Header: Depth=1 26115 xor eax, eax 26116 cmp byte ptr [rcx + rdx], 0 26117 setne al 26118 mov dword ptr [r8 + 4*rdx], eax 26119 xor eax, eax 26120 cmp byte ptr [rcx + rdx + 1], 0 26121 setne al 26122 mov dword ptr [r8 + 4*rdx + 4], eax 26123 xor eax, eax 26124 cmp byte ptr [rcx + rdx + 2], 0 26125 setne al 26126 mov dword ptr [r8 + 4*rdx + 8], eax 26127 xor eax, eax 26128 cmp byte ptr [rcx + rdx + 3], 0 26129 setne al 26130 mov dword ptr [r8 + 4*rdx + 12], eax 26131 add rdx, 4 26132 cmp r10, rdx 26133 jne .LBB4_1252 26134 jmp .LBB4_1655 26135 .LBB4_86: 26136 cmp esi, 6 26137 jg .LBB4_190 26138 # %bb.87: 26139 cmp esi, 3 26140 jle .LBB4_245 26141 # %bb.88: 26142 cmp esi, 4 26143 je .LBB4_357 26144 # %bb.89: 26145 cmp esi, 5 26146 je .LBB4_360 26147 # %bb.90: 26148 cmp esi, 6 26149 jne .LBB4_1655 26150 # %bb.91: 26151 test r9d, r9d 26152 jle .LBB4_1655 26153 # %bb.92: 26154 mov r11d, r9d 26155 cmp r9d, 8 26156 jb .LBB4_93 26157 # %bb.527: 26158 lea rdx, [rcx + 4*r11] 26159 cmp rdx, r8 26160 jbe .LBB4_529 26161 # %bb.528: 26162 lea rdx, [r8 + 4*r11] 26163 cmp rdx, rcx 26164 jbe .LBB4_529 26165 .LBB4_93: 26166 xor edx, edx 26167 .LBB4_1257: 26168 mov rsi, rdx 26169 not rsi 26170 test r11b, 1 26171 je .LBB4_1259 26172 # %bb.1258: 26173 mov r9d, dword ptr [rcx + 4*rdx] 26174 xor r10d, r10d 26175 test r9d, r9d 26176 setne r10b 26177 neg r10d 26178 test r9d, r9d 26179 mov edi, 1 26180 cmovle edi, r10d 26181 mov dword ptr [r8 + 4*rdx], edi 26182 or rdx, 1 26183 .LBB4_1259: 26184 add rsi, r11 26185 je .LBB4_1655 26186 # %bb.1260: 26187 mov esi, 1 26188 .LBB4_1261: # =>This Inner Loop Header: Depth=1 26189 mov edi, dword ptr [rcx + 4*rdx] 26190 xor eax, eax 26191 test edi, edi 26192 setne al 26193 
neg eax 26194 test edi, edi 26195 cmovg eax, esi 26196 mov dword ptr [r8 + 4*rdx], eax 26197 mov eax, dword ptr [rcx + 4*rdx + 4] 26198 xor edi, edi 26199 test eax, eax 26200 setne dil 26201 neg edi 26202 test eax, eax 26203 cmovg edi, esi 26204 mov dword ptr [r8 + 4*rdx + 4], edi 26205 add rdx, 2 26206 cmp r11, rdx 26207 jne .LBB4_1261 26208 jmp .LBB4_1655 26209 .LBB4_94: 26210 cmp esi, 8 26211 jle .LBB4_250 26212 # %bb.95: 26213 cmp esi, 9 26214 je .LBB4_363 26215 # %bb.96: 26216 cmp esi, 11 26217 je .LBB4_366 26218 # %bb.97: 26219 cmp esi, 12 26220 jne .LBB4_1655 26221 # %bb.98: 26222 test r9d, r9d 26223 jle .LBB4_1655 26224 # %bb.99: 26225 mov edx, r9d 26226 lea rsi, [rdx - 1] 26227 mov eax, edx 26228 and eax, 3 26229 cmp rsi, 3 26230 jae .LBB4_532 26231 # %bb.100: 26232 xor esi, esi 26233 .LBB4_101: 26234 test rax, rax 26235 je .LBB4_1655 26236 # %bb.102: 26237 lea rdx, [r8 + 8*rsi] 26238 lea rcx, [rcx + 4*rsi] 26239 xor esi, esi 26240 movsd xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero 26241 jmp .LBB4_104 26242 .LBB4_103: # in Loop: Header=BB4_104 Depth=1 26243 movsd qword ptr [rdx + 8*rsi], xmm1 26244 add rsi, 1 26245 cmp rax, rsi 26246 je .LBB4_1655 26247 .LBB4_104: # =>This Inner Loop Header: Depth=1 26248 cmp dword ptr [rcx + 4*rsi], 0 26249 movapd xmm1, xmm0 26250 jne .LBB4_103 26251 # %bb.105: # in Loop: Header=BB4_104 Depth=1 26252 xorpd xmm1, xmm1 26253 jmp .LBB4_103 26254 .LBB4_106: 26255 cmp esi, 8 26256 jle .LBB4_255 26257 # %bb.107: 26258 cmp esi, 9 26259 je .LBB4_369 26260 # %bb.108: 26261 cmp esi, 11 26262 je .LBB4_372 26263 # %bb.109: 26264 cmp esi, 12 26265 jne .LBB4_1655 26266 # %bb.110: 26267 test r9d, r9d 26268 jle .LBB4_1655 26269 # %bb.111: 26270 mov eax, r9d 26271 cmp r9d, 4 26272 jb .LBB4_112 26273 # %bb.542: 26274 lea rdx, [rcx + 8*rax] 26275 cmp rdx, r8 26276 jbe .LBB4_544 26277 # %bb.543: 26278 lea rdx, [r8 + 8*rax] 26279 cmp rdx, rcx 26280 jbe .LBB4_544 26281 .LBB4_112: 26282 xor edx, edx 26283 .LBB4_1266: 26284 mov rsi, rdx 
26285 not rsi 26286 test al, 1 26287 je .LBB4_1268 26288 # %bb.1267: 26289 movsd xmm0, qword ptr [rcx + 8*rdx] # xmm0 = mem[0],zero 26290 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 26291 andpd xmm1, xmm0 26292 movsd xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero 26293 orpd xmm2, xmm1 26294 xorpd xmm1, xmm1 26295 cmpeqsd xmm1, xmm0 26296 andnpd xmm1, xmm2 26297 movlpd qword ptr [r8 + 8*rdx], xmm1 26298 or rdx, 1 26299 .LBB4_1268: 26300 add rsi, rax 26301 je .LBB4_1655 26302 # %bb.1269: 26303 movapd xmm0, xmmword ptr [rip + .LCPI4_0] # xmm0 = [-0.0E+0,-0.0E+0] 26304 movsd xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero 26305 xorpd xmm2, xmm2 26306 .LBB4_1270: # =>This Inner Loop Header: Depth=1 26307 movsd xmm3, qword ptr [rcx + 8*rdx] # xmm3 = mem[0],zero 26308 movapd xmm4, xmm3 26309 andpd xmm4, xmm0 26310 orpd xmm4, xmm1 26311 cmpeqsd xmm3, xmm2 26312 andnpd xmm3, xmm4 26313 movlpd qword ptr [r8 + 8*rdx], xmm3 26314 movsd xmm3, qword ptr [rcx + 8*rdx + 8] # xmm3 = mem[0],zero 26315 movapd xmm4, xmm3 26316 andpd xmm4, xmm0 26317 orpd xmm4, xmm1 26318 cmpeqsd xmm3, xmm2 26319 andnpd xmm3, xmm4 26320 movlpd qword ptr [r8 + 8*rdx + 8], xmm3 26321 add rdx, 2 26322 cmp rax, rdx 26323 jne .LBB4_1270 26324 jmp .LBB4_1655 26325 .LBB4_113: 26326 cmp esi, 8 26327 jle .LBB4_260 26328 # %bb.114: 26329 cmp esi, 9 26330 je .LBB4_375 26331 # %bb.115: 26332 cmp esi, 11 26333 je .LBB4_378 26334 # %bb.116: 26335 cmp esi, 12 26336 jne .LBB4_1655 26337 # %bb.117: 26338 test r9d, r9d 26339 jle .LBB4_1655 26340 # %bb.118: 26341 mov edx, r9d 26342 cmp r9d, 1 26343 jne .LBB4_547 26344 # %bb.119: 26345 xor eax, eax 26346 .LBB4_120: 26347 test dl, 1 26348 je .LBB4_1655 26349 # %bb.121: 26350 cmp byte ptr [rcx + rax], 0 26351 jne .LBB4_982 26352 .LBB4_122: 26353 xorpd xmm0, xmm0 26354 jmp .LBB4_983 26355 .LBB4_123: 26356 cmp esi, 8 26357 jle .LBB4_265 26358 # %bb.124: 26359 cmp esi, 9 26360 je .LBB4_381 26361 # %bb.125: 26362 cmp esi, 11 26363 je 
.LBB4_384 26364 # %bb.126: 26365 cmp esi, 12 26366 jne .LBB4_1655 26367 # %bb.127: 26368 test r9d, r9d 26369 jle .LBB4_1655 26370 # %bb.128: 26371 mov edx, r9d 26372 lea rsi, [rdx - 1] 26373 mov eax, edx 26374 and eax, 3 26375 cmp rsi, 3 26376 jae .LBB4_557 26377 # %bb.129: 26378 xor esi, esi 26379 .LBB4_130: 26380 test rax, rax 26381 je .LBB4_1655 26382 # %bb.131: 26383 lea rdx, [r8 + 8*rsi] 26384 lea rcx, [rcx + 8*rsi] 26385 xor esi, esi 26386 movsd xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero 26387 jmp .LBB4_133 26388 .LBB4_132: # in Loop: Header=BB4_133 Depth=1 26389 movsd qword ptr [rdx + 8*rsi], xmm1 26390 add rsi, 1 26391 cmp rax, rsi 26392 je .LBB4_1655 26393 .LBB4_133: # =>This Inner Loop Header: Depth=1 26394 cmp qword ptr [rcx + 8*rsi], 0 26395 movapd xmm1, xmm0 26396 jne .LBB4_132 26397 # %bb.134: # in Loop: Header=BB4_133 Depth=1 26398 xorpd xmm1, xmm1 26399 jmp .LBB4_132 26400 .LBB4_135: 26401 cmp esi, 8 26402 jle .LBB4_270 26403 # %bb.136: 26404 cmp esi, 9 26405 je .LBB4_392 26406 # %bb.137: 26407 cmp esi, 11 26408 je .LBB4_395 26409 # %bb.138: 26410 cmp esi, 12 26411 jne .LBB4_1655 26412 # %bb.139: 26413 test r9d, r9d 26414 jle .LBB4_1655 26415 # %bb.140: 26416 mov edx, r9d 26417 lea rsi, [rdx - 1] 26418 mov eax, edx 26419 and eax, 3 26420 cmp rsi, 3 26421 jae .LBB4_567 26422 # %bb.141: 26423 xor esi, esi 26424 .LBB4_142: 26425 test rax, rax 26426 je .LBB4_1655 26427 # %bb.143: 26428 lea rdx, [r8 + 8*rsi] 26429 lea rcx, [rcx + 2*rsi] 26430 xor esi, esi 26431 movsd xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero 26432 jmp .LBB4_145 26433 .LBB4_144: # in Loop: Header=BB4_145 Depth=1 26434 movsd qword ptr [rdx + 8*rsi], xmm1 26435 add rsi, 1 26436 cmp rax, rsi 26437 je .LBB4_1655 26438 .LBB4_145: # =>This Inner Loop Header: Depth=1 26439 cmp word ptr [rcx + 2*rsi], 0 26440 movapd xmm1, xmm0 26441 jne .LBB4_144 26442 # %bb.146: # in Loop: Header=BB4_145 Depth=1 26443 xorpd xmm1, xmm1 26444 jmp .LBB4_144 26445 .LBB4_147: 26446 cmp esi, 8 
26447 jle .LBB4_275 26448 # %bb.148: 26449 cmp esi, 9 26450 je .LBB4_398 26451 # %bb.149: 26452 cmp esi, 11 26453 je .LBB4_401 26454 # %bb.150: 26455 cmp esi, 12 26456 jne .LBB4_1655 26457 # %bb.151: 26458 test r9d, r9d 26459 jle .LBB4_1655 26460 # %bb.152: 26461 mov edx, r9d 26462 cmp r9d, 1 26463 jne .LBB4_577 26464 # %bb.153: 26465 xor eax, eax 26466 .LBB4_154: 26467 test dl, 1 26468 je .LBB4_1655 26469 # %bb.155: 26470 cmp word ptr [rcx + 2*rax], 0 26471 je .LBB4_122 26472 .LBB4_982: 26473 movsd xmm0, qword ptr [rip + .LCPI4_13] # xmm0 = mem[0],zero 26474 .LBB4_983: 26475 jle .LBB4_985 26476 # %bb.984: 26477 movsd xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero 26478 .LBB4_985: 26479 movsd qword ptr [r8 + 8*rax], xmm0 26480 jmp .LBB4_1655 26481 .LBB4_157: 26482 cmp esi, 8 26483 jle .LBB4_280 26484 # %bb.158: 26485 cmp esi, 9 26486 je .LBB4_404 26487 # %bb.159: 26488 cmp esi, 11 26489 je .LBB4_407 26490 # %bb.160: 26491 cmp esi, 12 26492 jne .LBB4_1655 26493 # %bb.161: 26494 test r9d, r9d 26495 jle .LBB4_1655 26496 # %bb.162: 26497 mov edx, r9d 26498 cmp r9d, 1 26499 jne .LBB4_587 26500 # %bb.163: 26501 xor eax, eax 26502 .LBB4_164: 26503 test dl, 1 26504 je .LBB4_1655 26505 # %bb.165: 26506 cmp qword ptr [rcx + 8*rax], 0 26507 je .LBB4_122 26508 jmp .LBB4_982 26509 .LBB4_167: 26510 cmp esi, 8 26511 jle .LBB4_285 26512 # %bb.168: 26513 cmp esi, 9 26514 je .LBB4_413 26515 # %bb.169: 26516 cmp esi, 11 26517 je .LBB4_419 26518 # %bb.170: 26519 cmp esi, 12 26520 jne .LBB4_1655 26521 # %bb.171: 26522 test r9d, r9d 26523 jle .LBB4_1655 26524 # %bb.172: 26525 mov edx, r9d 26526 cmp r9d, 1 26527 jne .LBB4_597 26528 # %bb.173: 26529 xor eax, eax 26530 .LBB4_174: 26531 test dl, 1 26532 je .LBB4_1655 26533 # %bb.175: 26534 movss xmm1, dword ptr [rcx + 4*rax] # xmm1 = mem[0],zero,zero,zero 26535 xorps xmm0, xmm0 26536 xorps xmm2, xmm2 26537 ucomiss xmm2, xmm1 26538 je .LBB4_177 26539 # %bb.176: 26540 movmskps ecx, xmm1 26541 and ecx, 1 26542 neg ecx 26543 or ecx, 1 
26544 xorps xmm0, xmm0 26545 cvtsi2ss xmm0, ecx 26546 cvtss2sd xmm0, xmm0 26547 .LBB4_177: 26548 movsd qword ptr [r8 + 8*rax], xmm0 26549 jmp .LBB4_1655 26550 .LBB4_178: 26551 cmp esi, 8 26552 jle .LBB4_293 26553 # %bb.179: 26554 cmp esi, 9 26555 je .LBB4_422 26556 # %bb.180: 26557 cmp esi, 11 26558 je .LBB4_425 26559 # %bb.181: 26560 cmp esi, 12 26561 jne .LBB4_1655 26562 # %bb.182: 26563 test r9d, r9d 26564 jle .LBB4_1655 26565 # %bb.183: 26566 mov edx, r9d 26567 lea rsi, [rdx - 1] 26568 mov eax, edx 26569 and eax, 3 26570 cmp rsi, 3 26571 jae .LBB4_603 26572 # %bb.184: 26573 xor esi, esi 26574 .LBB4_185: 26575 test rax, rax 26576 je .LBB4_1655 26577 # %bb.186: 26578 lea rdx, [r8 + 8*rsi] 26579 add rcx, rsi 26580 xor esi, esi 26581 movsd xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero 26582 jmp .LBB4_188 26583 .LBB4_187: # in Loop: Header=BB4_188 Depth=1 26584 movsd qword ptr [rdx + 8*rsi], xmm1 26585 add rsi, 1 26586 cmp rax, rsi 26587 je .LBB4_1655 26588 .LBB4_188: # =>This Inner Loop Header: Depth=1 26589 cmp byte ptr [rcx + rsi], 0 26590 movapd xmm1, xmm0 26591 jne .LBB4_187 26592 # %bb.189: # in Loop: Header=BB4_188 Depth=1 26593 xorpd xmm1, xmm1 26594 jmp .LBB4_187 26595 .LBB4_190: 26596 cmp esi, 8 26597 jle .LBB4_298 26598 # %bb.191: 26599 cmp esi, 9 26600 je .LBB4_428 26601 # %bb.192: 26602 cmp esi, 11 26603 je .LBB4_431 26604 # %bb.193: 26605 cmp esi, 12 26606 jne .LBB4_1655 26607 # %bb.194: 26608 test r9d, r9d 26609 jle .LBB4_1655 26610 # %bb.195: 26611 mov edx, r9d 26612 cmp r9d, 1 26613 jne .LBB4_613 26614 # %bb.196: 26615 xor eax, eax 26616 .LBB4_197: 26617 test dl, 1 26618 je .LBB4_1655 26619 # %bb.198: 26620 cmp dword ptr [rcx + 4*rax], 0 26621 je .LBB4_122 26622 jmp .LBB4_982 26623 .LBB4_200: 26624 cmp esi, 2 26625 je .LBB4_434 26626 # %bb.201: 26627 cmp esi, 3 26628 jne .LBB4_1655 26629 # %bb.202: 26630 test r9d, r9d 26631 jle .LBB4_1655 26632 # %bb.203: 26633 mov eax, r9d 26634 cmp r9d, 8 26635 jb .LBB4_204 26636 # %bb.623: 26637 lea rdx, 
[rcx + 4*rax] 26638 cmp rdx, r8 26639 jbe .LBB4_625 26640 # %bb.624: 26641 lea rdx, [r8 + rax] 26642 cmp rdx, rcx 26643 jbe .LBB4_625 26644 .LBB4_204: 26645 xor edx, edx 26646 .LBB4_1275: 26647 mov rsi, rdx 26648 not rsi 26649 add rsi, rax 26650 mov rdi, rax 26651 and rdi, 3 26652 je .LBB4_1277 26653 .LBB4_1276: # =>This Inner Loop Header: Depth=1 26654 cmp dword ptr [rcx + 4*rdx], 0 26655 setne byte ptr [r8 + rdx] 26656 add rdx, 1 26657 add rdi, -1 26658 jne .LBB4_1276 26659 .LBB4_1277: 26660 cmp rsi, 3 26661 jb .LBB4_1655 26662 .LBB4_1278: # =>This Inner Loop Header: Depth=1 26663 cmp dword ptr [rcx + 4*rdx], 0 26664 setne byte ptr [r8 + rdx] 26665 cmp dword ptr [rcx + 4*rdx + 4], 0 26666 setne byte ptr [r8 + rdx + 1] 26667 cmp dword ptr [rcx + 4*rdx + 8], 0 26668 setne byte ptr [r8 + rdx + 2] 26669 cmp dword ptr [rcx + 4*rdx + 12], 0 26670 setne byte ptr [r8 + rdx + 3] 26671 add rdx, 4 26672 cmp rax, rdx 26673 jne .LBB4_1278 26674 jmp .LBB4_1655 26675 .LBB4_205: 26676 cmp esi, 2 26677 je .LBB4_437 26678 # %bb.206: 26679 cmp esi, 3 26680 jne .LBB4_1655 26681 # %bb.207: 26682 test r9d, r9d 26683 jle .LBB4_1655 26684 # %bb.208: 26685 mov eax, r9d 26686 cmp r9d, 4 26687 jb .LBB4_209 26688 # %bb.628: 26689 lea rdx, [rcx + 8*rax] 26690 cmp rdx, r8 26691 jbe .LBB4_630 26692 # %bb.629: 26693 lea rdx, [r8 + rax] 26694 cmp rdx, rcx 26695 jbe .LBB4_630 26696 .LBB4_209: 26697 xor edx, edx 26698 .LBB4_1283: 26699 mov rsi, rdx 26700 not rsi 26701 test al, 1 26702 je .LBB4_1285 26703 # %bb.1284: 26704 movsd xmm0, qword ptr [rcx + 8*rdx] # xmm0 = mem[0],zero 26705 xor r9d, r9d 26706 pxor xmm1, xmm1 26707 ucomisd xmm1, xmm0 26708 andpd xmm0, xmmword ptr [rip + .LCPI4_0] 26709 movsd xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero 26710 orpd xmm1, xmm0 26711 cvttsd2si edi, xmm1 26712 cmove edi, r9d 26713 mov byte ptr [r8 + rdx], dil 26714 or rdx, 1 26715 .LBB4_1285: 26716 add rsi, rax 26717 je .LBB4_1655 26718 # %bb.1286: 26719 xor esi, esi 26720 xorpd xmm0, xmm0 26721 
movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 26722 movsd xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero 26723 .LBB4_1287: # =>This Inner Loop Header: Depth=1 26724 movsd xmm3, qword ptr [rcx + 8*rdx] # xmm3 = mem[0],zero 26725 ucomisd xmm0, xmm3 26726 andpd xmm3, xmm1 26727 orpd xmm3, xmm2 26728 cvttsd2si edi, xmm3 26729 cmove edi, esi 26730 mov byte ptr [r8 + rdx], dil 26731 movsd xmm3, qword ptr [rcx + 8*rdx + 8] # xmm3 = mem[0],zero 26732 ucomisd xmm0, xmm3 26733 andpd xmm3, xmm1 26734 orpd xmm3, xmm2 26735 cvttsd2si edi, xmm3 26736 cmove edi, esi 26737 mov byte ptr [r8 + rdx + 1], dil 26738 add rdx, 2 26739 cmp rax, rdx 26740 jne .LBB4_1287 26741 jmp .LBB4_1655 26742 .LBB4_210: 26743 cmp esi, 2 26744 je .LBB4_440 26745 # %bb.211: 26746 cmp esi, 3 26747 jne .LBB4_1655 26748 # %bb.212: 26749 test r9d, r9d 26750 jle .LBB4_1655 26751 # %bb.213: 26752 mov r10d, r9d 26753 cmp r9d, 32 26754 jb .LBB4_214 26755 # %bb.633: 26756 lea rdx, [rcx + r10] 26757 cmp rdx, r8 26758 jbe .LBB4_635 26759 # %bb.634: 26760 lea rdx, [r8 + r10] 26761 cmp rdx, rcx 26762 jbe .LBB4_635 26763 .LBB4_214: 26764 xor esi, esi 26765 .LBB4_1292: 26766 mov rax, rsi 26767 not rax 26768 test r10b, 1 26769 je .LBB4_1294 26770 # %bb.1293: 26771 mov dil, byte ptr [rcx + rsi] 26772 test dil, dil 26773 setne r9b 26774 neg r9b 26775 test dil, dil 26776 movzx r9d, r9b 26777 mov edi, 1 26778 cmovle edi, r9d 26779 mov byte ptr [r8 + rsi], dil 26780 or rsi, 1 26781 .LBB4_1294: 26782 add rax, r10 26783 je .LBB4_1655 26784 # %bb.1295: 26785 mov edi, 1 26786 .LBB4_1296: # =>This Inner Loop Header: Depth=1 26787 movzx eax, byte ptr [rcx + rsi] 26788 test al, al 26789 setne dl 26790 neg dl 26791 test al, al 26792 movzx eax, dl 26793 cmovg eax, edi 26794 mov byte ptr [r8 + rsi], al 26795 movzx eax, byte ptr [rcx + rsi + 1] 26796 test al, al 26797 setne dl 26798 neg dl 26799 test al, al 26800 movzx eax, dl 26801 cmovg eax, edi 26802 mov byte ptr [r8 + rsi + 1], al 26803 add rsi, 2 
26804 cmp r10, rsi 26805 jne .LBB4_1296 26806 jmp .LBB4_1655 26807 .LBB4_215: 26808 cmp esi, 2 26809 je .LBB4_443 26810 # %bb.216: 26811 cmp esi, 3 26812 jne .LBB4_1655 26813 # %bb.217: 26814 test r9d, r9d 26815 jle .LBB4_1655 26816 # %bb.218: 26817 mov eax, r9d 26818 cmp r9d, 4 26819 jb .LBB4_219 26820 # %bb.638: 26821 lea rdx, [rcx + 8*rax] 26822 cmp rdx, r8 26823 jbe .LBB4_640 26824 # %bb.639: 26825 lea rdx, [r8 + rax] 26826 cmp rdx, rcx 26827 jbe .LBB4_640 26828 .LBB4_219: 26829 xor edx, edx 26830 .LBB4_1301: 26831 mov rsi, rdx 26832 not rsi 26833 add rsi, rax 26834 mov rdi, rax 26835 and rdi, 3 26836 je .LBB4_1303 26837 .LBB4_1302: # =>This Inner Loop Header: Depth=1 26838 cmp qword ptr [rcx + 8*rdx], 0 26839 setne byte ptr [r8 + rdx] 26840 add rdx, 1 26841 add rdi, -1 26842 jne .LBB4_1302 26843 .LBB4_1303: 26844 cmp rsi, 3 26845 jb .LBB4_1655 26846 .LBB4_1304: # =>This Inner Loop Header: Depth=1 26847 cmp qword ptr [rcx + 8*rdx], 0 26848 setne byte ptr [r8 + rdx] 26849 cmp qword ptr [rcx + 8*rdx + 8], 0 26850 setne byte ptr [r8 + rdx + 1] 26851 cmp qword ptr [rcx + 8*rdx + 16], 0 26852 setne byte ptr [r8 + rdx + 2] 26853 cmp qword ptr [rcx + 8*rdx + 24], 0 26854 setne byte ptr [r8 + rdx + 3] 26855 add rdx, 4 26856 cmp rax, rdx 26857 jne .LBB4_1304 26858 jmp .LBB4_1655 26859 .LBB4_220: 26860 cmp esi, 2 26861 je .LBB4_446 26862 # %bb.221: 26863 cmp esi, 3 26864 jne .LBB4_1655 26865 # %bb.222: 26866 test r9d, r9d 26867 jle .LBB4_1655 26868 # %bb.223: 26869 mov eax, r9d 26870 cmp r9d, 16 26871 jb .LBB4_224 26872 # %bb.643: 26873 lea rdx, [rcx + 2*rax] 26874 cmp rdx, r8 26875 jbe .LBB4_645 26876 # %bb.644: 26877 lea rdx, [r8 + rax] 26878 cmp rdx, rcx 26879 jbe .LBB4_645 26880 .LBB4_224: 26881 xor edx, edx 26882 .LBB4_1309: 26883 mov rsi, rdx 26884 not rsi 26885 add rsi, rax 26886 mov rdi, rax 26887 and rdi, 3 26888 je .LBB4_1311 26889 .LBB4_1310: # =>This Inner Loop Header: Depth=1 26890 cmp word ptr [rcx + 2*rdx], 0 26891 setne byte ptr [r8 + rdx] 26892 add rdx, 
1 26893 add rdi, -1 26894 jne .LBB4_1310 26895 .LBB4_1311: 26896 cmp rsi, 3 26897 jb .LBB4_1655 26898 .LBB4_1312: # =>This Inner Loop Header: Depth=1 26899 cmp word ptr [rcx + 2*rdx], 0 26900 setne byte ptr [r8 + rdx] 26901 cmp word ptr [rcx + 2*rdx + 2], 0 26902 setne byte ptr [r8 + rdx + 1] 26903 cmp word ptr [rcx + 2*rdx + 4], 0 26904 setne byte ptr [r8 + rdx + 2] 26905 cmp word ptr [rcx + 2*rdx + 6], 0 26906 setne byte ptr [r8 + rdx + 3] 26907 add rdx, 4 26908 cmp rax, rdx 26909 jne .LBB4_1312 26910 jmp .LBB4_1655 26911 .LBB4_225: 26912 cmp esi, 2 26913 je .LBB4_449 26914 # %bb.226: 26915 cmp esi, 3 26916 jne .LBB4_1655 26917 # %bb.227: 26918 test r9d, r9d 26919 jle .LBB4_1655 26920 # %bb.228: 26921 mov r10d, r9d 26922 cmp r9d, 16 26923 jb .LBB4_229 26924 # %bb.648: 26925 lea rdx, [rcx + 2*r10] 26926 cmp rdx, r8 26927 jbe .LBB4_650 26928 # %bb.649: 26929 lea rdx, [r8 + r10] 26930 cmp rdx, rcx 26931 jbe .LBB4_650 26932 .LBB4_229: 26933 xor esi, esi 26934 .LBB4_1317: 26935 mov rax, rsi 26936 not rax 26937 test r10b, 1 26938 je .LBB4_1319 26939 # %bb.1318: 26940 movzx edi, word ptr [rcx + 2*rsi] 26941 test di, di 26942 setne r9b 26943 neg r9b 26944 test di, di 26945 movzx r9d, r9b 26946 mov edi, 1 26947 cmovle edi, r9d 26948 mov byte ptr [r8 + rsi], dil 26949 or rsi, 1 26950 .LBB4_1319: 26951 add rax, r10 26952 je .LBB4_1655 26953 # %bb.1320: 26954 mov r9d, 1 26955 .LBB4_1321: # =>This Inner Loop Header: Depth=1 26956 movzx edi, word ptr [rcx + 2*rsi] 26957 test di, di 26958 setne al 26959 neg al 26960 test di, di 26961 movzx eax, al 26962 cmovg eax, r9d 26963 mov byte ptr [r8 + rsi], al 26964 movzx eax, word ptr [rcx + 2*rsi + 2] 26965 test ax, ax 26966 setne dl 26967 neg dl 26968 test ax, ax 26969 movzx eax, dl 26970 cmovg eax, r9d 26971 mov byte ptr [r8 + rsi + 1], al 26972 add rsi, 2 26973 cmp r10, rsi 26974 jne .LBB4_1321 26975 jmp .LBB4_1655 26976 .LBB4_230: 26977 cmp esi, 2 26978 je .LBB4_452 26979 # %bb.231: 26980 cmp esi, 3 26981 jne .LBB4_1655 26982 # 
%bb.232: 26983 test r9d, r9d 26984 jle .LBB4_1655 26985 # %bb.233: 26986 mov r10d, r9d 26987 cmp r9d, 4 26988 jb .LBB4_234 26989 # %bb.653: 26990 lea rdx, [rcx + 8*r10] 26991 cmp rdx, r8 26992 jbe .LBB4_655 26993 # %bb.654: 26994 lea rdx, [r8 + r10] 26995 cmp rdx, rcx 26996 jbe .LBB4_655 26997 .LBB4_234: 26998 xor esi, esi 26999 .LBB4_1326: 27000 mov rdx, rsi 27001 not rdx 27002 test r10b, 1 27003 je .LBB4_1328 27004 # %bb.1327: 27005 mov rdi, qword ptr [rcx + 8*rsi] 27006 test rdi, rdi 27007 setne al 27008 neg al 27009 test rdi, rdi 27010 movzx eax, al 27011 mov edi, 1 27012 cmovle edi, eax 27013 mov byte ptr [r8 + rsi], dil 27014 or rsi, 1 27015 .LBB4_1328: 27016 add rdx, r10 27017 je .LBB4_1655 27018 # %bb.1329: 27019 mov edi, 1 27020 .LBB4_1330: # =>This Inner Loop Header: Depth=1 27021 mov rax, qword ptr [rcx + 8*rsi] 27022 test rax, rax 27023 setne dl 27024 neg dl 27025 test rax, rax 27026 movzx eax, dl 27027 cmovg eax, edi 27028 mov byte ptr [r8 + rsi], al 27029 mov rax, qword ptr [rcx + 8*rsi + 8] 27030 test rax, rax 27031 setne dl 27032 neg dl 27033 test rax, rax 27034 movzx eax, dl 27035 cmovg eax, edi 27036 mov byte ptr [r8 + rsi + 1], al 27037 add rsi, 2 27038 cmp r10, rsi 27039 jne .LBB4_1330 27040 jmp .LBB4_1655 27041 .LBB4_235: 27042 cmp esi, 2 27043 je .LBB4_455 27044 # %bb.236: 27045 cmp esi, 3 27046 jne .LBB4_1655 27047 # %bb.237: 27048 test r9d, r9d 27049 jle .LBB4_1655 27050 # %bb.238: 27051 mov r10d, r9d 27052 cmp r9d, 8 27053 jb .LBB4_239 27054 # %bb.658: 27055 lea rdx, [rcx + 4*r10] 27056 cmp rdx, r8 27057 jbe .LBB4_660 27058 # %bb.659: 27059 lea rdx, [r8 + r10] 27060 cmp rdx, rcx 27061 jbe .LBB4_660 27062 .LBB4_239: 27063 xor edx, edx 27064 .LBB4_1335: 27065 mov rsi, rdx 27066 not rsi 27067 test r10b, 1 27068 je .LBB4_1337 27069 # %bb.1336: 27070 movd xmm0, dword ptr [rcx + 4*rdx] # xmm0 = mem[0],zero,zero,zero 27071 movd edi, xmm0 27072 test edi, edi 27073 setns al 27074 add al, al 27075 add al, -1 27076 xor edi, edi 27077 pxor xmm1, xmm1 
27078 ucomiss xmm1, xmm0 27079 movzx eax, al 27080 cmove eax, edi 27081 mov byte ptr [r8 + rdx], al 27082 or rdx, 1 27083 .LBB4_1337: 27084 add rsi, r10 27085 je .LBB4_1655 27086 # %bb.1338: 27087 xor esi, esi 27088 xorps xmm0, xmm0 27089 .LBB4_1339: # =>This Inner Loop Header: Depth=1 27090 movd xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero 27091 movd eax, xmm1 27092 test eax, eax 27093 setns al 27094 add al, al 27095 add al, -1 27096 ucomiss xmm0, xmm1 27097 movzx eax, al 27098 cmove eax, esi 27099 mov byte ptr [r8 + rdx], al 27100 movd xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero 27101 movd eax, xmm1 27102 test eax, eax 27103 setns al 27104 add al, al 27105 add al, -1 27106 ucomiss xmm0, xmm1 27107 movzx eax, al 27108 cmove eax, esi 27109 mov byte ptr [r8 + rdx + 1], al 27110 add rdx, 2 27111 cmp r10, rdx 27112 jne .LBB4_1339 27113 jmp .LBB4_1655 27114 .LBB4_240: 27115 cmp esi, 2 27116 je .LBB4_458 27117 # %bb.241: 27118 cmp esi, 3 27119 jne .LBB4_1655 27120 # %bb.242: 27121 test r9d, r9d 27122 jle .LBB4_1655 27123 # %bb.243: 27124 mov eax, r9d 27125 cmp r9d, 32 27126 jb .LBB4_244 27127 # %bb.663: 27128 lea rdx, [rcx + rax] 27129 cmp rdx, r8 27130 jbe .LBB4_665 27131 # %bb.664: 27132 lea rdx, [r8 + rax] 27133 cmp rdx, rcx 27134 jbe .LBB4_665 27135 .LBB4_244: 27136 xor edx, edx 27137 .LBB4_1344: 27138 mov rsi, rdx 27139 not rsi 27140 add rsi, rax 27141 mov rdi, rax 27142 and rdi, 3 27143 je .LBB4_1346 27144 .LBB4_1345: # =>This Inner Loop Header: Depth=1 27145 cmp byte ptr [rcx + rdx], 0 27146 setne byte ptr [r8 + rdx] 27147 add rdx, 1 27148 add rdi, -1 27149 jne .LBB4_1345 27150 .LBB4_1346: 27151 cmp rsi, 3 27152 jb .LBB4_1655 27153 .LBB4_1347: # =>This Inner Loop Header: Depth=1 27154 cmp byte ptr [rcx + rdx], 0 27155 setne byte ptr [r8 + rdx] 27156 cmp byte ptr [rcx + rdx + 1], 0 27157 setne byte ptr [r8 + rdx + 1] 27158 cmp byte ptr [rcx + rdx + 2], 0 27159 setne byte ptr [r8 + rdx + 2] 27160 cmp byte ptr [rcx + rdx + 3], 0 
27161 setne byte ptr [r8 + rdx + 3] 27162 add rdx, 4 27163 cmp rax, rdx 27164 jne .LBB4_1347 27165 jmp .LBB4_1655 27166 .LBB4_245: 27167 cmp esi, 2 27168 je .LBB4_461 27169 # %bb.246: 27170 cmp esi, 3 27171 jne .LBB4_1655 27172 # %bb.247: 27173 test r9d, r9d 27174 jle .LBB4_1655 27175 # %bb.248: 27176 mov r10d, r9d 27177 cmp r9d, 8 27178 jb .LBB4_249 27179 # %bb.668: 27180 lea rdx, [rcx + 4*r10] 27181 cmp rdx, r8 27182 jbe .LBB4_670 27183 # %bb.669: 27184 lea rdx, [r8 + r10] 27185 cmp rdx, rcx 27186 jbe .LBB4_670 27187 .LBB4_249: 27188 xor esi, esi 27189 .LBB4_1352: 27190 mov rax, rsi 27191 not rax 27192 test r10b, 1 27193 je .LBB4_1354 27194 # %bb.1353: 27195 mov edi, dword ptr [rcx + 4*rsi] 27196 test edi, edi 27197 setne r9b 27198 neg r9b 27199 test edi, edi 27200 movzx r9d, r9b 27201 mov edi, 1 27202 cmovle edi, r9d 27203 mov byte ptr [r8 + rsi], dil 27204 or rsi, 1 27205 .LBB4_1354: 27206 add rax, r10 27207 je .LBB4_1655 27208 # %bb.1355: 27209 mov r9d, 1 27210 .LBB4_1356: # =>This Inner Loop Header: Depth=1 27211 mov edi, dword ptr [rcx + 4*rsi] 27212 test edi, edi 27213 setne al 27214 neg al 27215 test edi, edi 27216 movzx eax, al 27217 cmovg eax, r9d 27218 mov byte ptr [r8 + rsi], al 27219 mov eax, dword ptr [rcx + 4*rsi + 4] 27220 test eax, eax 27221 setne dl 27222 neg dl 27223 test eax, eax 27224 movzx eax, dl 27225 cmovg eax, r9d 27226 mov byte ptr [r8 + rsi + 1], al 27227 add rsi, 2 27228 cmp r10, rsi 27229 jne .LBB4_1356 27230 jmp .LBB4_1655 27231 .LBB4_250: 27232 cmp esi, 7 27233 je .LBB4_464 27234 # %bb.251: 27235 cmp esi, 8 27236 jne .LBB4_1655 27237 # %bb.252: 27238 test r9d, r9d 27239 jle .LBB4_1655 27240 # %bb.253: 27241 mov eax, r9d 27242 cmp r9d, 4 27243 jae .LBB4_673 27244 # %bb.254: 27245 xor edx, edx 27246 jmp .LBB4_1003 27247 .LBB4_255: 27248 cmp esi, 7 27249 je .LBB4_467 27250 # %bb.256: 27251 cmp esi, 8 27252 jne .LBB4_1655 27253 # %bb.257: 27254 test r9d, r9d 27255 jle .LBB4_1655 27256 # %bb.258: 27257 mov r10d, r9d 27258 movabs r11, 
-9223372036854775808 27259 cmp r9d, 1 27260 jne .LBB4_676 27261 # %bb.259: 27262 xor esi, esi 27263 jmp .LBB4_1008 27264 .LBB4_260: 27265 cmp esi, 7 27266 je .LBB4_470 27267 # %bb.261: 27268 cmp esi, 8 27269 jne .LBB4_1655 27270 # %bb.262: 27271 test r9d, r9d 27272 jle .LBB4_1655 27273 # %bb.263: 27274 mov r10d, r9d 27275 cmp r9d, 4 27276 jb .LBB4_264 27277 # %bb.679: 27278 lea rdx, [rcx + r10] 27279 cmp rdx, r8 27280 jbe .LBB4_681 27281 # %bb.680: 27282 lea rdx, [r8 + 8*r10] 27283 cmp rdx, rcx 27284 jbe .LBB4_681 27285 .LBB4_264: 27286 xor edx, edx 27287 .LBB4_1361: 27288 mov rsi, rdx 27289 not rsi 27290 test r10b, 1 27291 je .LBB4_1363 27292 # %bb.1362: 27293 mov al, byte ptr [rcx + rdx] 27294 xor edi, edi 27295 test al, al 27296 setne dil 27297 neg rdi 27298 test al, al 27299 mov eax, 1 27300 cmovle rax, rdi 27301 mov qword ptr [r8 + 8*rdx], rax 27302 or rdx, 1 27303 .LBB4_1363: 27304 add rsi, r10 27305 je .LBB4_1655 27306 # %bb.1364: 27307 mov esi, 1 27308 .LBB4_1365: # =>This Inner Loop Header: Depth=1 27309 movzx eax, byte ptr [rcx + rdx] 27310 xor edi, edi 27311 test al, al 27312 setne dil 27313 neg rdi 27314 test al, al 27315 cmovg rdi, rsi 27316 mov qword ptr [r8 + 8*rdx], rdi 27317 movzx eax, byte ptr [rcx + rdx + 1] 27318 xor edi, edi 27319 test al, al 27320 setne dil 27321 neg rdi 27322 test al, al 27323 cmovg rdi, rsi 27324 mov qword ptr [r8 + 8*rdx + 8], rdi 27325 add rdx, 2 27326 cmp r10, rdx 27327 jne .LBB4_1365 27328 jmp .LBB4_1655 27329 .LBB4_265: 27330 cmp esi, 7 27331 je .LBB4_473 27332 # %bb.266: 27333 cmp esi, 8 27334 jne .LBB4_1655 27335 # %bb.267: 27336 test r9d, r9d 27337 jle .LBB4_1655 27338 # %bb.268: 27339 mov r10d, r9d 27340 cmp r9d, 4 27341 jb .LBB4_269 27342 # %bb.684: 27343 lea rdx, [rcx + 8*r10] 27344 cmp rdx, r8 27345 jbe .LBB4_686 27346 # %bb.685: 27347 lea rdx, [r8 + 8*r10] 27348 cmp rdx, rcx 27349 jbe .LBB4_686 27350 .LBB4_269: 27351 xor edx, edx 27352 .LBB4_1370: 27353 mov rsi, rdx 27354 not rsi 27355 add rsi, r10 27356 mov 
rdi, r10 27357 and rdi, 3 27358 je .LBB4_1372 27359 .LBB4_1371: # =>This Inner Loop Header: Depth=1 27360 xor eax, eax 27361 cmp qword ptr [rcx + 8*rdx], 0 27362 setne al 27363 mov qword ptr [r8 + 8*rdx], rax 27364 add rdx, 1 27365 add rdi, -1 27366 jne .LBB4_1371 27367 .LBB4_1372: 27368 cmp rsi, 3 27369 jb .LBB4_1655 27370 .LBB4_1373: # =>This Inner Loop Header: Depth=1 27371 xor eax, eax 27372 cmp qword ptr [rcx + 8*rdx], 0 27373 setne al 27374 mov qword ptr [r8 + 8*rdx], rax 27375 xor eax, eax 27376 cmp qword ptr [rcx + 8*rdx + 8], 0 27377 setne al 27378 mov qword ptr [r8 + 8*rdx + 8], rax 27379 xor eax, eax 27380 cmp qword ptr [rcx + 8*rdx + 16], 0 27381 setne al 27382 mov qword ptr [r8 + 8*rdx + 16], rax 27383 xor eax, eax 27384 cmp qword ptr [rcx + 8*rdx + 24], 0 27385 setne al 27386 mov qword ptr [r8 + 8*rdx + 24], rax 27387 add rdx, 4 27388 cmp r10, rdx 27389 jne .LBB4_1373 27390 jmp .LBB4_1655 27391 .LBB4_270: 27392 cmp esi, 7 27393 je .LBB4_476 27394 # %bb.271: 27395 cmp esi, 8 27396 jne .LBB4_1655 27397 # %bb.272: 27398 test r9d, r9d 27399 jle .LBB4_1655 27400 # %bb.273: 27401 mov eax, r9d 27402 cmp r9d, 4 27403 jae .LBB4_689 27404 # %bb.274: 27405 xor edx, edx 27406 jmp .LBB4_1014 27407 .LBB4_275: 27408 cmp esi, 7 27409 je .LBB4_479 27410 # %bb.276: 27411 cmp esi, 8 27412 jne .LBB4_1655 27413 # %bb.277: 27414 test r9d, r9d 27415 jle .LBB4_1655 27416 # %bb.278: 27417 mov r10d, r9d 27418 cmp r9d, 4 27419 jae .LBB4_692 27420 # %bb.279: 27421 xor edx, edx 27422 jmp .LBB4_1019 27423 .LBB4_280: 27424 cmp esi, 7 27425 je .LBB4_482 27426 # %bb.281: 27427 cmp esi, 8 27428 jne .LBB4_1655 27429 # %bb.282: 27430 test r9d, r9d 27431 jle .LBB4_1655 27432 # %bb.283: 27433 mov r11d, r9d 27434 cmp r9d, 4 27435 jb .LBB4_284 27436 # %bb.695: 27437 lea rdx, [rcx + 8*r11] 27438 cmp rdx, r8 27439 jbe .LBB4_697 27440 # %bb.696: 27441 lea rdx, [r8 + 8*r11] 27442 cmp rdx, rcx 27443 jbe .LBB4_697 27444 .LBB4_284: 27445 xor edx, edx 27446 .LBB4_1378: 27447 mov rsi, rdx 27448 not 
rsi 27449 test r11b, 1 27450 je .LBB4_1380 27451 # %bb.1379: 27452 mov r9, qword ptr [rcx + 8*rdx] 27453 xor r10d, r10d 27454 test r9, r9 27455 setne r10b 27456 neg r10 27457 test r9, r9 27458 mov edi, 1 27459 cmovle rdi, r10 27460 mov qword ptr [r8 + 8*rdx], rdi 27461 or rdx, 1 27462 .LBB4_1380: 27463 add rsi, r11 27464 je .LBB4_1655 27465 # %bb.1381: 27466 mov esi, 1 27467 .LBB4_1382: # =>This Inner Loop Header: Depth=1 27468 mov rdi, qword ptr [rcx + 8*rdx] 27469 xor eax, eax 27470 test rdi, rdi 27471 setne al 27472 neg rax 27473 test rdi, rdi 27474 cmovg rax, rsi 27475 mov qword ptr [r8 + 8*rdx], rax 27476 mov rax, qword ptr [rcx + 8*rdx + 8] 27477 xor edi, edi 27478 test rax, rax 27479 setne dil 27480 neg rdi 27481 test rax, rax 27482 cmovg rdi, rsi 27483 mov qword ptr [r8 + 8*rdx + 8], rdi 27484 add rdx, 2 27485 cmp r11, rdx 27486 jne .LBB4_1382 27487 jmp .LBB4_1655 27488 .LBB4_285: 27489 cmp esi, 7 27490 je .LBB4_485 27491 # %bb.286: 27492 cmp esi, 8 27493 jne .LBB4_1655 27494 # %bb.287: 27495 test r9d, r9d 27496 jle .LBB4_1655 27497 # %bb.288: 27498 mov r10d, r9d 27499 cmp r9d, 1 27500 jne .LBB4_700 27501 # %bb.289: 27502 xor eax, eax 27503 jmp .LBB4_290 27504 .LBB4_293: 27505 cmp esi, 7 27506 je .LBB4_488 27507 # %bb.294: 27508 cmp esi, 8 27509 jne .LBB4_1655 27510 # %bb.295: 27511 test r9d, r9d 27512 jle .LBB4_1655 27513 # %bb.296: 27514 mov r10d, r9d 27515 cmp r9d, 4 27516 jb .LBB4_297 27517 # %bb.708: 27518 lea rdx, [rcx + r10] 27519 cmp rdx, r8 27520 jbe .LBB4_710 27521 # %bb.709: 27522 lea rdx, [r8 + 8*r10] 27523 cmp rdx, rcx 27524 jbe .LBB4_710 27525 .LBB4_297: 27526 xor edx, edx 27527 .LBB4_1387: 27528 mov rsi, rdx 27529 not rsi 27530 add rsi, r10 27531 mov rdi, r10 27532 and rdi, 3 27533 je .LBB4_1389 27534 .LBB4_1388: # =>This Inner Loop Header: Depth=1 27535 xor eax, eax 27536 cmp byte ptr [rcx + rdx], 0 27537 setne al 27538 mov qword ptr [r8 + 8*rdx], rax 27539 add rdx, 1 27540 add rdi, -1 27541 jne .LBB4_1388 27542 .LBB4_1389: 27543 cmp rsi, 3 
27544 jb .LBB4_1655 27545 .LBB4_1390: # =>This Inner Loop Header: Depth=1 27546 xor eax, eax 27547 cmp byte ptr [rcx + rdx], 0 27548 setne al 27549 mov qword ptr [r8 + 8*rdx], rax 27550 xor eax, eax 27551 cmp byte ptr [rcx + rdx + 1], 0 27552 setne al 27553 mov qword ptr [r8 + 8*rdx + 8], rax 27554 xor eax, eax 27555 cmp byte ptr [rcx + rdx + 2], 0 27556 setne al 27557 mov qword ptr [r8 + 8*rdx + 16], rax 27558 xor eax, eax 27559 cmp byte ptr [rcx + rdx + 3], 0 27560 setne al 27561 mov qword ptr [r8 + 8*rdx + 24], rax 27562 add rdx, 4 27563 cmp r10, rdx 27564 jne .LBB4_1390 27565 jmp .LBB4_1655 27566 .LBB4_298: 27567 cmp esi, 7 27568 je .LBB4_491 27569 # %bb.299: 27570 cmp esi, 8 27571 jne .LBB4_1655 27572 # %bb.300: 27573 test r9d, r9d 27574 jle .LBB4_1655 27575 # %bb.301: 27576 mov r10d, r9d 27577 cmp r9d, 4 27578 jae .LBB4_713 27579 # %bb.302: 27580 xor edx, edx 27581 jmp .LBB4_1025 27582 .LBB4_303: 27583 test r9d, r9d 27584 jle .LBB4_1655 27585 # %bb.304: 27586 mov eax, r9d 27587 cmp r9d, 8 27588 jae .LBB4_716 27589 # %bb.305: 27590 xor edx, edx 27591 jmp .LBB4_1141 27592 .LBB4_306: 27593 test r9d, r9d 27594 jle .LBB4_1655 27595 # %bb.307: 27596 mov eax, r9d 27597 cmp r9d, 8 27598 jae .LBB4_719 27599 # %bb.308: 27600 xor edx, edx 27601 jmp .LBB4_1146 27602 .LBB4_309: 27603 test r9d, r9d 27604 jle .LBB4_1655 27605 # %bb.310: 27606 mov eax, r9d 27607 xor r10d, r10d 27608 cmp r9d, 4 27609 jae .LBB4_722 27610 # %bb.311: 27611 xor esi, esi 27612 jmp .LBB4_1151 27613 .LBB4_312: 27614 test r9d, r9d 27615 jle .LBB4_1655 27616 # %bb.313: 27617 mov eax, r9d 27618 xor r10d, r10d 27619 cmp r9d, 4 27620 jae .LBB4_725 27621 # %bb.314: 27622 xor esi, esi 27623 jmp .LBB4_1157 27624 .LBB4_315: 27625 test r9d, r9d 27626 jle .LBB4_1655 27627 # %bb.316: 27628 mov r10d, r9d 27629 cmp r9d, 16 27630 jb .LBB4_317 27631 # %bb.728: 27632 lea rdx, [rcx + r10] 27633 cmp rdx, r8 27634 jbe .LBB4_730 27635 # %bb.729: 27636 lea rdx, [r8 + 2*r10] 27637 cmp rdx, rcx 27638 jbe .LBB4_730 27639 
.LBB4_317: 27640 xor edx, edx 27641 .LBB4_1395: 27642 mov rsi, rdx 27643 not rsi 27644 test r10b, 1 27645 je .LBB4_1397 27646 # %bb.1396: 27647 mov r9b, byte ptr [rcx + rdx] 27648 xor edi, edi 27649 test r9b, r9b 27650 setne dil 27651 neg edi 27652 test r9b, r9b 27653 mov eax, 1 27654 cmovle eax, edi 27655 mov word ptr [r8 + 2*rdx], ax 27656 or rdx, 1 27657 .LBB4_1397: 27658 add rsi, r10 27659 je .LBB4_1655 27660 # %bb.1398: 27661 mov esi, 1 27662 .LBB4_1399: # =>This Inner Loop Header: Depth=1 27663 movzx eax, byte ptr [rcx + rdx] 27664 xor edi, edi 27665 test al, al 27666 setne dil 27667 neg edi 27668 test al, al 27669 cmovg edi, esi 27670 mov word ptr [r8 + 2*rdx], di 27671 movzx eax, byte ptr [rcx + rdx + 1] 27672 xor edi, edi 27673 test al, al 27674 setne dil 27675 neg edi 27676 test al, al 27677 cmovg edi, esi 27678 mov word ptr [r8 + 2*rdx + 2], di 27679 add rdx, 2 27680 cmp r10, rdx 27681 jne .LBB4_1399 27682 jmp .LBB4_1655 27683 .LBB4_318: 27684 test r9d, r9d 27685 jle .LBB4_1655 27686 # %bb.319: 27687 mov r10d, r9d 27688 cmp r9d, 16 27689 jb .LBB4_320 27690 # %bb.733: 27691 lea rdx, [rcx + r10] 27692 cmp rdx, r8 27693 jbe .LBB4_735 27694 # %bb.734: 27695 lea rdx, [r8 + 2*r10] 27696 cmp rdx, rcx 27697 jbe .LBB4_735 27698 .LBB4_320: 27699 xor edx, edx 27700 .LBB4_1404: 27701 mov rsi, rdx 27702 not rsi 27703 test r10b, 1 27704 je .LBB4_1406 27705 # %bb.1405: 27706 mov r9b, byte ptr [rcx + rdx] 27707 xor edi, edi 27708 test r9b, r9b 27709 setne dil 27710 neg edi 27711 test r9b, r9b 27712 mov eax, 1 27713 cmovle eax, edi 27714 mov word ptr [r8 + 2*rdx], ax 27715 or rdx, 1 27716 .LBB4_1406: 27717 add rsi, r10 27718 je .LBB4_1655 27719 # %bb.1407: 27720 mov esi, 1 27721 .LBB4_1408: # =>This Inner Loop Header: Depth=1 27722 movzx eax, byte ptr [rcx + rdx] 27723 xor edi, edi 27724 test al, al 27725 setne dil 27726 neg edi 27727 test al, al 27728 cmovg edi, esi 27729 mov word ptr [r8 + 2*rdx], di 27730 movzx eax, byte ptr [rcx + rdx + 1] 27731 xor edi, edi 27732 
test al, al 27733 setne dil 27734 neg edi 27735 test al, al 27736 cmovg edi, esi 27737 mov word ptr [r8 + 2*rdx + 2], di 27738 add rdx, 2 27739 cmp r10, rdx 27740 jne .LBB4_1408 27741 jmp .LBB4_1655 27742 .LBB4_321: 27743 test r9d, r9d 27744 jle .LBB4_1655 27745 # %bb.322: 27746 mov eax, r9d 27747 cmp r9d, 4 27748 jae .LBB4_738 27749 # %bb.323: 27750 xor edx, edx 27751 jmp .LBB4_1031 27752 .LBB4_324: 27753 test r9d, r9d 27754 jle .LBB4_1655 27755 # %bb.325: 27756 mov eax, r9d 27757 cmp r9d, 4 27758 jae .LBB4_741 27759 # %bb.326: 27760 xor edx, edx 27761 jmp .LBB4_1036 27762 .LBB4_327: 27763 test r9d, r9d 27764 jle .LBB4_1655 27765 # %bb.328: 27766 mov r10d, r9d 27767 cmp r9d, 16 27768 jb .LBB4_329 27769 # %bb.744: 27770 lea rdx, [rcx + 2*r10] 27771 cmp rdx, r8 27772 jbe .LBB4_746 27773 # %bb.745: 27774 lea rdx, [r8 + 2*r10] 27775 cmp rdx, rcx 27776 jbe .LBB4_746 27777 .LBB4_329: 27778 xor edx, edx 27779 .LBB4_1413: 27780 mov rsi, rdx 27781 not rsi 27782 add rsi, r10 27783 mov rdi, r10 27784 and rdi, 3 27785 je .LBB4_1415 27786 .LBB4_1414: # =>This Inner Loop Header: Depth=1 27787 xor eax, eax 27788 cmp word ptr [rcx + 2*rdx], 0 27789 setne al 27790 mov word ptr [r8 + 2*rdx], ax 27791 add rdx, 1 27792 add rdi, -1 27793 jne .LBB4_1414 27794 .LBB4_1415: 27795 cmp rsi, 3 27796 jb .LBB4_1655 27797 .LBB4_1416: # =>This Inner Loop Header: Depth=1 27798 xor eax, eax 27799 cmp word ptr [rcx + 2*rdx], 0 27800 setne al 27801 mov word ptr [r8 + 2*rdx], ax 27802 xor eax, eax 27803 cmp word ptr [rcx + 2*rdx + 2], 0 27804 setne al 27805 mov word ptr [r8 + 2*rdx + 2], ax 27806 xor eax, eax 27807 cmp word ptr [rcx + 2*rdx + 4], 0 27808 setne al 27809 mov word ptr [r8 + 2*rdx + 4], ax 27810 xor eax, eax 27811 cmp word ptr [rcx + 2*rdx + 6], 0 27812 setne al 27813 mov word ptr [r8 + 2*rdx + 6], ax 27814 add rdx, 4 27815 cmp r10, rdx 27816 jne .LBB4_1416 27817 jmp .LBB4_1655 27818 .LBB4_330: 27819 test r9d, r9d 27820 jle .LBB4_1655 27821 # %bb.331: 27822 mov r10d, r9d 27823 cmp r9d, 
16 27824 jb .LBB4_332 27825 # %bb.749: 27826 lea rdx, [rcx + 2*r10] 27827 cmp rdx, r8 27828 jbe .LBB4_751 27829 # %bb.750: 27830 lea rdx, [r8 + 2*r10] 27831 cmp rdx, rcx 27832 jbe .LBB4_751 27833 .LBB4_332: 27834 xor edx, edx 27835 .LBB4_1421: 27836 mov rsi, rdx 27837 not rsi 27838 add rsi, r10 27839 mov rdi, r10 27840 and rdi, 3 27841 je .LBB4_1423 27842 .LBB4_1422: # =>This Inner Loop Header: Depth=1 27843 xor eax, eax 27844 cmp word ptr [rcx + 2*rdx], 0 27845 setne al 27846 mov word ptr [r8 + 2*rdx], ax 27847 add rdx, 1 27848 add rdi, -1 27849 jne .LBB4_1422 27850 .LBB4_1423: 27851 cmp rsi, 3 27852 jb .LBB4_1655 27853 .LBB4_1424: # =>This Inner Loop Header: Depth=1 27854 xor eax, eax 27855 cmp word ptr [rcx + 2*rdx], 0 27856 setne al 27857 mov word ptr [r8 + 2*rdx], ax 27858 xor eax, eax 27859 cmp word ptr [rcx + 2*rdx + 2], 0 27860 setne al 27861 mov word ptr [r8 + 2*rdx + 2], ax 27862 xor eax, eax 27863 cmp word ptr [rcx + 2*rdx + 4], 0 27864 setne al 27865 mov word ptr [r8 + 2*rdx + 4], ax 27866 xor eax, eax 27867 cmp word ptr [rcx + 2*rdx + 6], 0 27868 setne al 27869 mov word ptr [r8 + 2*rdx + 6], ax 27870 add rdx, 4 27871 cmp r10, rdx 27872 jne .LBB4_1424 27873 jmp .LBB4_1655 27874 .LBB4_333: 27875 test r9d, r9d 27876 jle .LBB4_1655 27877 # %bb.334: 27878 mov r11d, r9d 27879 cmp r9d, 16 27880 jb .LBB4_335 27881 # %bb.754: 27882 lea rdx, [rcx + 2*r11] 27883 cmp rdx, r8 27884 jbe .LBB4_756 27885 # %bb.755: 27886 lea rdx, [r8 + 2*r11] 27887 cmp rdx, rcx 27888 jbe .LBB4_756 27889 .LBB4_335: 27890 xor edx, edx 27891 .LBB4_1429: 27892 mov rsi, rdx 27893 not rsi 27894 test r11b, 1 27895 je .LBB4_1431 27896 # %bb.1430: 27897 movzx r9d, word ptr [rcx + 2*rdx] 27898 xor r10d, r10d 27899 test r9w, r9w 27900 setne r10b 27901 neg r10d 27902 test r9w, r9w 27903 mov edi, 1 27904 cmovle edi, r10d 27905 mov word ptr [r8 + 2*rdx], di 27906 or rdx, 1 27907 .LBB4_1431: 27908 add rsi, r11 27909 je .LBB4_1655 27910 # %bb.1432: 27911 mov esi, 1 27912 .LBB4_1433: # =>This Inner 
Loop Header: Depth=1 27913 movzx edi, word ptr [rcx + 2*rdx] 27914 xor eax, eax 27915 test di, di 27916 setne al 27917 neg eax 27918 test di, di 27919 cmovg eax, esi 27920 mov word ptr [r8 + 2*rdx], ax 27921 movzx eax, word ptr [rcx + 2*rdx + 2] 27922 xor edi, edi 27923 test ax, ax 27924 setne dil 27925 neg edi 27926 test ax, ax 27927 cmovg edi, esi 27928 mov word ptr [r8 + 2*rdx + 2], di 27929 add rdx, 2 27930 cmp r11, rdx 27931 jne .LBB4_1433 27932 jmp .LBB4_1655 27933 .LBB4_336: 27934 test r9d, r9d 27935 jle .LBB4_1655 27936 # %bb.337: 27937 mov r11d, r9d 27938 cmp r9d, 16 27939 jb .LBB4_338 27940 # %bb.759: 27941 lea rdx, [rcx + 2*r11] 27942 cmp rdx, r8 27943 jbe .LBB4_761 27944 # %bb.760: 27945 lea rdx, [r8 + 2*r11] 27946 cmp rdx, rcx 27947 jbe .LBB4_761 27948 .LBB4_338: 27949 xor edx, edx 27950 .LBB4_1438: 27951 mov rsi, rdx 27952 not rsi 27953 test r11b, 1 27954 je .LBB4_1440 27955 # %bb.1439: 27956 movzx r9d, word ptr [rcx + 2*rdx] 27957 xor r10d, r10d 27958 test r9w, r9w 27959 setne r10b 27960 neg r10d 27961 test r9w, r9w 27962 mov edi, 1 27963 cmovle edi, r10d 27964 mov word ptr [r8 + 2*rdx], di 27965 or rdx, 1 27966 .LBB4_1440: 27967 add rsi, r11 27968 je .LBB4_1655 27969 # %bb.1441: 27970 mov esi, 1 27971 .LBB4_1442: # =>This Inner Loop Header: Depth=1 27972 movzx edi, word ptr [rcx + 2*rdx] 27973 xor eax, eax 27974 test di, di 27975 setne al 27976 neg eax 27977 test di, di 27978 cmovg eax, esi 27979 mov word ptr [r8 + 2*rdx], ax 27980 movzx eax, word ptr [rcx + 2*rdx + 2] 27981 xor edi, edi 27982 test ax, ax 27983 setne dil 27984 neg edi 27985 test ax, ax 27986 cmovg edi, esi 27987 mov word ptr [r8 + 2*rdx + 2], di 27988 add rdx, 2 27989 cmp r11, rdx 27990 jne .LBB4_1442 27991 jmp .LBB4_1655 27992 .LBB4_339: 27993 test r9d, r9d 27994 jle .LBB4_1655 27995 # %bb.340: 27996 mov r10d, r9d 27997 cmp r9d, 4 27998 jae .LBB4_764 27999 # %bb.341: 28000 xor edx, edx 28001 jmp .LBB4_1041 28002 .LBB4_342: 28003 test r9d, r9d 28004 jle .LBB4_1655 28005 # %bb.343: 
28006 mov r10d, r9d 28007 cmp r9d, 4 28008 jae .LBB4_767 28009 # %bb.344: 28010 xor edx, edx 28011 jmp .LBB4_1163 28012 .LBB4_345: 28013 test r9d, r9d 28014 jle .LBB4_1655 28015 # %bb.346: 28016 mov eax, r9d 28017 xor r10d, r10d 28018 cmp r9d, 8 28019 jae .LBB4_770 28020 # %bb.347: 28021 xor esi, esi 28022 jmp .LBB4_1169 28023 .LBB4_348: 28024 test r9d, r9d 28025 jle .LBB4_1655 28026 # %bb.349: 28027 mov eax, r9d 28028 xor r10d, r10d 28029 cmp r9d, 8 28030 jae .LBB4_773 28031 # %bb.350: 28032 xor esi, esi 28033 jmp .LBB4_1175 28034 .LBB4_351: 28035 test r9d, r9d 28036 jle .LBB4_1655 28037 # %bb.352: 28038 mov r10d, r9d 28039 cmp r9d, 16 28040 jb .LBB4_353 28041 # %bb.776: 28042 lea rdx, [rcx + r10] 28043 cmp rdx, r8 28044 jbe .LBB4_778 28045 # %bb.777: 28046 lea rdx, [r8 + 2*r10] 28047 cmp rdx, rcx 28048 jbe .LBB4_778 28049 .LBB4_353: 28050 xor edx, edx 28051 .LBB4_1447: 28052 mov rsi, rdx 28053 not rsi 28054 add rsi, r10 28055 mov rdi, r10 28056 and rdi, 3 28057 je .LBB4_1449 28058 .LBB4_1448: # =>This Inner Loop Header: Depth=1 28059 xor eax, eax 28060 cmp byte ptr [rcx + rdx], 0 28061 setne al 28062 mov word ptr [r8 + 2*rdx], ax 28063 add rdx, 1 28064 add rdi, -1 28065 jne .LBB4_1448 28066 .LBB4_1449: 28067 cmp rsi, 3 28068 jb .LBB4_1655 28069 .LBB4_1450: # =>This Inner Loop Header: Depth=1 28070 xor eax, eax 28071 cmp byte ptr [rcx + rdx], 0 28072 setne al 28073 mov word ptr [r8 + 2*rdx], ax 28074 xor eax, eax 28075 cmp byte ptr [rcx + rdx + 1], 0 28076 setne al 28077 mov word ptr [r8 + 2*rdx + 2], ax 28078 xor eax, eax 28079 cmp byte ptr [rcx + rdx + 2], 0 28080 setne al 28081 mov word ptr [r8 + 2*rdx + 4], ax 28082 xor eax, eax 28083 cmp byte ptr [rcx + rdx + 3], 0 28084 setne al 28085 mov word ptr [r8 + 2*rdx + 6], ax 28086 add rdx, 4 28087 cmp r10, rdx 28088 jne .LBB4_1450 28089 jmp .LBB4_1655 28090 .LBB4_354: 28091 test r9d, r9d 28092 jle .LBB4_1655 28093 # %bb.355: 28094 mov r10d, r9d 28095 cmp r9d, 16 28096 jb .LBB4_356 28097 # %bb.781: 28098 lea rdx, 
[rcx + r10] 28099 cmp rdx, r8 28100 jbe .LBB4_783 28101 # %bb.782: 28102 lea rdx, [r8 + 2*r10] 28103 cmp rdx, rcx 28104 jbe .LBB4_783 28105 .LBB4_356: 28106 xor edx, edx 28107 .LBB4_1455: 28108 mov rsi, rdx 28109 not rsi 28110 add rsi, r10 28111 mov rdi, r10 28112 and rdi, 3 28113 je .LBB4_1457 28114 .LBB4_1456: # =>This Inner Loop Header: Depth=1 28115 xor eax, eax 28116 cmp byte ptr [rcx + rdx], 0 28117 setne al 28118 mov word ptr [r8 + 2*rdx], ax 28119 add rdx, 1 28120 add rdi, -1 28121 jne .LBB4_1456 28122 .LBB4_1457: 28123 cmp rsi, 3 28124 jb .LBB4_1655 28125 .LBB4_1458: # =>This Inner Loop Header: Depth=1 28126 xor eax, eax 28127 cmp byte ptr [rcx + rdx], 0 28128 setne al 28129 mov word ptr [r8 + 2*rdx], ax 28130 xor eax, eax 28131 cmp byte ptr [rcx + rdx + 1], 0 28132 setne al 28133 mov word ptr [r8 + 2*rdx + 2], ax 28134 xor eax, eax 28135 cmp byte ptr [rcx + rdx + 2], 0 28136 setne al 28137 mov word ptr [r8 + 2*rdx + 4], ax 28138 xor eax, eax 28139 cmp byte ptr [rcx + rdx + 3], 0 28140 setne al 28141 mov word ptr [r8 + 2*rdx + 6], ax 28142 add rdx, 4 28143 cmp r10, rdx 28144 jne .LBB4_1458 28145 jmp .LBB4_1655 28146 .LBB4_357: 28147 test r9d, r9d 28148 jle .LBB4_1655 28149 # %bb.358: 28150 mov r10d, r9d 28151 cmp r9d, 8 28152 jae .LBB4_786 28153 # %bb.359: 28154 xor edx, edx 28155 jmp .LBB4_1047 28156 .LBB4_360: 28157 test r9d, r9d 28158 jle .LBB4_1655 28159 # %bb.361: 28160 mov r10d, r9d 28161 cmp r9d, 8 28162 jae .LBB4_789 28163 # %bb.362: 28164 xor edx, edx 28165 jmp .LBB4_1053 28166 .LBB4_363: 28167 test r9d, r9d 28168 jle .LBB4_1655 28169 # %bb.364: 28170 mov eax, r9d 28171 cmp r9d, 4 28172 jae .LBB4_792 28173 # %bb.365: 28174 xor edx, edx 28175 jmp .LBB4_1181 28176 .LBB4_366: 28177 test r9d, r9d 28178 jle .LBB4_1655 28179 # %bb.367: 28180 mov eax, r9d 28181 cmp r9d, 8 28182 jae .LBB4_795 28183 # %bb.368: 28184 xor edx, edx 28185 jmp .LBB4_1186 28186 .LBB4_369: 28187 test r9d, r9d 28188 jle .LBB4_1655 28189 # %bb.370: 28190 mov eax, r9d 28191 cmp r9d, 
4 28192 jae .LBB4_798 28193 # %bb.371: 28194 xor edx, edx 28195 jmp .LBB4_1194 28196 .LBB4_372: 28197 test r9d, r9d 28198 jle .LBB4_1655 28199 # %bb.373: 28200 mov eax, r9d 28201 cmp r9d, 4 28202 jae .LBB4_801 28203 # %bb.374: 28204 xor edx, edx 28205 jmp .LBB4_1200 28206 .LBB4_375: 28207 test r9d, r9d 28208 jle .LBB4_1655 28209 # %bb.376: 28210 mov r10d, r9d 28211 cmp r9d, 4 28212 jb .LBB4_377 28213 # %bb.804: 28214 lea rdx, [rcx + r10] 28215 cmp rdx, r8 28216 jbe .LBB4_806 28217 # %bb.805: 28218 lea rdx, [r8 + 8*r10] 28219 cmp rdx, rcx 28220 jbe .LBB4_806 28221 .LBB4_377: 28222 xor edx, edx 28223 .LBB4_1463: 28224 mov rsi, rdx 28225 not rsi 28226 test r10b, 1 28227 je .LBB4_1465 28228 # %bb.1464: 28229 mov al, byte ptr [rcx + rdx] 28230 xor edi, edi 28231 test al, al 28232 setne dil 28233 neg rdi 28234 test al, al 28235 mov eax, 1 28236 cmovle rax, rdi 28237 mov qword ptr [r8 + 8*rdx], rax 28238 or rdx, 1 28239 .LBB4_1465: 28240 add rsi, r10 28241 je .LBB4_1655 28242 # %bb.1466: 28243 mov esi, 1 28244 .LBB4_1467: # =>This Inner Loop Header: Depth=1 28245 movzx eax, byte ptr [rcx + rdx] 28246 xor edi, edi 28247 test al, al 28248 setne dil 28249 neg rdi 28250 test al, al 28251 cmovg rdi, rsi 28252 mov qword ptr [r8 + 8*rdx], rdi 28253 movzx eax, byte ptr [rcx + rdx + 1] 28254 xor edi, edi 28255 test al, al 28256 setne dil 28257 neg rdi 28258 test al, al 28259 cmovg rdi, rsi 28260 mov qword ptr [r8 + 8*rdx + 8], rdi 28261 add rdx, 2 28262 cmp r10, rdx 28263 jne .LBB4_1467 28264 jmp .LBB4_1655 28265 .LBB4_378: 28266 test r9d, r9d 28267 jle .LBB4_1655 28268 # %bb.379: 28269 mov eax, r9d 28270 cmp r9d, 8 28271 jb .LBB4_380 28272 # %bb.809: 28273 lea rdx, [rcx + rax] 28274 cmp rdx, r8 28275 jbe .LBB4_811 28276 # %bb.810: 28277 lea rdx, [r8 + 4*rax] 28278 cmp rdx, rcx 28279 jbe .LBB4_811 28280 .LBB4_380: 28281 xor edx, edx 28282 .LBB4_1472: 28283 mov rsi, rdx 28284 not rsi 28285 test al, 1 28286 je .LBB4_1479 28287 # %bb.1473: 28288 cmp byte ptr [rcx + rdx], 0 28289 jne 
.LBB4_1475 28290 # %bb.1474: 28291 pxor xmm0, xmm0 28292 jmp .LBB4_1476 28293 .LBB4_381: 28294 test r9d, r9d 28295 jle .LBB4_1655 28296 # %bb.382: 28297 mov r10d, r9d 28298 cmp r9d, 4 28299 jb .LBB4_383 28300 # %bb.814: 28301 lea rdx, [rcx + 8*r10] 28302 cmp rdx, r8 28303 jbe .LBB4_816 28304 # %bb.815: 28305 lea rdx, [r8 + 8*r10] 28306 cmp rdx, rcx 28307 jbe .LBB4_816 28308 .LBB4_383: 28309 xor edx, edx 28310 .LBB4_1494: 28311 mov rsi, rdx 28312 not rsi 28313 add rsi, r10 28314 mov rdi, r10 28315 and rdi, 3 28316 je .LBB4_1496 28317 .LBB4_1495: # =>This Inner Loop Header: Depth=1 28318 xor eax, eax 28319 cmp qword ptr [rcx + 8*rdx], 0 28320 setne al 28321 mov qword ptr [r8 + 8*rdx], rax 28322 add rdx, 1 28323 add rdi, -1 28324 jne .LBB4_1495 28325 .LBB4_1496: 28326 cmp rsi, 3 28327 jb .LBB4_1655 28328 .LBB4_1497: # =>This Inner Loop Header: Depth=1 28329 xor eax, eax 28330 cmp qword ptr [rcx + 8*rdx], 0 28331 setne al 28332 mov qword ptr [r8 + 8*rdx], rax 28333 xor eax, eax 28334 cmp qword ptr [rcx + 8*rdx + 8], 0 28335 setne al 28336 mov qword ptr [r8 + 8*rdx + 8], rax 28337 xor eax, eax 28338 cmp qword ptr [rcx + 8*rdx + 16], 0 28339 setne al 28340 mov qword ptr [r8 + 8*rdx + 16], rax 28341 xor eax, eax 28342 cmp qword ptr [rcx + 8*rdx + 24], 0 28343 setne al 28344 mov qword ptr [r8 + 8*rdx + 24], rax 28345 add rdx, 4 28346 cmp r10, rdx 28347 jne .LBB4_1497 28348 jmp .LBB4_1655 28349 .LBB4_384: 28350 test r9d, r9d 28351 jle .LBB4_1655 28352 # %bb.385: 28353 mov edx, r9d 28354 lea rsi, [rdx - 1] 28355 mov eax, edx 28356 and eax, 3 28357 cmp rsi, 3 28358 jae .LBB4_819 28359 # %bb.386: 28360 xor esi, esi 28361 .LBB4_387: 28362 test rax, rax 28363 je .LBB4_1655 28364 # %bb.388: 28365 lea rdx, [r8 + 4*rsi] 28366 lea rcx, [rcx + 8*rsi] 28367 xor esi, esi 28368 movss xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero 28369 jmp .LBB4_390 28370 .LBB4_389: # in Loop: Header=BB4_390 Depth=1 28371 movss dword ptr [rdx + 4*rsi], xmm1 28372 add rsi, 1 28373 cmp 
rax, rsi 28374 je .LBB4_1655 28375 .LBB4_390: # =>This Inner Loop Header: Depth=1 28376 cmp qword ptr [rcx + 8*rsi], 0 28377 movapd xmm1, xmm0 28378 jne .LBB4_389 28379 # %bb.391: # in Loop: Header=BB4_390 Depth=1 28380 xorpd xmm1, xmm1 28381 jmp .LBB4_389 28382 .LBB4_392: 28383 test r9d, r9d 28384 jle .LBB4_1655 28385 # %bb.393: 28386 mov eax, r9d 28387 cmp r9d, 4 28388 jae .LBB4_829 28389 # %bb.394: 28390 xor edx, edx 28391 jmp .LBB4_1059 28392 .LBB4_395: 28393 test r9d, r9d 28394 jle .LBB4_1655 28395 # %bb.396: 28396 mov eax, r9d 28397 cmp r9d, 8 28398 jae .LBB4_832 28399 # %bb.397: 28400 xor edx, edx 28401 jmp .LBB4_1208 28402 .LBB4_398: 28403 test r9d, r9d 28404 jle .LBB4_1655 28405 # %bb.399: 28406 mov r10d, r9d 28407 cmp r9d, 4 28408 jae .LBB4_835 28409 # %bb.400: 28410 xor edx, edx 28411 jmp .LBB4_1216 28412 .LBB4_401: 28413 test r9d, r9d 28414 jle .LBB4_1655 28415 # %bb.402: 28416 mov eax, r9d 28417 cmp r9d, 8 28418 jae .LBB4_838 28419 # %bb.403: 28420 xor edx, edx 28421 jmp .LBB4_1222 28422 .LBB4_404: 28423 test r9d, r9d 28424 jle .LBB4_1655 28425 # %bb.405: 28426 mov r11d, r9d 28427 cmp r9d, 4 28428 jb .LBB4_406 28429 # %bb.841: 28430 lea rdx, [rcx + 8*r11] 28431 cmp rdx, r8 28432 jbe .LBB4_843 28433 # %bb.842: 28434 lea rdx, [r8 + 8*r11] 28435 cmp rdx, rcx 28436 jbe .LBB4_843 28437 .LBB4_406: 28438 xor edx, edx 28439 .LBB4_1502: 28440 mov rsi, rdx 28441 not rsi 28442 test r11b, 1 28443 je .LBB4_1504 28444 # %bb.1503: 28445 mov r9, qword ptr [rcx + 8*rdx] 28446 xor r10d, r10d 28447 test r9, r9 28448 setne r10b 28449 neg r10 28450 test r9, r9 28451 mov edi, 1 28452 cmovle rdi, r10 28453 mov qword ptr [r8 + 8*rdx], rdi 28454 or rdx, 1 28455 .LBB4_1504: 28456 add rsi, r11 28457 je .LBB4_1655 28458 # %bb.1505: 28459 mov esi, 1 28460 .LBB4_1506: # =>This Inner Loop Header: Depth=1 28461 mov rdi, qword ptr [rcx + 8*rdx] 28462 xor eax, eax 28463 test rdi, rdi 28464 setne al 28465 neg rax 28466 test rdi, rdi 28467 cmovg rax, rsi 28468 mov qword ptr [r8 + 8*rdx], 
rax 28469 mov rax, qword ptr [rcx + 8*rdx + 8] 28470 xor edi, edi 28471 test rax, rax 28472 setne dil 28473 neg rdi 28474 test rax, rax 28475 cmovg rdi, rsi 28476 mov qword ptr [r8 + 8*rdx + 8], rdi 28477 add rdx, 2 28478 cmp r11, rdx 28479 jne .LBB4_1506 28480 jmp .LBB4_1655 28481 .LBB4_407: 28482 test r9d, r9d 28483 jle .LBB4_1655 28484 # %bb.408: 28485 mov edx, r9d 28486 cmp r9d, 1 28487 jne .LBB4_846 28488 # %bb.409: 28489 xor eax, eax 28490 .LBB4_410: 28491 test dl, 1 28492 je .LBB4_1655 28493 # %bb.411: 28494 cmp qword ptr [rcx + 8*rax], 0 28495 jne .LBB4_989 28496 # %bb.412: 28497 xorpd xmm0, xmm0 28498 jmp .LBB4_990 28499 .LBB4_413: 28500 test r9d, r9d 28501 jle .LBB4_1655 28502 # %bb.414: 28503 mov edx, r9d 28504 cmp r9d, 1 28505 jne .LBB4_856 28506 # %bb.415: 28507 xor eax, eax 28508 jmp .LBB4_416 28509 .LBB4_419: 28510 test r9d, r9d 28511 jle .LBB4_1655 28512 # %bb.420: 28513 mov eax, r9d 28514 cmp r9d, 8 28515 jb .LBB4_421 28516 # %bb.864: 28517 lea rdx, [rcx + 4*rax] 28518 cmp rdx, r8 28519 jbe .LBB4_866 28520 # %bb.865: 28521 lea rdx, [r8 + 4*rax] 28522 cmp rdx, rcx 28523 jbe .LBB4_866 28524 .LBB4_421: 28525 xor edx, edx 28526 .LBB4_869: 28527 mov rsi, rdx 28528 not rsi 28529 test al, 1 28530 je .LBB4_871 28531 # %bb.870: 28532 movss xmm0, dword ptr [rcx + 4*rdx] # xmm0 = mem[0],zero,zero,zero 28533 movmskps edi, xmm0 28534 and edi, 1 28535 neg edi 28536 or edi, 1 28537 xorps xmm1, xmm1 28538 cvtsi2ss xmm1, edi 28539 xorps xmm2, xmm2 28540 cmpeqss xmm2, xmm0 28541 andnps xmm2, xmm1 28542 movss dword ptr [r8 + 4*rdx], xmm2 28543 or rdx, 1 28544 .LBB4_871: 28545 add rsi, rax 28546 je .LBB4_1655 28547 # %bb.872: 28548 xorps xmm0, xmm0 28549 .LBB4_873: # =>This Inner Loop Header: Depth=1 28550 movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero 28551 movmskps esi, xmm1 28552 and esi, 1 28553 neg esi 28554 or esi, 1 28555 xorps xmm2, xmm2 28556 cvtsi2ss xmm2, esi 28557 cmpeqss xmm1, xmm0 28558 andnps xmm1, xmm2 28559 movss dword ptr [r8 + 
4*rdx], xmm1 28560 movss xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero 28561 movmskps esi, xmm1 28562 and esi, 1 28563 neg esi 28564 or esi, 1 28565 xorps xmm2, xmm2 28566 cvtsi2ss xmm2, esi 28567 cmpeqss xmm1, xmm0 28568 andnps xmm1, xmm2 28569 movss dword ptr [r8 + 4*rdx + 4], xmm1 28570 add rdx, 2 28571 cmp rax, rdx 28572 jne .LBB4_873 28573 jmp .LBB4_1655 28574 .LBB4_422: 28575 test r9d, r9d 28576 jle .LBB4_1655 28577 # %bb.423: 28578 mov r10d, r9d 28579 cmp r9d, 4 28580 jb .LBB4_424 28581 # %bb.874: 28582 lea rdx, [rcx + r10] 28583 cmp rdx, r8 28584 jbe .LBB4_876 28585 # %bb.875: 28586 lea rdx, [r8 + 8*r10] 28587 cmp rdx, rcx 28588 jbe .LBB4_876 28589 .LBB4_424: 28590 xor edx, edx 28591 .LBB4_1511: 28592 mov rsi, rdx 28593 not rsi 28594 add rsi, r10 28595 mov rdi, r10 28596 and rdi, 3 28597 je .LBB4_1513 28598 .LBB4_1512: # =>This Inner Loop Header: Depth=1 28599 xor eax, eax 28600 cmp byte ptr [rcx + rdx], 0 28601 setne al 28602 mov qword ptr [r8 + 8*rdx], rax 28603 add rdx, 1 28604 add rdi, -1 28605 jne .LBB4_1512 28606 .LBB4_1513: 28607 cmp rsi, 3 28608 jb .LBB4_1655 28609 .LBB4_1514: # =>This Inner Loop Header: Depth=1 28610 xor eax, eax 28611 cmp byte ptr [rcx + rdx], 0 28612 setne al 28613 mov qword ptr [r8 + 8*rdx], rax 28614 xor eax, eax 28615 cmp byte ptr [rcx + rdx + 1], 0 28616 setne al 28617 mov qword ptr [r8 + 8*rdx + 8], rax 28618 xor eax, eax 28619 cmp byte ptr [rcx + rdx + 2], 0 28620 setne al 28621 mov qword ptr [r8 + 8*rdx + 16], rax 28622 xor eax, eax 28623 cmp byte ptr [rcx + rdx + 3], 0 28624 setne al 28625 mov qword ptr [r8 + 8*rdx + 24], rax 28626 add rdx, 4 28627 cmp r10, rdx 28628 jne .LBB4_1514 28629 jmp .LBB4_1655 28630 .LBB4_425: 28631 test r9d, r9d 28632 jle .LBB4_1655 28633 # %bb.426: 28634 mov eax, r9d 28635 cmp r9d, 8 28636 jb .LBB4_427 28637 # %bb.879: 28638 lea rdx, [rcx + rax] 28639 cmp rdx, r8 28640 jbe .LBB4_881 28641 # %bb.880: 28642 lea rdx, [r8 + 4*rax] 28643 cmp rdx, rcx 28644 jbe .LBB4_881 28645 
.LBB4_427: 28646 xor edx, edx 28647 .LBB4_1519: 28648 mov rsi, rdx 28649 not rsi 28650 add rsi, rax 28651 mov rdi, rax 28652 and rdi, 3 28653 je .LBB4_1524 28654 # %bb.1520: 28655 movd xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero 28656 jmp .LBB4_1522 28657 .LBB4_1521: # in Loop: Header=BB4_1522 Depth=1 28658 movd dword ptr [r8 + 4*rdx], xmm1 28659 add rdx, 1 28660 add rdi, -1 28661 je .LBB4_1524 28662 .LBB4_1522: # =>This Inner Loop Header: Depth=1 28663 cmp byte ptr [rcx + rdx], 0 28664 movdqa xmm1, xmm0 28665 jne .LBB4_1521 28666 # %bb.1523: # in Loop: Header=BB4_1522 Depth=1 28667 pxor xmm1, xmm1 28668 jmp .LBB4_1521 28669 .LBB4_428: 28670 test r9d, r9d 28671 jle .LBB4_1655 28672 # %bb.429: 28673 mov r10d, r9d 28674 cmp r9d, 4 28675 jae .LBB4_884 28676 # %bb.430: 28677 xor edx, edx 28678 jmp .LBB4_1064 28679 .LBB4_431: 28680 test r9d, r9d 28681 jle .LBB4_1655 28682 # %bb.432: 28683 mov eax, r9d 28684 cmp r9d, 8 28685 jae .LBB4_887 28686 # %bb.433: 28687 xor edx, edx 28688 jmp .LBB4_1070 28689 .LBB4_434: 28690 test r9d, r9d 28691 jle .LBB4_1655 28692 # %bb.435: 28693 mov eax, r9d 28694 cmp r9d, 8 28695 jb .LBB4_436 28696 # %bb.890: 28697 lea rdx, [rcx + 4*rax] 28698 cmp rdx, r8 28699 jbe .LBB4_892 28700 # %bb.891: 28701 lea rdx, [r8 + rax] 28702 cmp rdx, rcx 28703 jbe .LBB4_892 28704 .LBB4_436: 28705 xor edx, edx 28706 .LBB4_1539: 28707 mov rsi, rdx 28708 not rsi 28709 add rsi, rax 28710 mov rdi, rax 28711 and rdi, 3 28712 je .LBB4_1541 28713 .LBB4_1540: # =>This Inner Loop Header: Depth=1 28714 cmp dword ptr [rcx + 4*rdx], 0 28715 setne byte ptr [r8 + rdx] 28716 add rdx, 1 28717 add rdi, -1 28718 jne .LBB4_1540 28719 .LBB4_1541: 28720 cmp rsi, 3 28721 jb .LBB4_1655 28722 .LBB4_1542: # =>This Inner Loop Header: Depth=1 28723 cmp dword ptr [rcx + 4*rdx], 0 28724 setne byte ptr [r8 + rdx] 28725 cmp dword ptr [rcx + 4*rdx + 4], 0 28726 setne byte ptr [r8 + rdx + 1] 28727 cmp dword ptr [rcx + 4*rdx + 8], 0 28728 setne byte ptr [r8 + rdx + 2] 28729 
cmp dword ptr [rcx + 4*rdx + 12], 0 28730 setne byte ptr [r8 + rdx + 3] 28731 add rdx, 4 28732 cmp rax, rdx 28733 jne .LBB4_1542 28734 jmp .LBB4_1655 28735 .LBB4_437: 28736 test r9d, r9d 28737 jle .LBB4_1655 28738 # %bb.438: 28739 mov eax, r9d 28740 cmp r9d, 4 28741 jb .LBB4_439 28742 # %bb.895: 28743 lea rdx, [rcx + 8*rax] 28744 cmp rdx, r8 28745 jbe .LBB4_897 28746 # %bb.896: 28747 lea rdx, [r8 + rax] 28748 cmp rdx, rcx 28749 jbe .LBB4_897 28750 .LBB4_439: 28751 xor edx, edx 28752 .LBB4_1547: 28753 mov rsi, rdx 28754 not rsi 28755 test al, 1 28756 je .LBB4_1549 28757 # %bb.1548: 28758 movsd xmm0, qword ptr [rcx + 8*rdx] # xmm0 = mem[0],zero 28759 xor r9d, r9d 28760 pxor xmm1, xmm1 28761 ucomisd xmm1, xmm0 28762 andpd xmm0, xmmword ptr [rip + .LCPI4_0] 28763 movsd xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero 28764 orpd xmm1, xmm0 28765 cvttsd2si edi, xmm1 28766 cmove edi, r9d 28767 mov byte ptr [r8 + rdx], dil 28768 or rdx, 1 28769 .LBB4_1549: 28770 add rsi, rax 28771 je .LBB4_1655 28772 # %bb.1550: 28773 xor esi, esi 28774 xorpd xmm0, xmm0 28775 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 28776 movsd xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero 28777 .LBB4_1551: # =>This Inner Loop Header: Depth=1 28778 movsd xmm3, qword ptr [rcx + 8*rdx] # xmm3 = mem[0],zero 28779 ucomisd xmm0, xmm3 28780 andpd xmm3, xmm1 28781 orpd xmm3, xmm2 28782 cvttsd2si edi, xmm3 28783 cmove edi, esi 28784 mov byte ptr [r8 + rdx], dil 28785 movsd xmm3, qword ptr [rcx + 8*rdx + 8] # xmm3 = mem[0],zero 28786 ucomisd xmm0, xmm3 28787 andpd xmm3, xmm1 28788 orpd xmm3, xmm2 28789 cvttsd2si edi, xmm3 28790 cmove edi, esi 28791 mov byte ptr [r8 + rdx + 1], dil 28792 add rdx, 2 28793 cmp rax, rdx 28794 jne .LBB4_1551 28795 jmp .LBB4_1655 28796 .LBB4_440: 28797 test r9d, r9d 28798 jle .LBB4_1655 28799 # %bb.441: 28800 mov r10d, r9d 28801 cmp r9d, 32 28802 jb .LBB4_442 28803 # %bb.900: 28804 lea rdx, [rcx + r10] 28805 cmp rdx, r8 28806 jbe .LBB4_902 28807 # 
%bb.901: 28808 lea rdx, [r8 + r10] 28809 cmp rdx, rcx 28810 jbe .LBB4_902 28811 .LBB4_442: 28812 xor esi, esi 28813 .LBB4_1556: 28814 mov rax, rsi 28815 not rax 28816 test r10b, 1 28817 je .LBB4_1558 28818 # %bb.1557: 28819 mov dil, byte ptr [rcx + rsi] 28820 test dil, dil 28821 setne r9b 28822 neg r9b 28823 test dil, dil 28824 movzx r9d, r9b 28825 mov edi, 1 28826 cmovle edi, r9d 28827 mov byte ptr [r8 + rsi], dil 28828 or rsi, 1 28829 .LBB4_1558: 28830 add rax, r10 28831 je .LBB4_1655 28832 # %bb.1559: 28833 mov edi, 1 28834 .LBB4_1560: # =>This Inner Loop Header: Depth=1 28835 movzx eax, byte ptr [rcx + rsi] 28836 test al, al 28837 setne dl 28838 neg dl 28839 test al, al 28840 movzx eax, dl 28841 cmovg eax, edi 28842 mov byte ptr [r8 + rsi], al 28843 movzx eax, byte ptr [rcx + rsi + 1] 28844 test al, al 28845 setne dl 28846 neg dl 28847 test al, al 28848 movzx eax, dl 28849 cmovg eax, edi 28850 mov byte ptr [r8 + rsi + 1], al 28851 add rsi, 2 28852 cmp r10, rsi 28853 jne .LBB4_1560 28854 jmp .LBB4_1655 28855 .LBB4_443: 28856 test r9d, r9d 28857 jle .LBB4_1655 28858 # %bb.444: 28859 mov eax, r9d 28860 cmp r9d, 4 28861 jb .LBB4_445 28862 # %bb.905: 28863 lea rdx, [rcx + 8*rax] 28864 cmp rdx, r8 28865 jbe .LBB4_907 28866 # %bb.906: 28867 lea rdx, [r8 + rax] 28868 cmp rdx, rcx 28869 jbe .LBB4_907 28870 .LBB4_445: 28871 xor edx, edx 28872 .LBB4_1565: 28873 mov rsi, rdx 28874 not rsi 28875 add rsi, rax 28876 mov rdi, rax 28877 and rdi, 3 28878 je .LBB4_1567 28879 .LBB4_1566: # =>This Inner Loop Header: Depth=1 28880 cmp qword ptr [rcx + 8*rdx], 0 28881 setne byte ptr [r8 + rdx] 28882 add rdx, 1 28883 add rdi, -1 28884 jne .LBB4_1566 28885 .LBB4_1567: 28886 cmp rsi, 3 28887 jb .LBB4_1655 28888 .LBB4_1568: # =>This Inner Loop Header: Depth=1 28889 cmp qword ptr [rcx + 8*rdx], 0 28890 setne byte ptr [r8 + rdx] 28891 cmp qword ptr [rcx + 8*rdx + 8], 0 28892 setne byte ptr [r8 + rdx + 1] 28893 cmp qword ptr [rcx + 8*rdx + 16], 0 28894 setne byte ptr [r8 + rdx + 2] 28895 
cmp qword ptr [rcx + 8*rdx + 24], 0 28896 setne byte ptr [r8 + rdx + 3] 28897 add rdx, 4 28898 cmp rax, rdx 28899 jne .LBB4_1568 28900 jmp .LBB4_1655 28901 .LBB4_446: 28902 test r9d, r9d 28903 jle .LBB4_1655 28904 # %bb.447: 28905 mov eax, r9d 28906 cmp r9d, 16 28907 jb .LBB4_448 28908 # %bb.910: 28909 lea rdx, [rcx + 2*rax] 28910 cmp rdx, r8 28911 jbe .LBB4_912 28912 # %bb.911: 28913 lea rdx, [r8 + rax] 28914 cmp rdx, rcx 28915 jbe .LBB4_912 28916 .LBB4_448: 28917 xor edx, edx 28918 .LBB4_1573: 28919 mov rsi, rdx 28920 not rsi 28921 add rsi, rax 28922 mov rdi, rax 28923 and rdi, 3 28924 je .LBB4_1575 28925 .LBB4_1574: # =>This Inner Loop Header: Depth=1 28926 cmp word ptr [rcx + 2*rdx], 0 28927 setne byte ptr [r8 + rdx] 28928 add rdx, 1 28929 add rdi, -1 28930 jne .LBB4_1574 28931 .LBB4_1575: 28932 cmp rsi, 3 28933 jb .LBB4_1655 28934 .LBB4_1576: # =>This Inner Loop Header: Depth=1 28935 cmp word ptr [rcx + 2*rdx], 0 28936 setne byte ptr [r8 + rdx] 28937 cmp word ptr [rcx + 2*rdx + 2], 0 28938 setne byte ptr [r8 + rdx + 1] 28939 cmp word ptr [rcx + 2*rdx + 4], 0 28940 setne byte ptr [r8 + rdx + 2] 28941 cmp word ptr [rcx + 2*rdx + 6], 0 28942 setne byte ptr [r8 + rdx + 3] 28943 add rdx, 4 28944 cmp rax, rdx 28945 jne .LBB4_1576 28946 jmp .LBB4_1655 28947 .LBB4_449: 28948 test r9d, r9d 28949 jle .LBB4_1655 28950 # %bb.450: 28951 mov r10d, r9d 28952 cmp r9d, 16 28953 jb .LBB4_451 28954 # %bb.915: 28955 lea rdx, [rcx + 2*r10] 28956 cmp rdx, r8 28957 jbe .LBB4_917 28958 # %bb.916: 28959 lea rdx, [r8 + r10] 28960 cmp rdx, rcx 28961 jbe .LBB4_917 28962 .LBB4_451: 28963 xor esi, esi 28964 .LBB4_1581: 28965 mov rax, rsi 28966 not rax 28967 test r10b, 1 28968 je .LBB4_1583 28969 # %bb.1582: 28970 movzx edi, word ptr [rcx + 2*rsi] 28971 test di, di 28972 setne r9b 28973 neg r9b 28974 test di, di 28975 movzx r9d, r9b 28976 mov edi, 1 28977 cmovle edi, r9d 28978 mov byte ptr [r8 + rsi], dil 28979 or rsi, 1 28980 .LBB4_1583: 28981 add rax, r10 28982 je .LBB4_1655 28983 # 
%bb.1584: 28984 mov r9d, 1 28985 .LBB4_1585: # =>This Inner Loop Header: Depth=1 28986 movzx edi, word ptr [rcx + 2*rsi] 28987 test di, di 28988 setne al 28989 neg al 28990 test di, di 28991 movzx eax, al 28992 cmovg eax, r9d 28993 mov byte ptr [r8 + rsi], al 28994 movzx eax, word ptr [rcx + 2*rsi + 2] 28995 test ax, ax 28996 setne dl 28997 neg dl 28998 test ax, ax 28999 movzx eax, dl 29000 cmovg eax, r9d 29001 mov byte ptr [r8 + rsi + 1], al 29002 add rsi, 2 29003 cmp r10, rsi 29004 jne .LBB4_1585 29005 jmp .LBB4_1655 29006 .LBB4_452: 29007 test r9d, r9d 29008 jle .LBB4_1655 29009 # %bb.453: 29010 mov r10d, r9d 29011 cmp r9d, 4 29012 jb .LBB4_454 29013 # %bb.920: 29014 lea rdx, [rcx + 8*r10] 29015 cmp rdx, r8 29016 jbe .LBB4_922 29017 # %bb.921: 29018 lea rdx, [r8 + r10] 29019 cmp rdx, rcx 29020 jbe .LBB4_922 29021 .LBB4_454: 29022 xor esi, esi 29023 .LBB4_1590: 29024 mov rdx, rsi 29025 not rdx 29026 test r10b, 1 29027 je .LBB4_1592 29028 # %bb.1591: 29029 mov rdi, qword ptr [rcx + 8*rsi] 29030 test rdi, rdi 29031 setne al 29032 neg al 29033 test rdi, rdi 29034 movzx eax, al 29035 mov edi, 1 29036 cmovle edi, eax 29037 mov byte ptr [r8 + rsi], dil 29038 or rsi, 1 29039 .LBB4_1592: 29040 add rdx, r10 29041 je .LBB4_1655 29042 # %bb.1593: 29043 mov edi, 1 29044 .LBB4_1594: # =>This Inner Loop Header: Depth=1 29045 mov rax, qword ptr [rcx + 8*rsi] 29046 test rax, rax 29047 setne dl 29048 neg dl 29049 test rax, rax 29050 movzx eax, dl 29051 cmovg eax, edi 29052 mov byte ptr [r8 + rsi], al 29053 mov rax, qword ptr [rcx + 8*rsi + 8] 29054 test rax, rax 29055 setne dl 29056 neg dl 29057 test rax, rax 29058 movzx eax, dl 29059 cmovg eax, edi 29060 mov byte ptr [r8 + rsi + 1], al 29061 add rsi, 2 29062 cmp r10, rsi 29063 jne .LBB4_1594 29064 jmp .LBB4_1655 29065 .LBB4_455: 29066 test r9d, r9d 29067 jle .LBB4_1655 29068 # %bb.456: 29069 mov r10d, r9d 29070 cmp r9d, 8 29071 jb .LBB4_457 29072 # %bb.925: 29073 lea rdx, [rcx + 4*r10] 29074 cmp rdx, r8 29075 jbe .LBB4_927 29076 
# %bb.926: 29077 lea rdx, [r8 + r10] 29078 cmp rdx, rcx 29079 jbe .LBB4_927 29080 .LBB4_457: 29081 xor edx, edx 29082 .LBB4_1599: 29083 mov rsi, rdx 29084 not rsi 29085 test r10b, 1 29086 je .LBB4_1601 29087 # %bb.1600: 29088 movd xmm0, dword ptr [rcx + 4*rdx] # xmm0 = mem[0],zero,zero,zero 29089 movd edi, xmm0 29090 test edi, edi 29091 setns al 29092 add al, al 29093 add al, -1 29094 xor edi, edi 29095 pxor xmm1, xmm1 29096 ucomiss xmm1, xmm0 29097 movzx eax, al 29098 cmove eax, edi 29099 mov byte ptr [r8 + rdx], al 29100 or rdx, 1 29101 .LBB4_1601: 29102 add rsi, r10 29103 je .LBB4_1655 29104 # %bb.1602: 29105 xor esi, esi 29106 xorps xmm0, xmm0 29107 .LBB4_1603: # =>This Inner Loop Header: Depth=1 29108 movd xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero 29109 movd eax, xmm1 29110 test eax, eax 29111 setns al 29112 add al, al 29113 add al, -1 29114 ucomiss xmm0, xmm1 29115 movzx eax, al 29116 cmove eax, esi 29117 mov byte ptr [r8 + rdx], al 29118 movd xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero 29119 movd eax, xmm1 29120 test eax, eax 29121 setns al 29122 add al, al 29123 add al, -1 29124 ucomiss xmm0, xmm1 29125 movzx eax, al 29126 cmove eax, esi 29127 mov byte ptr [r8 + rdx + 1], al 29128 add rdx, 2 29129 cmp r10, rdx 29130 jne .LBB4_1603 29131 jmp .LBB4_1655 29132 .LBB4_458: 29133 test r9d, r9d 29134 jle .LBB4_1655 29135 # %bb.459: 29136 mov eax, r9d 29137 cmp r9d, 32 29138 jb .LBB4_460 29139 # %bb.930: 29140 lea rdx, [rcx + rax] 29141 cmp rdx, r8 29142 jbe .LBB4_932 29143 # %bb.931: 29144 lea rdx, [r8 + rax] 29145 cmp rdx, rcx 29146 jbe .LBB4_932 29147 .LBB4_460: 29148 xor edx, edx 29149 .LBB4_1608: 29150 mov rsi, rdx 29151 not rsi 29152 add rsi, rax 29153 mov rdi, rax 29154 and rdi, 3 29155 je .LBB4_1610 29156 .LBB4_1609: # =>This Inner Loop Header: Depth=1 29157 cmp byte ptr [rcx + rdx], 0 29158 setne byte ptr [r8 + rdx] 29159 add rdx, 1 29160 add rdi, -1 29161 jne .LBB4_1609 29162 .LBB4_1610: 29163 cmp rsi, 3 29164 jb 
.LBB4_1655 29165 .LBB4_1611: # =>This Inner Loop Header: Depth=1 29166 cmp byte ptr [rcx + rdx], 0 29167 setne byte ptr [r8 + rdx] 29168 cmp byte ptr [rcx + rdx + 1], 0 29169 setne byte ptr [r8 + rdx + 1] 29170 cmp byte ptr [rcx + rdx + 2], 0 29171 setne byte ptr [r8 + rdx + 2] 29172 cmp byte ptr [rcx + rdx + 3], 0 29173 setne byte ptr [r8 + rdx + 3] 29174 add rdx, 4 29175 cmp rax, rdx 29176 jne .LBB4_1611 29177 jmp .LBB4_1655 29178 .LBB4_461: 29179 test r9d, r9d 29180 jle .LBB4_1655 29181 # %bb.462: 29182 mov r10d, r9d 29183 cmp r9d, 8 29184 jb .LBB4_463 29185 # %bb.935: 29186 lea rdx, [rcx + 4*r10] 29187 cmp rdx, r8 29188 jbe .LBB4_937 29189 # %bb.936: 29190 lea rdx, [r8 + r10] 29191 cmp rdx, rcx 29192 jbe .LBB4_937 29193 .LBB4_463: 29194 xor esi, esi 29195 .LBB4_1616: 29196 mov rax, rsi 29197 not rax 29198 test r10b, 1 29199 je .LBB4_1618 29200 # %bb.1617: 29201 mov edi, dword ptr [rcx + 4*rsi] 29202 test edi, edi 29203 setne r9b 29204 neg r9b 29205 test edi, edi 29206 movzx r9d, r9b 29207 mov edi, 1 29208 cmovle edi, r9d 29209 mov byte ptr [r8 + rsi], dil 29210 or rsi, 1 29211 .LBB4_1618: 29212 add rax, r10 29213 je .LBB4_1655 29214 # %bb.1619: 29215 mov r9d, 1 29216 .LBB4_1620: # =>This Inner Loop Header: Depth=1 29217 mov edi, dword ptr [rcx + 4*rsi] 29218 test edi, edi 29219 setne al 29220 neg al 29221 test edi, edi 29222 movzx eax, al 29223 cmovg eax, r9d 29224 mov byte ptr [r8 + rsi], al 29225 mov eax, dword ptr [rcx + 4*rsi + 4] 29226 test eax, eax 29227 setne dl 29228 neg dl 29229 test eax, eax 29230 movzx eax, dl 29231 cmovg eax, r9d 29232 mov byte ptr [r8 + rsi + 1], al 29233 add rsi, 2 29234 cmp r10, rsi 29235 jne .LBB4_1620 29236 jmp .LBB4_1655 29237 .LBB4_464: 29238 test r9d, r9d 29239 jle .LBB4_1655 29240 # %bb.465: 29241 mov r10d, r9d 29242 cmp r9d, 8 29243 jb .LBB4_466 29244 # %bb.940: 29245 lea rdx, [rcx + 4*r10] 29246 cmp rdx, r8 29247 jbe .LBB4_942 29248 # %bb.941: 29249 lea rdx, [r8 + 4*r10] 29250 cmp rdx, rcx 29251 jbe .LBB4_942 29252 
.LBB4_466: 29253 xor edx, edx 29254 .LBB4_1625: 29255 mov rsi, rdx 29256 not rsi 29257 add rsi, r10 29258 mov rdi, r10 29259 and rdi, 3 29260 je .LBB4_1627 29261 .LBB4_1626: # =>This Inner Loop Header: Depth=1 29262 xor eax, eax 29263 cmp dword ptr [rcx + 4*rdx], 0 29264 setne al 29265 mov dword ptr [r8 + 4*rdx], eax 29266 add rdx, 1 29267 add rdi, -1 29268 jne .LBB4_1626 29269 .LBB4_1627: 29270 cmp rsi, 3 29271 jb .LBB4_1655 29272 .LBB4_1628: # =>This Inner Loop Header: Depth=1 29273 xor eax, eax 29274 cmp dword ptr [rcx + 4*rdx], 0 29275 setne al 29276 mov dword ptr [r8 + 4*rdx], eax 29277 xor eax, eax 29278 cmp dword ptr [rcx + 4*rdx + 4], 0 29279 setne al 29280 mov dword ptr [r8 + 4*rdx + 4], eax 29281 xor eax, eax 29282 cmp dword ptr [rcx + 4*rdx + 8], 0 29283 setne al 29284 mov dword ptr [r8 + 4*rdx + 8], eax 29285 xor eax, eax 29286 cmp dword ptr [rcx + 4*rdx + 12], 0 29287 setne al 29288 mov dword ptr [r8 + 4*rdx + 12], eax 29289 add rdx, 4 29290 cmp r10, rdx 29291 jne .LBB4_1628 29292 jmp .LBB4_1655 29293 .LBB4_467: 29294 test r9d, r9d 29295 jle .LBB4_1655 29296 # %bb.468: 29297 mov eax, r9d 29298 xor r10d, r10d 29299 cmp r9d, 4 29300 jae .LBB4_945 29301 # %bb.469: 29302 xor esi, esi 29303 jmp .LBB4_1080 29304 .LBB4_470: 29305 test r9d, r9d 29306 jle .LBB4_1655 29307 # %bb.471: 29308 mov r10d, r9d 29309 cmp r9d, 8 29310 jb .LBB4_472 29311 # %bb.948: 29312 lea rdx, [rcx + r10] 29313 cmp rdx, r8 29314 jbe .LBB4_950 29315 # %bb.949: 29316 lea rdx, [r8 + 4*r10] 29317 cmp rdx, rcx 29318 jbe .LBB4_950 29319 .LBB4_472: 29320 xor edx, edx 29321 .LBB4_1633: 29322 mov rsi, rdx 29323 not rsi 29324 test r10b, 1 29325 je .LBB4_1635 29326 # %bb.1634: 29327 mov r9b, byte ptr [rcx + rdx] 29328 xor edi, edi 29329 test r9b, r9b 29330 setne dil 29331 neg edi 29332 test r9b, r9b 29333 mov eax, 1 29334 cmovle eax, edi 29335 mov dword ptr [r8 + 4*rdx], eax 29336 or rdx, 1 29337 .LBB4_1635: 29338 add rsi, r10 29339 je .LBB4_1655 29340 # %bb.1636: 29341 mov esi, 1 29342 
.LBB4_1637: # =>This Inner Loop Header: Depth=1 29343 movzx eax, byte ptr [rcx + rdx] 29344 xor edi, edi 29345 test al, al 29346 setne dil 29347 neg edi 29348 test al, al 29349 cmovg edi, esi 29350 mov dword ptr [r8 + 4*rdx], edi 29351 movzx eax, byte ptr [rcx + rdx + 1] 29352 xor edi, edi 29353 test al, al 29354 setne dil 29355 neg edi 29356 test al, al 29357 cmovg edi, esi 29358 mov dword ptr [r8 + 4*rdx + 4], edi 29359 add rdx, 2 29360 cmp r10, rdx 29361 jne .LBB4_1637 29362 jmp .LBB4_1655 29363 .LBB4_473: 29364 test r9d, r9d 29365 jle .LBB4_1655 29366 # %bb.474: 29367 mov eax, r9d 29368 cmp r9d, 4 29369 jae .LBB4_953 29370 # %bb.475: 29371 xor edx, edx 29372 jmp .LBB4_1086 29373 .LBB4_476: 29374 test r9d, r9d 29375 jle .LBB4_1655 29376 # %bb.477: 29377 mov eax, r9d 29378 cmp r9d, 8 29379 jae .LBB4_956 29380 # %bb.478: 29381 xor edx, edx 29382 jmp .LBB4_1091 29383 .LBB4_479: 29384 test r9d, r9d 29385 jle .LBB4_1655 29386 # %bb.480: 29387 mov r10d, r9d 29388 cmp r9d, 8 29389 jae .LBB4_959 29390 # %bb.481: 29391 xor edx, edx 29392 jmp .LBB4_1096 29393 .LBB4_482: 29394 test r9d, r9d 29395 jle .LBB4_1655 29396 # %bb.483: 29397 mov r10d, r9d 29398 cmp r9d, 4 29399 jae .LBB4_962 29400 # %bb.484: 29401 xor edx, edx 29402 jmp .LBB4_1102 29403 .LBB4_485: 29404 test r9d, r9d 29405 jle .LBB4_1655 29406 # %bb.486: 29407 mov eax, r9d 29408 cmp r9d, 8 29409 jae .LBB4_965 29410 # %bb.487: 29411 xor edx, edx 29412 jmp .LBB4_968 29413 .LBB4_488: 29414 test r9d, r9d 29415 jle .LBB4_1655 29416 # %bb.489: 29417 mov r10d, r9d 29418 cmp r9d, 8 29419 jb .LBB4_490 29420 # %bb.972: 29421 lea rdx, [rcx + r10] 29422 cmp rdx, r8 29423 jbe .LBB4_974 29424 # %bb.973: 29425 lea rdx, [r8 + 4*r10] 29426 cmp rdx, rcx 29427 jbe .LBB4_974 29428 .LBB4_490: 29429 xor edx, edx 29430 .LBB4_1642: 29431 mov rsi, rdx 29432 not rsi 29433 add rsi, r10 29434 mov rdi, r10 29435 and rdi, 3 29436 je .LBB4_1644 29437 .LBB4_1643: # =>This Inner Loop Header: Depth=1 29438 xor eax, eax 29439 cmp byte ptr [rcx + 
rdx], 0 29440 setne al 29441 mov dword ptr [r8 + 4*rdx], eax 29442 add rdx, 1 29443 add rdi, -1 29444 jne .LBB4_1643 29445 .LBB4_1644: 29446 cmp rsi, 3 29447 jb .LBB4_1655 29448 .LBB4_1645: # =>This Inner Loop Header: Depth=1 29449 xor eax, eax 29450 cmp byte ptr [rcx + rdx], 0 29451 setne al 29452 mov dword ptr [r8 + 4*rdx], eax 29453 xor eax, eax 29454 cmp byte ptr [rcx + rdx + 1], 0 29455 setne al 29456 mov dword ptr [r8 + 4*rdx + 4], eax 29457 xor eax, eax 29458 cmp byte ptr [rcx + rdx + 2], 0 29459 setne al 29460 mov dword ptr [r8 + 4*rdx + 8], eax 29461 xor eax, eax 29462 cmp byte ptr [rcx + rdx + 3], 0 29463 setne al 29464 mov dword ptr [r8 + 4*rdx + 12], eax 29465 add rdx, 4 29466 cmp r10, rdx 29467 jne .LBB4_1645 29468 jmp .LBB4_1655 29469 .LBB4_491: 29470 test r9d, r9d 29471 jle .LBB4_1655 29472 # %bb.492: 29473 mov r11d, r9d 29474 cmp r9d, 8 29475 jb .LBB4_493 29476 # %bb.977: 29477 lea rdx, [rcx + 4*r11] 29478 cmp rdx, r8 29479 jbe .LBB4_979 29480 # %bb.978: 29481 lea rdx, [r8 + 4*r11] 29482 cmp rdx, rcx 29483 jbe .LBB4_979 29484 .LBB4_493: 29485 xor edx, edx 29486 .LBB4_1650: 29487 mov rsi, rdx 29488 not rsi 29489 test r11b, 1 29490 je .LBB4_1652 29491 # %bb.1651: 29492 mov r9d, dword ptr [rcx + 4*rdx] 29493 xor r10d, r10d 29494 test r9d, r9d 29495 setne r10b 29496 neg r10d 29497 test r9d, r9d 29498 mov edi, 1 29499 cmovle edi, r10d 29500 mov dword ptr [r8 + 4*rdx], edi 29501 or rdx, 1 29502 .LBB4_1652: 29503 add rsi, r11 29504 je .LBB4_1655 29505 # %bb.1653: 29506 mov esi, 1 29507 .LBB4_1654: # =>This Inner Loop Header: Depth=1 29508 mov edi, dword ptr [rcx + 4*rdx] 29509 xor eax, eax 29510 test edi, edi 29511 setne al 29512 neg eax 29513 test edi, edi 29514 cmovg eax, esi 29515 mov dword ptr [r8 + 4*rdx], eax 29516 mov eax, dword ptr [rcx + 4*rdx + 4] 29517 xor edi, edi 29518 test eax, eax 29519 setne dil 29520 neg edi 29521 test eax, eax 29522 cmovg edi, esi 29523 mov dword ptr [r8 + 4*rdx + 4], edi 29524 add rdx, 2 29525 cmp r11, rdx 29526 jne 
.LBB4_1654 29527 jmp .LBB4_1655 29528 .LBB4_1524: 29529 cmp rsi, 3 29530 jb .LBB4_1655 29531 # %bb.1525: 29532 movd xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero 29533 jmp .LBB4_1527 29534 .LBB4_1526: # in Loop: Header=BB4_1527 Depth=1 29535 movd dword ptr [r8 + 4*rdx + 12], xmm1 29536 add rdx, 4 29537 cmp rax, rdx 29538 je .LBB4_1655 29539 .LBB4_1527: # =>This Inner Loop Header: Depth=1 29540 cmp byte ptr [rcx + rdx], 0 29541 movdqa xmm1, xmm0 29542 jne .LBB4_1528 29543 # %bb.1531: # in Loop: Header=BB4_1527 Depth=1 29544 pxor xmm1, xmm1 29545 movd dword ptr [r8 + 4*rdx], xmm1 29546 cmp byte ptr [rcx + rdx + 1], 0 29547 movdqa xmm1, xmm0 29548 je .LBB4_1532 29549 .LBB4_1529: # in Loop: Header=BB4_1527 Depth=1 29550 movd dword ptr [r8 + 4*rdx + 4], xmm1 29551 cmp byte ptr [rcx + rdx + 2], 0 29552 movdqa xmm1, xmm0 29553 jne .LBB4_1530 29554 .LBB4_1533: # in Loop: Header=BB4_1527 Depth=1 29555 pxor xmm1, xmm1 29556 movd dword ptr [r8 + 4*rdx + 8], xmm1 29557 cmp byte ptr [rcx + rdx + 3], 0 29558 movdqa xmm1, xmm0 29559 jne .LBB4_1526 29560 jmp .LBB4_1534 29561 .LBB4_1528: # in Loop: Header=BB4_1527 Depth=1 29562 movd dword ptr [r8 + 4*rdx], xmm1 29563 cmp byte ptr [rcx + rdx + 1], 0 29564 movdqa xmm1, xmm0 29565 jne .LBB4_1529 29566 .LBB4_1532: # in Loop: Header=BB4_1527 Depth=1 29567 pxor xmm1, xmm1 29568 movd dword ptr [r8 + 4*rdx + 4], xmm1 29569 cmp byte ptr [rcx + rdx + 2], 0 29570 movdqa xmm1, xmm0 29571 je .LBB4_1533 29572 .LBB4_1530: # in Loop: Header=BB4_1527 Depth=1 29573 movd dword ptr [r8 + 4*rdx + 8], xmm1 29574 cmp byte ptr [rcx + rdx + 3], 0 29575 movdqa xmm1, xmm0 29576 jne .LBB4_1526 29577 .LBB4_1534: # in Loop: Header=BB4_1527 Depth=1 29578 pxor xmm1, xmm1 29579 jmp .LBB4_1526 29580 .LBB4_499: 29581 mov esi, r11d 29582 and esi, -4 29583 lea rdx, [rsi - 4] 29584 mov r9, rdx 29585 shr r9, 2 29586 add r9, 1 29587 test rdx, rdx 29588 je .LBB4_1106 29589 # %bb.500: 29590 mov rdx, r9 29591 and rdx, -2 29592 neg rdx 29593 xor edi, edi 
29594 xorpd xmm0, xmm0 29595 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 29596 movapd xmm2, xmmword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0] 29597 .LBB4_501: # =>This Inner Loop Header: Depth=1 29598 movupd xmm5, xmmword ptr [rcx + 8*rdi] 29599 movupd xmm6, xmmword ptr [rcx + 8*rdi + 16] 29600 movapd xmm3, xmm5 29601 cmpeqpd xmm3, xmm0 29602 shufps xmm3, xmm3, 232 # xmm3 = xmm3[0,2,2,3] 29603 movapd xmm4, xmm6 29604 cmpeqpd xmm4, xmm0 29605 andpd xmm5, xmm1 29606 orpd xmm5, xmm2 29607 andpd xmm6, xmm1 29608 orpd xmm6, xmm2 29609 pshufd xmm7, xmm5, 238 # xmm7 = xmm5[2,3,2,3] 29610 cvttsd2si rax, xmm7 29611 cvttsd2si rbx, xmm5 29612 movd xmm5, ebx 29613 pinsrd xmm5, eax, 1 29614 pshufd xmm7, xmm6, 238 # xmm7 = xmm6[2,3,2,3] 29615 cvttsd2si rax, xmm7 29616 cvttsd2si rbx, xmm6 29617 shufps xmm4, xmm4, 232 # xmm4 = xmm4[0,2,2,3] 29618 movd xmm6, ebx 29619 pinsrd xmm6, eax, 1 29620 andnps xmm3, xmm5 29621 andnps xmm4, xmm6 29622 movlhps xmm3, xmm4 # xmm3 = xmm3[0],xmm4[0] 29623 movups xmmword ptr [r8 + 4*rdi], xmm3 29624 movupd xmm5, xmmword ptr [rcx + 8*rdi + 32] 29625 movupd xmm6, xmmword ptr [rcx + 8*rdi + 48] 29626 movapd xmm3, xmm5 29627 cmpeqpd xmm3, xmm0 29628 shufps xmm3, xmm3, 232 # xmm3 = xmm3[0,2,2,3] 29629 movapd xmm4, xmm6 29630 cmpeqpd xmm4, xmm0 29631 shufps xmm4, xmm4, 232 # xmm4 = xmm4[0,2,2,3] 29632 andpd xmm5, xmm1 29633 orpd xmm5, xmm2 29634 andpd xmm6, xmm1 29635 pshufd xmm7, xmm5, 238 # xmm7 = xmm5[2,3,2,3] 29636 cvttsd2si rax, xmm7 29637 orpd xmm6, xmm2 29638 cvttsd2si rbx, xmm5 29639 movd xmm5, ebx 29640 pinsrd xmm5, eax, 1 29641 andnps xmm3, xmm5 29642 pshufd xmm5, xmm6, 238 # xmm5 = xmm6[2,3,2,3] 29643 cvttsd2si rax, xmm5 29644 cvttsd2si rbx, xmm6 29645 movd xmm5, ebx 29646 pinsrd xmm5, eax, 1 29647 andnps xmm4, xmm5 29648 movlhps xmm3, xmm4 # xmm3 = xmm3[0],xmm4[0] 29649 movups xmmword ptr [r8 + 4*rdi + 16], xmm3 29650 add rdi, 8 29651 add rdx, 2 29652 jne .LBB4_501 29653 jmp .LBB4_1107 29654 .LBB4_507: 29655 mov 
edx, eax 29656 and edx, -4 29657 lea rsi, [rdx - 4] 29658 mov r9, rsi 29659 shr r9, 2 29660 add r9, 1 29661 test rsi, rsi 29662 je .LBB4_994 29663 # %bb.508: 29664 mov rdi, r9 29665 and rdi, -2 29666 neg rdi 29667 xor esi, esi 29668 pxor xmm0, xmm0 29669 movdqa xmm1, xmmword ptr [rip + .LCPI4_16] # xmm1 = <1,1,u,u> 29670 .LBB4_509: # =>This Inner Loop Header: Depth=1 29671 movdqu xmm2, xmmword ptr [rcx + 8*rsi] 29672 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 16] 29673 pcmpeqq xmm2, xmm0 29674 pshufd xmm2, xmm2, 232 # xmm2 = xmm2[0,2,2,3] 29675 pandn xmm2, xmm1 29676 pcmpeqq xmm3, xmm0 29677 pshufd xmm3, xmm3, 232 # xmm3 = xmm3[0,2,2,3] 29678 pandn xmm3, xmm1 29679 punpcklqdq xmm2, xmm3 # xmm2 = xmm2[0],xmm3[0] 29680 movdqu xmmword ptr [r8 + 4*rsi], xmm2 29681 movdqu xmm2, xmmword ptr [rcx + 8*rsi + 32] 29682 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 48] 29683 pcmpeqq xmm2, xmm0 29684 pshufd xmm2, xmm2, 232 # xmm2 = xmm2[0,2,2,3] 29685 pandn xmm2, xmm1 29686 pcmpeqq xmm3, xmm0 29687 pshufd xmm3, xmm3, 232 # xmm3 = xmm3[0,2,2,3] 29688 pandn xmm3, xmm1 29689 punpcklqdq xmm2, xmm3 # xmm2 = xmm2[0],xmm3[0] 29690 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm2 29691 add rsi, 8 29692 add rdi, 2 29693 jne .LBB4_509 29694 jmp .LBB4_995 29695 .LBB4_510: 29696 mov edx, eax 29697 and edx, -8 29698 lea rsi, [rdx - 8] 29699 mov r9, rsi 29700 shr r9, 3 29701 add r9, 1 29702 test rsi, rsi 29703 je .LBB4_1112 29704 # %bb.511: 29705 mov rdi, r9 29706 and rdi, -2 29707 neg rdi 29708 xor esi, esi 29709 pxor xmm0, xmm0 29710 pcmpeqd xmm1, xmm1 29711 movdqa xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1] 29712 .LBB4_512: # =>This Inner Loop Header: Depth=1 29713 movq xmm3, qword ptr [rcx + 2*rsi] # xmm3 = mem[0],zero 29714 movq xmm4, qword ptr [rcx + 2*rsi + 8] # xmm4 = mem[0],zero 29715 pcmpeqw xmm3, xmm0 29716 pxor xmm3, xmm1 29717 pmovzxwd xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 29718 pand xmm3, xmm2 29719 pcmpeqw xmm4, xmm0 29720 pxor xmm4, xmm1 29721 
pmovzxwd xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 29722 pand xmm4, xmm2 29723 movdqu xmmword ptr [r8 + 4*rsi], xmm3 29724 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm4 29725 movq xmm3, qword ptr [rcx + 2*rsi + 16] # xmm3 = mem[0],zero 29726 movq xmm4, qword ptr [rcx + 2*rsi + 24] # xmm4 = mem[0],zero 29727 pcmpeqw xmm3, xmm0 29728 pxor xmm3, xmm1 29729 pmovzxwd xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 29730 pand xmm3, xmm2 29731 pcmpeqw xmm4, xmm0 29732 pxor xmm4, xmm1 29733 pmovzxwd xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 29734 pand xmm4, xmm2 29735 movdqu xmmword ptr [r8 + 4*rsi + 32], xmm3 29736 movdqu xmmword ptr [r8 + 4*rsi + 48], xmm4 29737 add rsi, 16 29738 add rdi, 2 29739 jne .LBB4_512 29740 jmp .LBB4_1113 29741 .LBB4_513: 29742 mov edx, r10d 29743 and edx, -8 29744 lea rsi, [rdx - 8] 29745 mov r9, rsi 29746 shr r9, 3 29747 add r9, 1 29748 test rsi, rsi 29749 je .LBB4_1117 29750 # %bb.514: 29751 mov rdi, r9 29752 and rdi, -2 29753 neg rdi 29754 xor esi, esi 29755 pxor xmm2, xmm2 29756 pcmpeqd xmm3, xmm3 29757 movaps xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 29758 .LBB4_515: # =>This Inner Loop Header: Depth=1 29759 movq xmm5, qword ptr [rcx + 2*rsi] # xmm5 = mem[0],zero 29760 movq xmm6, qword ptr [rcx + 2*rsi + 8] # xmm6 = mem[0],zero 29761 movdqa xmm0, xmm5 29762 pcmpgtw xmm0, xmm2 29763 pmovsxwd xmm0, xmm0 29764 movdqa xmm1, xmm6 29765 pcmpgtw xmm1, xmm2 29766 pmovsxwd xmm1, xmm1 29767 pcmpeqw xmm5, xmm2 29768 pxor xmm5, xmm3 29769 pmovsxwd xmm5, xmm5 29770 pcmpeqw xmm6, xmm2 29771 pxor xmm6, xmm3 29772 pmovsxwd xmm6, xmm6 29773 blendvps xmm5, xmm4, xmm0 29774 movdqa xmm0, xmm1 29775 blendvps xmm6, xmm4, xmm0 29776 movups xmmword ptr [r8 + 4*rsi], xmm5 29777 movups xmmword ptr [r8 + 4*rsi + 16], xmm6 29778 movq xmm5, qword ptr [rcx + 2*rsi + 16] # xmm5 = mem[0],zero 29779 movq xmm6, qword ptr [rcx + 2*rsi + 24] # xmm6 = mem[0],zero 29780 movdqa xmm0, 
xmm5 29781 pcmpgtw xmm0, xmm2 29782 pmovsxwd xmm0, xmm0 29783 movdqa xmm1, xmm6 29784 pcmpgtw xmm1, xmm2 29785 pmovsxwd xmm1, xmm1 29786 pcmpeqw xmm5, xmm2 29787 pxor xmm5, xmm3 29788 pmovsxwd xmm5, xmm5 29789 pcmpeqw xmm6, xmm2 29790 pxor xmm6, xmm3 29791 pmovsxwd xmm6, xmm6 29792 blendvps xmm5, xmm4, xmm0 29793 movdqa xmm0, xmm1 29794 blendvps xmm6, xmm4, xmm0 29795 movups xmmword ptr [r8 + 4*rsi + 32], xmm5 29796 movups xmmword ptr [r8 + 4*rsi + 48], xmm6 29797 add rsi, 16 29798 add rdi, 2 29799 jne .LBB4_515 29800 jmp .LBB4_1118 29801 .LBB4_516: 29802 mov edx, r10d 29803 and edx, -4 29804 lea rsi, [rdx - 4] 29805 mov r9, rsi 29806 shr r9, 2 29807 add r9, 1 29808 test rsi, rsi 29809 je .LBB4_1123 29810 # %bb.517: 29811 mov rdi, r9 29812 and rdi, -2 29813 neg rdi 29814 xor esi, esi 29815 pxor xmm2, xmm2 29816 pcmpeqd xmm3, xmm3 29817 movaps xmm4, xmmword ptr [rip + .LCPI4_16] # xmm4 = <1,1,u,u> 29818 .LBB4_518: # =>This Inner Loop Header: Depth=1 29819 movdqu xmm5, xmmword ptr [rcx + 8*rsi] 29820 movdqu xmm6, xmmword ptr [rcx + 8*rsi + 16] 29821 movdqa xmm0, xmm5 29822 pcmpgtq xmm0, xmm2 29823 pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] 29824 movdqa xmm1, xmm6 29825 pcmpgtq xmm1, xmm2 29826 pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] 29827 pcmpeqq xmm5, xmm2 29828 pshufd xmm5, xmm5, 232 # xmm5 = xmm5[0,2,2,3] 29829 pxor xmm5, xmm3 29830 pcmpeqq xmm6, xmm2 29831 pshufd xmm6, xmm6, 232 # xmm6 = xmm6[0,2,2,3] 29832 pxor xmm6, xmm3 29833 blendvps xmm5, xmm4, xmm0 29834 movdqa xmm0, xmm1 29835 blendvps xmm6, xmm4, xmm0 29836 movlhps xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 29837 movups xmmword ptr [r8 + 4*rsi], xmm5 29838 movdqu xmm5, xmmword ptr [rcx + 8*rsi + 32] 29839 movdqu xmm6, xmmword ptr [rcx + 8*rsi + 48] 29840 movdqa xmm0, xmm5 29841 pcmpgtq xmm0, xmm2 29842 pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] 29843 movdqa xmm1, xmm6 29844 pcmpgtq xmm1, xmm2 29845 pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] 29846 pcmpeqq xmm5, xmm2 29847 pshufd xmm5, xmm5, 232 # 
xmm5 = xmm5[0,2,2,3] 29848 pxor xmm5, xmm3 29849 pcmpeqq xmm6, xmm2 29850 pshufd xmm6, xmm6, 232 # xmm6 = xmm6[0,2,2,3] 29851 pxor xmm6, xmm3 29852 blendvps xmm5, xmm4, xmm0 29853 movdqa xmm0, xmm1 29854 blendvps xmm6, xmm4, xmm0 29855 movlhps xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 29856 movups xmmword ptr [r8 + 4*rsi + 16], xmm5 29857 add rsi, 8 29858 add rdi, 2 29859 jne .LBB4_518 29860 jmp .LBB4_1124 29861 .LBB4_519: 29862 mov edx, eax 29863 and edx, -4 29864 lea rsi, [rdx - 4] 29865 mov r9, rsi 29866 shr r9, 2 29867 add r9, 1 29868 test rsi, rsi 29869 je .LBB4_1129 29870 # %bb.520: 29871 mov rdi, r9 29872 and rdi, -2 29873 neg rdi 29874 xor esi, esi 29875 xorps xmm1, xmm1 29876 movdqa xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1] 29877 movaps xmm3, xmmword ptr [rip + .LCPI4_10] # xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] 29878 movaps xmm4, xmmword ptr [rip + .LCPI4_4] # xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 29879 .LBB4_521: # =>This Inner Loop Header: Depth=1 29880 movdqu xmm5, xmmword ptr [rcx + 4*rsi] 29881 movdqa xmm0, xmm5 29882 psrad xmm0, 31 29883 por xmm0, xmm2 29884 cvtdq2ps xmm6, xmm0 29885 movaps xmm0, xmm6 29886 cmpltps xmm0, xmm3 29887 cvttps2dq xmm7, xmm6 29888 subps xmm6, xmm3 29889 cvttps2dq xmm6, xmm6 29890 xorps xmm6, xmm4 29891 blendvps xmm6, xmm7, xmm0 29892 cmpneqps xmm5, xmm1 29893 andps xmm5, xmm6 29894 movups xmmword ptr [r8 + 4*rsi], xmm5 29895 movdqu xmm5, xmmword ptr [rcx + 4*rsi + 16] 29896 movdqa xmm0, xmm5 29897 psrad xmm0, 31 29898 por xmm0, xmm2 29899 cvtdq2ps xmm6, xmm0 29900 movaps xmm0, xmm6 29901 cmpltps xmm0, xmm3 29902 cvttps2dq xmm7, xmm6 29903 subps xmm6, xmm3 29904 cvttps2dq xmm6, xmm6 29905 xorps xmm6, xmm4 29906 blendvps xmm6, xmm7, xmm0 29907 cmpneqps xmm5, xmm1 29908 andps xmm5, xmm6 29909 movups xmmword ptr [r8 + 4*rsi + 16], xmm5 29910 add rsi, 8 29911 add rdi, 2 29912 jne .LBB4_521 29913 jmp .LBB4_1130 29914 .LBB4_532: 29915 and edx, -4 29916 xor esi, esi 29917 movsd xmm0, qword ptr 
[rip + .LCPI4_2] # xmm0 = mem[0],zero 29918 jmp .LBB4_534 29919 .LBB4_533: # in Loop: Header=BB4_534 Depth=1 29920 movsd qword ptr [r8 + 8*rsi + 24], xmm1 29921 add rsi, 4 29922 cmp rdx, rsi 29923 je .LBB4_101 29924 .LBB4_534: # =>This Inner Loop Header: Depth=1 29925 cmp dword ptr [rcx + 4*rsi], 0 29926 movapd xmm1, xmm0 29927 jne .LBB4_535 29928 # %bb.538: # in Loop: Header=BB4_534 Depth=1 29929 xorpd xmm1, xmm1 29930 movsd qword ptr [r8 + 8*rsi], xmm1 29931 cmp dword ptr [rcx + 4*rsi + 4], 0 29932 movapd xmm1, xmm0 29933 je .LBB4_539 29934 .LBB4_536: # in Loop: Header=BB4_534 Depth=1 29935 movsd qword ptr [r8 + 8*rsi + 8], xmm1 29936 cmp dword ptr [rcx + 4*rsi + 8], 0 29937 movapd xmm1, xmm0 29938 jne .LBB4_537 29939 .LBB4_540: # in Loop: Header=BB4_534 Depth=1 29940 xorpd xmm1, xmm1 29941 movsd qword ptr [r8 + 8*rsi + 16], xmm1 29942 cmp dword ptr [rcx + 4*rsi + 12], 0 29943 movapd xmm1, xmm0 29944 jne .LBB4_533 29945 jmp .LBB4_541 29946 .LBB4_535: # in Loop: Header=BB4_534 Depth=1 29947 movsd qword ptr [r8 + 8*rsi], xmm1 29948 cmp dword ptr [rcx + 4*rsi + 4], 0 29949 movapd xmm1, xmm0 29950 jne .LBB4_536 29951 .LBB4_539: # in Loop: Header=BB4_534 Depth=1 29952 xorpd xmm1, xmm1 29953 movsd qword ptr [r8 + 8*rsi + 8], xmm1 29954 cmp dword ptr [rcx + 4*rsi + 8], 0 29955 movapd xmm1, xmm0 29956 je .LBB4_540 29957 .LBB4_537: # in Loop: Header=BB4_534 Depth=1 29958 movsd qword ptr [r8 + 8*rsi + 16], xmm1 29959 cmp dword ptr [rcx + 4*rsi + 12], 0 29960 movapd xmm1, xmm0 29961 jne .LBB4_533 29962 .LBB4_541: # in Loop: Header=BB4_534 Depth=1 29963 xorpd xmm1, xmm1 29964 jmp .LBB4_533 29965 .LBB4_547: 29966 mov esi, edx 29967 and esi, -2 29968 xor eax, eax 29969 movsd xmm0, qword ptr [rip + .LCPI4_13] # xmm0 = mem[0],zero 29970 movsd xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero 29971 jmp .LBB4_549 29972 .LBB4_548: # in Loop: Header=BB4_549 Depth=1 29973 movsd qword ptr [r8 + 8*rax + 8], xmm3 29974 add rax, 2 29975 cmp rsi, rax 29976 je .LBB4_120 29977 
.LBB4_549: # =>This Inner Loop Header: Depth=1 29978 cmp byte ptr [rcx + rax], 0 29979 movapd xmm2, xmm0 29980 jne .LBB4_550 29981 # %bb.553: # in Loop: Header=BB4_549 Depth=1 29982 xorpd xmm2, xmm2 29983 movapd xmm3, xmm1 29984 jle .LBB4_554 29985 .LBB4_551: # in Loop: Header=BB4_549 Depth=1 29986 movsd qword ptr [r8 + 8*rax], xmm3 29987 cmp byte ptr [rcx + rax + 1], 0 29988 movapd xmm2, xmm0 29989 jne .LBB4_552 29990 .LBB4_555: # in Loop: Header=BB4_549 Depth=1 29991 xorpd xmm2, xmm2 29992 movapd xmm3, xmm1 29993 jg .LBB4_548 29994 jmp .LBB4_556 29995 .LBB4_550: # in Loop: Header=BB4_549 Depth=1 29996 movapd xmm3, xmm1 29997 jg .LBB4_551 29998 .LBB4_554: # in Loop: Header=BB4_549 Depth=1 29999 movapd xmm3, xmm2 30000 movsd qword ptr [r8 + 8*rax], xmm3 30001 cmp byte ptr [rcx + rax + 1], 0 30002 movapd xmm2, xmm0 30003 je .LBB4_555 30004 .LBB4_552: # in Loop: Header=BB4_549 Depth=1 30005 movapd xmm3, xmm1 30006 jg .LBB4_548 30007 .LBB4_556: # in Loop: Header=BB4_549 Depth=1 30008 movapd xmm3, xmm2 30009 jmp .LBB4_548 30010 .LBB4_557: 30011 and edx, -4 30012 xor esi, esi 30013 movsd xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero 30014 jmp .LBB4_559 30015 .LBB4_558: # in Loop: Header=BB4_559 Depth=1 30016 movsd qword ptr [r8 + 8*rsi + 24], xmm1 30017 add rsi, 4 30018 cmp rdx, rsi 30019 je .LBB4_130 30020 .LBB4_559: # =>This Inner Loop Header: Depth=1 30021 cmp qword ptr [rcx + 8*rsi], 0 30022 movapd xmm1, xmm0 30023 jne .LBB4_560 30024 # %bb.563: # in Loop: Header=BB4_559 Depth=1 30025 xorpd xmm1, xmm1 30026 movsd qword ptr [r8 + 8*rsi], xmm1 30027 cmp qword ptr [rcx + 8*rsi + 8], 0 30028 movapd xmm1, xmm0 30029 je .LBB4_564 30030 .LBB4_561: # in Loop: Header=BB4_559 Depth=1 30031 movsd qword ptr [r8 + 8*rsi + 8], xmm1 30032 cmp qword ptr [rcx + 8*rsi + 16], 0 30033 movapd xmm1, xmm0 30034 jne .LBB4_562 30035 .LBB4_565: # in Loop: Header=BB4_559 Depth=1 30036 xorpd xmm1, xmm1 30037 movsd qword ptr [r8 + 8*rsi + 16], xmm1 30038 cmp qword ptr [rcx + 8*rsi + 
24], 0 30039 movapd xmm1, xmm0 30040 jne .LBB4_558 30041 jmp .LBB4_566 30042 .LBB4_560: # in Loop: Header=BB4_559 Depth=1 30043 movsd qword ptr [r8 + 8*rsi], xmm1 30044 cmp qword ptr [rcx + 8*rsi + 8], 0 30045 movapd xmm1, xmm0 30046 jne .LBB4_561 30047 .LBB4_564: # in Loop: Header=BB4_559 Depth=1 30048 xorpd xmm1, xmm1 30049 movsd qword ptr [r8 + 8*rsi + 8], xmm1 30050 cmp qword ptr [rcx + 8*rsi + 16], 0 30051 movapd xmm1, xmm0 30052 je .LBB4_565 30053 .LBB4_562: # in Loop: Header=BB4_559 Depth=1 30054 movsd qword ptr [r8 + 8*rsi + 16], xmm1 30055 cmp qword ptr [rcx + 8*rsi + 24], 0 30056 movapd xmm1, xmm0 30057 jne .LBB4_558 30058 .LBB4_566: # in Loop: Header=BB4_559 Depth=1 30059 xorpd xmm1, xmm1 30060 jmp .LBB4_558 30061 .LBB4_567: 30062 and edx, -4 30063 xor esi, esi 30064 movsd xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero 30065 jmp .LBB4_569 30066 .LBB4_568: # in Loop: Header=BB4_569 Depth=1 30067 movsd qword ptr [r8 + 8*rsi + 24], xmm1 30068 add rsi, 4 30069 cmp rdx, rsi 30070 je .LBB4_142 30071 .LBB4_569: # =>This Inner Loop Header: Depth=1 30072 cmp word ptr [rcx + 2*rsi], 0 30073 movapd xmm1, xmm0 30074 jne .LBB4_570 30075 # %bb.573: # in Loop: Header=BB4_569 Depth=1 30076 xorpd xmm1, xmm1 30077 movsd qword ptr [r8 + 8*rsi], xmm1 30078 cmp word ptr [rcx + 2*rsi + 2], 0 30079 movapd xmm1, xmm0 30080 je .LBB4_574 30081 .LBB4_571: # in Loop: Header=BB4_569 Depth=1 30082 movsd qword ptr [r8 + 8*rsi + 8], xmm1 30083 cmp word ptr [rcx + 2*rsi + 4], 0 30084 movapd xmm1, xmm0 30085 jne .LBB4_572 30086 .LBB4_575: # in Loop: Header=BB4_569 Depth=1 30087 xorpd xmm1, xmm1 30088 movsd qword ptr [r8 + 8*rsi + 16], xmm1 30089 cmp word ptr [rcx + 2*rsi + 6], 0 30090 movapd xmm1, xmm0 30091 jne .LBB4_568 30092 jmp .LBB4_576 30093 .LBB4_570: # in Loop: Header=BB4_569 Depth=1 30094 movsd qword ptr [r8 + 8*rsi], xmm1 30095 cmp word ptr [rcx + 2*rsi + 2], 0 30096 movapd xmm1, xmm0 30097 jne .LBB4_571 30098 .LBB4_574: # in Loop: Header=BB4_569 Depth=1 30099 xorpd 
xmm1, xmm1 30100 movsd qword ptr [r8 + 8*rsi + 8], xmm1 30101 cmp word ptr [rcx + 2*rsi + 4], 0 30102 movapd xmm1, xmm0 30103 je .LBB4_575 30104 .LBB4_572: # in Loop: Header=BB4_569 Depth=1 30105 movsd qword ptr [r8 + 8*rsi + 16], xmm1 30106 cmp word ptr [rcx + 2*rsi + 6], 0 30107 movapd xmm1, xmm0 30108 jne .LBB4_568 30109 .LBB4_576: # in Loop: Header=BB4_569 Depth=1 30110 xorpd xmm1, xmm1 30111 jmp .LBB4_568 30112 .LBB4_577: 30113 mov esi, edx 30114 and esi, -2 30115 xor eax, eax 30116 movsd xmm0, qword ptr [rip + .LCPI4_13] # xmm0 = mem[0],zero 30117 movsd xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero 30118 jmp .LBB4_579 30119 .LBB4_578: # in Loop: Header=BB4_579 Depth=1 30120 movsd qword ptr [r8 + 8*rax + 8], xmm3 30121 add rax, 2 30122 cmp rsi, rax 30123 je .LBB4_154 30124 .LBB4_579: # =>This Inner Loop Header: Depth=1 30125 cmp word ptr [rcx + 2*rax], 0 30126 movapd xmm2, xmm0 30127 jne .LBB4_580 30128 # %bb.583: # in Loop: Header=BB4_579 Depth=1 30129 xorpd xmm2, xmm2 30130 movapd xmm3, xmm1 30131 jle .LBB4_584 30132 .LBB4_581: # in Loop: Header=BB4_579 Depth=1 30133 movsd qword ptr [r8 + 8*rax], xmm3 30134 cmp word ptr [rcx + 2*rax + 2], 0 30135 movapd xmm2, xmm0 30136 jne .LBB4_582 30137 .LBB4_585: # in Loop: Header=BB4_579 Depth=1 30138 xorpd xmm2, xmm2 30139 movapd xmm3, xmm1 30140 jg .LBB4_578 30141 jmp .LBB4_586 30142 .LBB4_580: # in Loop: Header=BB4_579 Depth=1 30143 movapd xmm3, xmm1 30144 jg .LBB4_581 30145 .LBB4_584: # in Loop: Header=BB4_579 Depth=1 30146 movapd xmm3, xmm2 30147 movsd qword ptr [r8 + 8*rax], xmm3 30148 cmp word ptr [rcx + 2*rax + 2], 0 30149 movapd xmm2, xmm0 30150 je .LBB4_585 30151 .LBB4_582: # in Loop: Header=BB4_579 Depth=1 30152 movapd xmm3, xmm1 30153 jg .LBB4_578 30154 .LBB4_586: # in Loop: Header=BB4_579 Depth=1 30155 movapd xmm3, xmm2 30156 jmp .LBB4_578 30157 .LBB4_587: 30158 mov esi, edx 30159 and esi, -2 30160 xor eax, eax 30161 movsd xmm0, qword ptr [rip + .LCPI4_13] # xmm0 = mem[0],zero 30162 movsd xmm1, 
qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero 30163 jmp .LBB4_589 30164 .LBB4_588: # in Loop: Header=BB4_589 Depth=1 30165 movsd qword ptr [r8 + 8*rax + 8], xmm3 30166 add rax, 2 30167 cmp rsi, rax 30168 je .LBB4_164 30169 .LBB4_589: # =>This Inner Loop Header: Depth=1 30170 cmp qword ptr [rcx + 8*rax], 0 30171 movapd xmm2, xmm0 30172 jne .LBB4_590 30173 # %bb.593: # in Loop: Header=BB4_589 Depth=1 30174 xorpd xmm2, xmm2 30175 movapd xmm3, xmm1 30176 jle .LBB4_594 30177 .LBB4_591: # in Loop: Header=BB4_589 Depth=1 30178 movsd qword ptr [r8 + 8*rax], xmm3 30179 cmp qword ptr [rcx + 8*rax + 8], 0 30180 movapd xmm2, xmm0 30181 jne .LBB4_592 30182 .LBB4_595: # in Loop: Header=BB4_589 Depth=1 30183 xorpd xmm2, xmm2 30184 movapd xmm3, xmm1 30185 jg .LBB4_588 30186 jmp .LBB4_596 30187 .LBB4_590: # in Loop: Header=BB4_589 Depth=1 30188 movapd xmm3, xmm1 30189 jg .LBB4_591 30190 .LBB4_594: # in Loop: Header=BB4_589 Depth=1 30191 movapd xmm3, xmm2 30192 movsd qword ptr [r8 + 8*rax], xmm3 30193 cmp qword ptr [rcx + 8*rax + 8], 0 30194 movapd xmm2, xmm0 30195 je .LBB4_595 30196 .LBB4_592: # in Loop: Header=BB4_589 Depth=1 30197 movapd xmm3, xmm1 30198 jg .LBB4_588 30199 .LBB4_596: # in Loop: Header=BB4_589 Depth=1 30200 movapd xmm3, xmm2 30201 jmp .LBB4_588 30202 .LBB4_597: 30203 mov esi, edx 30204 and esi, -2 30205 xor eax, eax 30206 xorps xmm0, xmm0 30207 jmp .LBB4_599 30208 .LBB4_598: # in Loop: Header=BB4_599 Depth=1 30209 movsd qword ptr [r8 + 8*rax + 8], xmm1 30210 add rax, 2 30211 cmp rsi, rax 30212 je .LBB4_174 30213 .LBB4_599: # =>This Inner Loop Header: Depth=1 30214 movss xmm2, dword ptr [rcx + 4*rax] # xmm2 = mem[0],zero,zero,zero 30215 xorpd xmm1, xmm1 30216 ucomiss xmm0, xmm2 30217 xorpd xmm3, xmm3 30218 je .LBB4_601 30219 # %bb.600: # in Loop: Header=BB4_599 Depth=1 30220 movmskps edi, xmm2 30221 and edi, 1 30222 neg edi 30223 or edi, 1 30224 xorps xmm2, xmm2 30225 cvtsi2ss xmm2, edi 30226 xorps xmm3, xmm3 30227 cvtss2sd xmm3, xmm2 30228 .LBB4_601: # in Loop: 
Header=BB4_599 Depth=1 30229 movsd qword ptr [r8 + 8*rax], xmm3 30230 movss xmm2, dword ptr [rcx + 4*rax + 4] # xmm2 = mem[0],zero,zero,zero 30231 ucomiss xmm0, xmm2 30232 je .LBB4_598 30233 # %bb.602: # in Loop: Header=BB4_599 Depth=1 30234 movmskps edi, xmm2 30235 and edi, 1 30236 neg edi 30237 or edi, 1 30238 xorps xmm1, xmm1 30239 cvtsi2ss xmm1, edi 30240 cvtss2sd xmm1, xmm1 30241 jmp .LBB4_598 30242 .LBB4_603: 30243 and edx, -4 30244 xor esi, esi 30245 movsd xmm0, qword ptr [rip + .LCPI4_2] # xmm0 = mem[0],zero 30246 jmp .LBB4_605 30247 .LBB4_604: # in Loop: Header=BB4_605 Depth=1 30248 movsd qword ptr [r8 + 8*rsi + 24], xmm1 30249 add rsi, 4 30250 cmp rdx, rsi 30251 je .LBB4_185 30252 .LBB4_605: # =>This Inner Loop Header: Depth=1 30253 cmp byte ptr [rcx + rsi], 0 30254 movapd xmm1, xmm0 30255 jne .LBB4_606 30256 # %bb.609: # in Loop: Header=BB4_605 Depth=1 30257 xorpd xmm1, xmm1 30258 movsd qword ptr [r8 + 8*rsi], xmm1 30259 cmp byte ptr [rcx + rsi + 1], 0 30260 movapd xmm1, xmm0 30261 je .LBB4_610 30262 .LBB4_607: # in Loop: Header=BB4_605 Depth=1 30263 movsd qword ptr [r8 + 8*rsi + 8], xmm1 30264 cmp byte ptr [rcx + rsi + 2], 0 30265 movapd xmm1, xmm0 30266 jne .LBB4_608 30267 .LBB4_611: # in Loop: Header=BB4_605 Depth=1 30268 xorpd xmm1, xmm1 30269 movsd qword ptr [r8 + 8*rsi + 16], xmm1 30270 cmp byte ptr [rcx + rsi + 3], 0 30271 movapd xmm1, xmm0 30272 jne .LBB4_604 30273 jmp .LBB4_612 30274 .LBB4_606: # in Loop: Header=BB4_605 Depth=1 30275 movsd qword ptr [r8 + 8*rsi], xmm1 30276 cmp byte ptr [rcx + rsi + 1], 0 30277 movapd xmm1, xmm0 30278 jne .LBB4_607 30279 .LBB4_610: # in Loop: Header=BB4_605 Depth=1 30280 xorpd xmm1, xmm1 30281 movsd qword ptr [r8 + 8*rsi + 8], xmm1 30282 cmp byte ptr [rcx + rsi + 2], 0 30283 movapd xmm1, xmm0 30284 je .LBB4_611 30285 .LBB4_608: # in Loop: Header=BB4_605 Depth=1 30286 movsd qword ptr [r8 + 8*rsi + 16], xmm1 30287 cmp byte ptr [rcx + rsi + 3], 0 30288 movapd xmm1, xmm0 30289 jne .LBB4_604 30290 .LBB4_612: # in 
Loop: Header=BB4_605 Depth=1 30291 xorpd xmm1, xmm1 30292 jmp .LBB4_604 30293 .LBB4_613: 30294 mov esi, edx 30295 and esi, -2 30296 xor eax, eax 30297 movsd xmm0, qword ptr [rip + .LCPI4_13] # xmm0 = mem[0],zero 30298 movsd xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero 30299 jmp .LBB4_615 30300 .LBB4_614: # in Loop: Header=BB4_615 Depth=1 30301 movsd qword ptr [r8 + 8*rax + 8], xmm3 30302 add rax, 2 30303 cmp rsi, rax 30304 je .LBB4_197 30305 .LBB4_615: # =>This Inner Loop Header: Depth=1 30306 cmp dword ptr [rcx + 4*rax], 0 30307 movapd xmm2, xmm0 30308 jne .LBB4_616 30309 # %bb.619: # in Loop: Header=BB4_615 Depth=1 30310 xorpd xmm2, xmm2 30311 movapd xmm3, xmm1 30312 jle .LBB4_620 30313 .LBB4_617: # in Loop: Header=BB4_615 Depth=1 30314 movsd qword ptr [r8 + 8*rax], xmm3 30315 cmp dword ptr [rcx + 4*rax + 4], 0 30316 movapd xmm2, xmm0 30317 jne .LBB4_618 30318 .LBB4_621: # in Loop: Header=BB4_615 Depth=1 30319 xorpd xmm2, xmm2 30320 movapd xmm3, xmm1 30321 jg .LBB4_614 30322 jmp .LBB4_622 30323 .LBB4_616: # in Loop: Header=BB4_615 Depth=1 30324 movapd xmm3, xmm1 30325 jg .LBB4_617 30326 .LBB4_620: # in Loop: Header=BB4_615 Depth=1 30327 movapd xmm3, xmm2 30328 movsd qword ptr [r8 + 8*rax], xmm3 30329 cmp dword ptr [rcx + 4*rax + 4], 0 30330 movapd xmm2, xmm0 30331 je .LBB4_621 30332 .LBB4_618: # in Loop: Header=BB4_615 Depth=1 30333 movapd xmm3, xmm1 30334 jg .LBB4_614 30335 .LBB4_622: # in Loop: Header=BB4_615 Depth=1 30336 movapd xmm3, xmm2 30337 jmp .LBB4_614 30338 .LBB4_673: 30339 mov edx, eax 30340 and edx, -4 30341 lea rsi, [rdx - 4] 30342 mov r9, rsi 30343 shr r9, 2 30344 add r9, 1 30345 test rsi, rsi 30346 je .LBB4_999 30347 # %bb.674: 30348 mov rdi, r9 30349 and rdi, -2 30350 neg rdi 30351 xor esi, esi 30352 pxor xmm0, xmm0 30353 pcmpeqd xmm1, xmm1 30354 movdqa xmm2, xmmword ptr [rip + .LCPI4_15] # xmm2 = [1,1] 30355 .LBB4_675: # =>This Inner Loop Header: Depth=1 30356 movq xmm3, qword ptr [rcx + 4*rsi] # xmm3 = mem[0],zero 30357 movq xmm4, 
qword ptr [rcx + 4*rsi + 8] # xmm4 = mem[0],zero 30358 pcmpeqd xmm3, xmm0 30359 pxor xmm3, xmm1 30360 pmovzxdq xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero 30361 pand xmm3, xmm2 30362 pcmpeqd xmm4, xmm0 30363 pxor xmm4, xmm1 30364 pmovzxdq xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero 30365 pand xmm4, xmm2 30366 movdqu xmmword ptr [r8 + 8*rsi], xmm3 30367 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm4 30368 movq xmm3, qword ptr [rcx + 4*rsi + 16] # xmm3 = mem[0],zero 30369 movq xmm4, qword ptr [rcx + 4*rsi + 24] # xmm4 = mem[0],zero 30370 pcmpeqd xmm3, xmm0 30371 pxor xmm3, xmm1 30372 pmovzxdq xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero 30373 pand xmm3, xmm2 30374 pcmpeqd xmm4, xmm0 30375 pxor xmm4, xmm1 30376 pmovzxdq xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero 30377 pand xmm4, xmm2 30378 movdqu xmmword ptr [r8 + 8*rsi + 32], xmm3 30379 movdqu xmmword ptr [r8 + 8*rsi + 48], xmm4 30380 add rsi, 8 30381 add rdi, 2 30382 jne .LBB4_675 30383 jmp .LBB4_1000 30384 .LBB4_676: 30385 mov esi, r10d 30386 and esi, -2 30387 lea rax, [rsi - 2] 30388 mov r9, rax 30389 shr r9 30390 add r9, 1 30391 test rax, rax 30392 je .LBB4_1004 30393 # %bb.677: 30394 mov r14, r9 30395 and r14, -2 30396 neg r14 30397 xor edi, edi 30398 xorpd xmm0, xmm0 30399 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 30400 movapd xmm2, xmmword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0] 30401 movsd xmm3, qword ptr [rip + .LCPI4_6] # xmm3 = mem[0],zero 30402 .LBB4_678: # =>This Inner Loop Header: Depth=1 30403 movupd xmm4, xmmword ptr [rcx + 8*rdi] 30404 movapd xmm5, xmm4 30405 andpd xmm5, xmm1 30406 orpd xmm5, xmm2 30407 movapd xmm6, xmm5 30408 subsd xmm6, xmm3 30409 cvttsd2si rbx, xmm6 30410 xor rbx, r11 30411 cvttsd2si rdx, xmm5 30412 ucomisd xmm5, xmm3 30413 cmovae rdx, rbx 30414 pshufd xmm5, xmm5, 238 # xmm5 = xmm5[2,3,2,3] 30415 movdqa xmm6, xmm5 30416 subsd xmm6, xmm3 30417 cvttsd2si rbx, xmm6 30418 xor rbx, r11 30419 cvttsd2si rax, xmm5 30420 ucomisd xmm5, xmm3 30421 cmovae 
rax, rbx 30422 movq xmm5, rdx 30423 movq xmm6, rax 30424 punpcklqdq xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 30425 cmpneqpd xmm4, xmm0 30426 andpd xmm4, xmm5 30427 movupd xmmword ptr [r8 + 8*rdi], xmm4 30428 movupd xmm4, xmmword ptr [rcx + 8*rdi + 16] 30429 movapd xmm5, xmm4 30430 andpd xmm5, xmm1 30431 orpd xmm5, xmm2 30432 movapd xmm6, xmm5 30433 subsd xmm6, xmm3 30434 cvttsd2si rax, xmm6 30435 xor rax, r11 30436 cvttsd2si rdx, xmm5 30437 ucomisd xmm5, xmm3 30438 cmovae rdx, rax 30439 pshufd xmm5, xmm5, 238 # xmm5 = xmm5[2,3,2,3] 30440 movdqa xmm6, xmm5 30441 subsd xmm6, xmm3 30442 cvttsd2si rax, xmm6 30443 xor rax, r11 30444 cvttsd2si rbx, xmm5 30445 ucomisd xmm5, xmm3 30446 cmovae rbx, rax 30447 movq xmm5, rdx 30448 movq xmm6, rbx 30449 punpcklqdq xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 30450 cmpneqpd xmm4, xmm0 30451 andpd xmm4, xmm5 30452 movupd xmmword ptr [r8 + 8*rdi + 16], xmm4 30453 add rdi, 4 30454 add r14, 2 30455 jne .LBB4_678 30456 jmp .LBB4_1005 30457 .LBB4_689: 30458 mov edx, eax 30459 and edx, -4 30460 lea rsi, [rdx - 4] 30461 mov r9, rsi 30462 shr r9, 2 30463 add r9, 1 30464 test rsi, rsi 30465 je .LBB4_1010 30466 # %bb.690: 30467 mov rdi, r9 30468 and rdi, -2 30469 neg rdi 30470 xor esi, esi 30471 pxor xmm0, xmm0 30472 pcmpeqd xmm1, xmm1 30473 movdqa xmm2, xmmword ptr [rip + .LCPI4_15] # xmm2 = [1,1] 30474 .LBB4_691: # =>This Inner Loop Header: Depth=1 30475 movd xmm3, dword ptr [rcx + 2*rsi] # xmm3 = mem[0],zero,zero,zero 30476 movd xmm4, dword ptr [rcx + 2*rsi + 4] # xmm4 = mem[0],zero,zero,zero 30477 pcmpeqw xmm3, xmm0 30478 pxor xmm3, xmm1 30479 pmovzxwq xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 30480 pand xmm3, xmm2 30481 pcmpeqw xmm4, xmm0 30482 pxor xmm4, xmm1 30483 pmovzxwq xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 30484 pand xmm4, xmm2 30485 movdqu xmmword ptr [r8 + 8*rsi], xmm3 30486 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm4 30487 movd xmm3, dword ptr [rcx + 2*rsi + 8] # xmm3 = 
mem[0],zero,zero,zero 30488 movd xmm4, dword ptr [rcx + 2*rsi + 12] # xmm4 = mem[0],zero,zero,zero 30489 pcmpeqw xmm3, xmm0 30490 pxor xmm3, xmm1 30491 pmovzxwq xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 30492 pand xmm3, xmm2 30493 pcmpeqw xmm4, xmm0 30494 pxor xmm4, xmm1 30495 pmovzxwq xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 30496 pand xmm4, xmm2 30497 movdqu xmmword ptr [r8 + 8*rsi + 32], xmm3 30498 movdqu xmmword ptr [r8 + 8*rsi + 48], xmm4 30499 add rsi, 8 30500 add rdi, 2 30501 jne .LBB4_691 30502 jmp .LBB4_1011 30503 .LBB4_692: 30504 mov edx, r10d 30505 and edx, -4 30506 lea rsi, [rdx - 4] 30507 mov r9, rsi 30508 shr r9, 2 30509 add r9, 1 30510 test rsi, rsi 30511 je .LBB4_1015 30512 # %bb.693: 30513 mov rdi, r9 30514 and rdi, -2 30515 neg rdi 30516 xor esi, esi 30517 pxor xmm2, xmm2 30518 pcmpeqd xmm3, xmm3 30519 movapd xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 30520 .LBB4_694: # =>This Inner Loop Header: Depth=1 30521 movd xmm5, dword ptr [rcx + 2*rsi] # xmm5 = mem[0],zero,zero,zero 30522 movd xmm6, dword ptr [rcx + 2*rsi + 4] # xmm6 = mem[0],zero,zero,zero 30523 movdqa xmm0, xmm5 30524 pcmpgtw xmm0, xmm2 30525 pmovsxwq xmm0, xmm0 30526 movdqa xmm1, xmm6 30527 pcmpgtw xmm1, xmm2 30528 pmovsxwq xmm1, xmm1 30529 pcmpeqw xmm5, xmm2 30530 pxor xmm5, xmm3 30531 pmovsxwq xmm5, xmm5 30532 pcmpeqw xmm6, xmm2 30533 pxor xmm6, xmm3 30534 pmovsxwq xmm6, xmm6 30535 blendvpd xmm5, xmm4, xmm0 30536 movdqa xmm0, xmm1 30537 blendvpd xmm6, xmm4, xmm0 30538 movupd xmmword ptr [r8 + 8*rsi], xmm5 30539 movupd xmmword ptr [r8 + 8*rsi + 16], xmm6 30540 movd xmm5, dword ptr [rcx + 2*rsi + 8] # xmm5 = mem[0],zero,zero,zero 30541 movd xmm6, dword ptr [rcx + 2*rsi + 12] # xmm6 = mem[0],zero,zero,zero 30542 movdqa xmm0, xmm5 30543 pcmpgtw xmm0, xmm2 30544 pmovsxwq xmm0, xmm0 30545 movdqa xmm1, xmm6 30546 pcmpgtw xmm1, xmm2 30547 pmovsxwq xmm1, xmm1 30548 pcmpeqw xmm5, xmm2 30549 pxor xmm5, xmm3 30550 pmovsxwq xmm5, xmm5 30551 
pcmpeqw xmm6, xmm2 30552 pxor xmm6, xmm3 30553 pmovsxwq xmm6, xmm6 30554 blendvpd xmm5, xmm4, xmm0 30555 movdqa xmm0, xmm1 30556 blendvpd xmm6, xmm4, xmm0 30557 movupd xmmword ptr [r8 + 8*rsi + 32], xmm5 30558 movupd xmmword ptr [r8 + 8*rsi + 48], xmm6 30559 add rsi, 8 30560 add rdi, 2 30561 jne .LBB4_694 30562 jmp .LBB4_1016 30563 .LBB4_700: 30564 mov esi, r10d 30565 and esi, -2 30566 xor eax, eax 30567 xorps xmm0, xmm0 30568 movss xmm1, dword ptr [rip + .LCPI4_9] # xmm1 = mem[0],zero,zero,zero 30569 movabs r9, -9223372036854775808 30570 jmp .LBB4_703 30571 .LBB4_701: # in Loop: Header=BB4_703 Depth=1 30572 movmskps edx, xmm2 30573 and edx, 1 30574 neg edx 30575 or edx, 1 30576 xorps xmm2, xmm2 30577 cvtsi2ss xmm2, edx 30578 movaps xmm3, xmm2 30579 subss xmm3, xmm1 30580 cvttss2si rdi, xmm3 30581 xor rdi, r9 30582 cvttss2si rdx, xmm2 30583 ucomiss xmm2, xmm1 30584 cmovae rdx, rdi 30585 mov qword ptr [r8 + 8*rax + 8], rdx 30586 add rax, 2 30587 cmp rsi, rax 30588 je .LBB4_290 30589 .LBB4_703: # =>This Inner Loop Header: Depth=1 30590 movss xmm2, dword ptr [rcx + 4*rax] # xmm2 = mem[0],zero,zero,zero 30591 ucomiss xmm0, xmm2 30592 jne .LBB4_705 30593 # %bb.704: # in Loop: Header=BB4_703 Depth=1 30594 xor edx, edx 30595 jmp .LBB4_706 30596 .LBB4_705: # in Loop: Header=BB4_703 Depth=1 30597 movmskps edx, xmm2 30598 and edx, 1 30599 neg edx 30600 or edx, 1 30601 xorps xmm2, xmm2 30602 cvtsi2ss xmm2, edx 30603 movaps xmm3, xmm2 30604 subss xmm3, xmm1 30605 cvttss2si rdi, xmm3 30606 xor rdi, r9 30607 cvttss2si rdx, xmm2 30608 ucomiss xmm2, xmm1 30609 cmovae rdx, rdi 30610 .LBB4_706: # in Loop: Header=BB4_703 Depth=1 30611 mov qword ptr [r8 + 8*rax], rdx 30612 movss xmm2, dword ptr [rcx + 4*rax + 4] # xmm2 = mem[0],zero,zero,zero 30613 ucomiss xmm0, xmm2 30614 jne .LBB4_701 30615 # %bb.707: # in Loop: Header=BB4_703 Depth=1 30616 xor edx, edx 30617 mov qword ptr [r8 + 8*rax + 8], rdx 30618 add rax, 2 30619 cmp rsi, rax 30620 jne .LBB4_703 30621 .LBB4_290: 30622 test r10b, 
1 30623 je .LBB4_1655 30624 # %bb.291: 30625 movss xmm0, dword ptr [rcx + 4*rax] # xmm0 = mem[0],zero,zero,zero 30626 xorps xmm1, xmm1 30627 ucomiss xmm1, xmm0 30628 jne .LBB4_993 30629 # %bb.292: 30630 xor ecx, ecx 30631 mov qword ptr [r8 + 8*rax], rcx 30632 jmp .LBB4_1655 30633 .LBB4_713: 30634 mov edx, r10d 30635 and edx, -4 30636 lea rsi, [rdx - 4] 30637 mov r9, rsi 30638 shr r9, 2 30639 add r9, 1 30640 test rsi, rsi 30641 je .LBB4_1021 30642 # %bb.714: 30643 mov rdi, r9 30644 and rdi, -2 30645 neg rdi 30646 xor esi, esi 30647 pxor xmm2, xmm2 30648 pcmpeqd xmm3, xmm3 30649 movapd xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 30650 .LBB4_715: # =>This Inner Loop Header: Depth=1 30651 movq xmm5, qword ptr [rcx + 4*rsi] # xmm5 = mem[0],zero 30652 movq xmm6, qword ptr [rcx + 4*rsi + 8] # xmm6 = mem[0],zero 30653 movdqa xmm0, xmm5 30654 pcmpgtd xmm0, xmm2 30655 pmovsxdq xmm0, xmm0 30656 movdqa xmm1, xmm6 30657 pcmpgtd xmm1, xmm2 30658 pmovsxdq xmm1, xmm1 30659 pcmpeqd xmm5, xmm2 30660 pxor xmm5, xmm3 30661 pmovsxdq xmm5, xmm5 30662 pcmpeqd xmm6, xmm2 30663 pxor xmm6, xmm3 30664 pmovsxdq xmm6, xmm6 30665 blendvpd xmm5, xmm4, xmm0 30666 movdqa xmm0, xmm1 30667 blendvpd xmm6, xmm4, xmm0 30668 movupd xmmword ptr [r8 + 8*rsi], xmm5 30669 movupd xmmword ptr [r8 + 8*rsi + 16], xmm6 30670 movq xmm5, qword ptr [rcx + 4*rsi + 16] # xmm5 = mem[0],zero 30671 movq xmm6, qword ptr [rcx + 4*rsi + 24] # xmm6 = mem[0],zero 30672 movdqa xmm0, xmm5 30673 pcmpgtd xmm0, xmm2 30674 pmovsxdq xmm0, xmm0 30675 movdqa xmm1, xmm6 30676 pcmpgtd xmm1, xmm2 30677 pmovsxdq xmm1, xmm1 30678 pcmpeqd xmm5, xmm2 30679 pxor xmm5, xmm3 30680 pmovsxdq xmm5, xmm5 30681 pcmpeqd xmm6, xmm2 30682 pxor xmm6, xmm3 30683 pmovsxdq xmm6, xmm6 30684 blendvpd xmm5, xmm4, xmm0 30685 movdqa xmm0, xmm1 30686 blendvpd xmm6, xmm4, xmm0 30687 movupd xmmword ptr [r8 + 8*rsi + 32], xmm5 30688 movupd xmmword ptr [r8 + 8*rsi + 48], xmm6 30689 add rsi, 8 30690 add rdi, 2 30691 jne .LBB4_715 30692 jmp .LBB4_1022 30693 
.LBB4_716: 30694 mov edx, eax 30695 and edx, -8 30696 lea rsi, [rdx - 8] 30697 mov r9, rsi 30698 shr r9, 3 30699 add r9, 1 30700 test rsi, rsi 30701 je .LBB4_1137 30702 # %bb.717: 30703 mov rdi, r9 30704 and rdi, -2 30705 neg rdi 30706 xor esi, esi 30707 pxor xmm0, xmm0 30708 pcmpeqd xmm1, xmm1 30709 movdqa xmm2, xmmword ptr [rip + .LCPI4_11] # xmm2 = <1,1,1,1,u,u,u,u> 30710 .LBB4_718: # =>This Inner Loop Header: Depth=1 30711 movdqu xmm3, xmmword ptr [rcx + 4*rsi] 30712 movdqu xmm4, xmmword ptr [rcx + 4*rsi + 16] 30713 pcmpeqd xmm3, xmm0 30714 pxor xmm3, xmm1 30715 packssdw xmm3, xmm3 30716 pand xmm3, xmm2 30717 pcmpeqd xmm4, xmm0 30718 pxor xmm4, xmm1 30719 packssdw xmm4, xmm4 30720 pand xmm4, xmm2 30721 punpcklqdq xmm3, xmm4 # xmm3 = xmm3[0],xmm4[0] 30722 movdqu xmmword ptr [r8 + 2*rsi], xmm3 30723 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 32] 30724 movdqu xmm4, xmmword ptr [rcx + 4*rsi + 48] 30725 pcmpeqd xmm3, xmm0 30726 pxor xmm3, xmm1 30727 packssdw xmm3, xmm3 30728 pand xmm3, xmm2 30729 pcmpeqd xmm4, xmm0 30730 pxor xmm4, xmm1 30731 packssdw xmm4, xmm4 30732 pand xmm4, xmm2 30733 punpcklqdq xmm3, xmm4 # xmm3 = xmm3[0],xmm4[0] 30734 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm3 30735 add rsi, 16 30736 add rdi, 2 30737 jne .LBB4_718 30738 jmp .LBB4_1138 30739 .LBB4_719: 30740 mov edx, eax 30741 and edx, -8 30742 lea rsi, [rdx - 8] 30743 mov r9, rsi 30744 shr r9, 3 30745 add r9, 1 30746 test rsi, rsi 30747 je .LBB4_1142 30748 # %bb.720: 30749 mov rdi, r9 30750 and rdi, -2 30751 neg rdi 30752 xor esi, esi 30753 pxor xmm0, xmm0 30754 pcmpeqd xmm1, xmm1 30755 movdqa xmm2, xmmword ptr [rip + .LCPI4_11] # xmm2 = <1,1,1,1,u,u,u,u> 30756 .LBB4_721: # =>This Inner Loop Header: Depth=1 30757 movdqu xmm3, xmmword ptr [rcx + 4*rsi] 30758 movdqu xmm4, xmmword ptr [rcx + 4*rsi + 16] 30759 pcmpeqd xmm3, xmm0 30760 pxor xmm3, xmm1 30761 packssdw xmm3, xmm3 30762 pand xmm3, xmm2 30763 pcmpeqd xmm4, xmm0 30764 pxor xmm4, xmm1 30765 packssdw xmm4, xmm4 30766 pand xmm4, xmm2 30767 
punpcklqdq xmm3, xmm4 # xmm3 = xmm3[0],xmm4[0] 30768 movdqu xmmword ptr [r8 + 2*rsi], xmm3 30769 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 32] 30770 movdqu xmm4, xmmword ptr [rcx + 4*rsi + 48] 30771 pcmpeqd xmm3, xmm0 30772 pxor xmm3, xmm1 30773 packssdw xmm3, xmm3 30774 pand xmm3, xmm2 30775 pcmpeqd xmm4, xmm0 30776 pxor xmm4, xmm1 30777 packssdw xmm4, xmm4 30778 pand xmm4, xmm2 30779 punpcklqdq xmm3, xmm4 # xmm3 = xmm3[0],xmm4[0] 30780 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm3 30781 add rsi, 16 30782 add rdi, 2 30783 jne .LBB4_721 30784 jmp .LBB4_1143 30785 .LBB4_722: 30786 mov esi, eax 30787 and esi, -4 30788 lea rdx, [rsi - 4] 30789 mov r9, rdx 30790 shr r9, 2 30791 add r9, 1 30792 test rdx, rdx 30793 je .LBB4_1147 30794 # %bb.723: 30795 mov rdx, r9 30796 and rdx, -2 30797 neg rdx 30798 xor edi, edi 30799 xorpd xmm2, xmm2 30800 movapd xmm3, xmmword ptr [rip + .LCPI4_0] # xmm3 = [-0.0E+0,-0.0E+0] 30801 movapd xmm4, xmmword ptr [rip + .LCPI4_1] # xmm4 = [1.0E+0,1.0E+0] 30802 .LBB4_724: # =>This Inner Loop Header: Depth=1 30803 movupd xmm5, xmmword ptr [rcx + 8*rdi] 30804 movupd xmm6, xmmword ptr [rcx + 8*rdi + 16] 30805 movapd xmm0, xmm5 30806 cmpeqpd xmm0, xmm2 30807 packssdw xmm0, xmm0 30808 packssdw xmm0, xmm0 30809 movapd xmm1, xmm6 30810 cmpeqpd xmm1, xmm2 30811 packssdw xmm1, xmm1 30812 packssdw xmm1, xmm1 30813 andpd xmm5, xmm3 30814 orpd xmm5, xmm4 30815 andpd xmm6, xmm3 30816 orpd xmm6, xmm4 30817 cvttpd2dq xmm5, xmm5 30818 pshuflw xmm5, xmm5, 232 # xmm5 = xmm5[0,2,2,3,4,5,6,7] 30819 cvttpd2dq xmm6, xmm6 30820 pshuflw xmm6, xmm6, 232 # xmm6 = xmm6[0,2,2,3,4,5,6,7] 30821 pblendvb xmm5, xmm2, xmm0 30822 movdqa xmm0, xmm1 30823 pblendvb xmm6, xmm2, xmm0 30824 movd dword ptr [r8 + 2*rdi], xmm5 30825 movd dword ptr [r8 + 2*rdi + 4], xmm6 30826 movupd xmm5, xmmword ptr [rcx + 8*rdi + 32] 30827 movupd xmm6, xmmword ptr [rcx + 8*rdi + 48] 30828 movapd xmm0, xmm5 30829 cmpeqpd xmm0, xmm2 30830 packssdw xmm0, xmm0 30831 packssdw xmm0, xmm0 30832 movapd xmm1, xmm6 
30833 cmpeqpd xmm1, xmm2 30834 packssdw xmm1, xmm1 30835 packssdw xmm1, xmm1 30836 andpd xmm5, xmm3 30837 orpd xmm5, xmm4 30838 andpd xmm6, xmm3 30839 orpd xmm6, xmm4 30840 cvttpd2dq xmm5, xmm5 30841 pshuflw xmm5, xmm5, 232 # xmm5 = xmm5[0,2,2,3,4,5,6,7] 30842 cvttpd2dq xmm6, xmm6 30843 pshuflw xmm6, xmm6, 232 # xmm6 = xmm6[0,2,2,3,4,5,6,7] 30844 pblendvb xmm5, xmm2, xmm0 30845 movdqa xmm0, xmm1 30846 pblendvb xmm6, xmm2, xmm0 30847 movd dword ptr [r8 + 2*rdi + 8], xmm5 30848 movd dword ptr [r8 + 2*rdi + 12], xmm6 30849 add rdi, 8 30850 add rdx, 2 30851 jne .LBB4_724 30852 jmp .LBB4_1148 30853 .LBB4_725: 30854 mov esi, eax 30855 and esi, -4 30856 lea rdx, [rsi - 4] 30857 mov r9, rdx 30858 shr r9, 2 30859 add r9, 1 30860 test rdx, rdx 30861 je .LBB4_1153 30862 # %bb.726: 30863 mov rdx, r9 30864 and rdx, -2 30865 neg rdx 30866 xor edi, edi 30867 xorpd xmm2, xmm2 30868 movapd xmm3, xmmword ptr [rip + .LCPI4_0] # xmm3 = [-0.0E+0,-0.0E+0] 30869 movapd xmm4, xmmword ptr [rip + .LCPI4_1] # xmm4 = [1.0E+0,1.0E+0] 30870 .LBB4_727: # =>This Inner Loop Header: Depth=1 30871 movupd xmm5, xmmword ptr [rcx + 8*rdi] 30872 movupd xmm6, xmmword ptr [rcx + 8*rdi + 16] 30873 movapd xmm0, xmm5 30874 cmpeqpd xmm0, xmm2 30875 packssdw xmm0, xmm0 30876 packssdw xmm0, xmm0 30877 movapd xmm1, xmm6 30878 cmpeqpd xmm1, xmm2 30879 packssdw xmm1, xmm1 30880 packssdw xmm1, xmm1 30881 andpd xmm5, xmm3 30882 orpd xmm5, xmm4 30883 andpd xmm6, xmm3 30884 orpd xmm6, xmm4 30885 cvttpd2dq xmm5, xmm5 30886 pshuflw xmm5, xmm5, 232 # xmm5 = xmm5[0,2,2,3,4,5,6,7] 30887 cvttpd2dq xmm6, xmm6 30888 pshuflw xmm6, xmm6, 232 # xmm6 = xmm6[0,2,2,3,4,5,6,7] 30889 pblendvb xmm5, xmm2, xmm0 30890 movdqa xmm0, xmm1 30891 pblendvb xmm6, xmm2, xmm0 30892 movd dword ptr [r8 + 2*rdi], xmm5 30893 movd dword ptr [r8 + 2*rdi + 4], xmm6 30894 movupd xmm5, xmmword ptr [rcx + 8*rdi + 32] 30895 movupd xmm6, xmmword ptr [rcx + 8*rdi + 48] 30896 movapd xmm0, xmm5 30897 cmpeqpd xmm0, xmm2 30898 packssdw xmm0, xmm0 30899 packssdw 
xmm0, xmm0 30900 movapd xmm1, xmm6 30901 cmpeqpd xmm1, xmm2 30902 packssdw xmm1, xmm1 30903 packssdw xmm1, xmm1 30904 andpd xmm5, xmm3 30905 orpd xmm5, xmm4 30906 andpd xmm6, xmm3 30907 orpd xmm6, xmm4 30908 cvttpd2dq xmm5, xmm5 30909 pshuflw xmm5, xmm5, 232 # xmm5 = xmm5[0,2,2,3,4,5,6,7] 30910 cvttpd2dq xmm6, xmm6 30911 pshuflw xmm6, xmm6, 232 # xmm6 = xmm6[0,2,2,3,4,5,6,7] 30912 pblendvb xmm5, xmm2, xmm0 30913 movdqa xmm0, xmm1 30914 pblendvb xmm6, xmm2, xmm0 30915 movd dword ptr [r8 + 2*rdi + 8], xmm5 30916 movd dword ptr [r8 + 2*rdi + 12], xmm6 30917 add rdi, 8 30918 add rdx, 2 30919 jne .LBB4_727 30920 jmp .LBB4_1154 30921 .LBB4_738: 30922 mov edx, eax 30923 and edx, -4 30924 lea rsi, [rdx - 4] 30925 mov r9, rsi 30926 shr r9, 2 30927 add r9, 1 30928 test rsi, rsi 30929 je .LBB4_1027 30930 # %bb.739: 30931 mov rdi, r9 30932 and rdi, -2 30933 neg rdi 30934 xor esi, esi 30935 pxor xmm0, xmm0 30936 pcmpeqd xmm1, xmm1 30937 movdqa xmm2, xmmword ptr [rip + .LCPI4_17] # xmm2 = <1,1,u,u,u,u,u,u> 30938 .LBB4_740: # =>This Inner Loop Header: Depth=1 30939 movdqu xmm3, xmmword ptr [rcx + 8*rsi] 30940 movdqu xmm4, xmmword ptr [rcx + 8*rsi + 16] 30941 pcmpeqq xmm3, xmm0 30942 pxor xmm3, xmm1 30943 packssdw xmm3, xmm3 30944 packssdw xmm3, xmm3 30945 pand xmm3, xmm2 30946 pcmpeqq xmm4, xmm0 30947 pxor xmm4, xmm1 30948 packssdw xmm4, xmm4 30949 packssdw xmm4, xmm4 30950 pand xmm4, xmm2 30951 movd dword ptr [r8 + 2*rsi], xmm3 30952 movd dword ptr [r8 + 2*rsi + 4], xmm4 30953 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 32] 30954 movdqu xmm4, xmmword ptr [rcx + 8*rsi + 48] 30955 pcmpeqq xmm3, xmm0 30956 pxor xmm3, xmm1 30957 packssdw xmm3, xmm3 30958 packssdw xmm3, xmm3 30959 pand xmm3, xmm2 30960 pcmpeqq xmm4, xmm0 30961 pxor xmm4, xmm1 30962 packssdw xmm4, xmm4 30963 packssdw xmm4, xmm4 30964 pand xmm4, xmm2 30965 movd dword ptr [r8 + 2*rsi + 8], xmm3 30966 movd dword ptr [r8 + 2*rsi + 12], xmm4 30967 add rsi, 8 30968 add rdi, 2 30969 jne .LBB4_740 30970 jmp .LBB4_1028 30971 
.LBB4_741: 30972 mov edx, eax 30973 and edx, -4 30974 lea rsi, [rdx - 4] 30975 mov r9, rsi 30976 shr r9, 2 30977 add r9, 1 30978 test rsi, rsi 30979 je .LBB4_1032 30980 # %bb.742: 30981 mov rdi, r9 30982 and rdi, -2 30983 neg rdi 30984 xor esi, esi 30985 pxor xmm0, xmm0 30986 pcmpeqd xmm1, xmm1 30987 movdqa xmm2, xmmword ptr [rip + .LCPI4_17] # xmm2 = <1,1,u,u,u,u,u,u> 30988 .LBB4_743: # =>This Inner Loop Header: Depth=1 30989 movdqu xmm3, xmmword ptr [rcx + 8*rsi] 30990 movdqu xmm4, xmmword ptr [rcx + 8*rsi + 16] 30991 pcmpeqq xmm3, xmm0 30992 pxor xmm3, xmm1 30993 packssdw xmm3, xmm3 30994 packssdw xmm3, xmm3 30995 pand xmm3, xmm2 30996 pcmpeqq xmm4, xmm0 30997 pxor xmm4, xmm1 30998 packssdw xmm4, xmm4 30999 packssdw xmm4, xmm4 31000 pand xmm4, xmm2 31001 movd dword ptr [r8 + 2*rsi], xmm3 31002 movd dword ptr [r8 + 2*rsi + 4], xmm4 31003 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 32] 31004 movdqu xmm4, xmmword ptr [rcx + 8*rsi + 48] 31005 pcmpeqq xmm3, xmm0 31006 pxor xmm3, xmm1 31007 packssdw xmm3, xmm3 31008 packssdw xmm3, xmm3 31009 pand xmm3, xmm2 31010 pcmpeqq xmm4, xmm0 31011 pxor xmm4, xmm1 31012 packssdw xmm4, xmm4 31013 packssdw xmm4, xmm4 31014 pand xmm4, xmm2 31015 movd dword ptr [r8 + 2*rsi + 8], xmm3 31016 movd dword ptr [r8 + 2*rsi + 12], xmm4 31017 add rsi, 8 31018 add rdi, 2 31019 jne .LBB4_743 31020 jmp .LBB4_1033 31021 .LBB4_764: 31022 mov edx, r10d 31023 and edx, -4 31024 lea rsi, [rdx - 4] 31025 mov r9, rsi 31026 shr r9, 2 31027 add r9, 1 31028 test rsi, rsi 31029 je .LBB4_1037 31030 # %bb.765: 31031 mov rdi, r9 31032 and rdi, -2 31033 neg rdi 31034 xor esi, esi 31035 pxor xmm2, xmm2 31036 pcmpeqd xmm3, xmm3 31037 movdqa xmm4, xmmword ptr [rip + .LCPI4_17] # xmm4 = <1,1,u,u,u,u,u,u> 31038 .LBB4_766: # =>This Inner Loop Header: Depth=1 31039 movdqu xmm5, xmmword ptr [rcx + 8*rsi] 31040 movdqu xmm6, xmmword ptr [rcx + 8*rsi + 16] 31041 movdqa xmm0, xmm5 31042 pcmpgtq xmm0, xmm2 31043 packssdw xmm0, xmm0 31044 packssdw xmm0, xmm0 31045 movdqa xmm1, 
xmm6 31046 pcmpgtq xmm1, xmm2 31047 packssdw xmm1, xmm1 31048 packssdw xmm1, xmm1 31049 pcmpeqq xmm5, xmm2 31050 pxor xmm5, xmm3 31051 packssdw xmm5, xmm5 31052 packssdw xmm5, xmm5 31053 pcmpeqq xmm6, xmm2 31054 pxor xmm6, xmm3 31055 packssdw xmm6, xmm6 31056 packssdw xmm6, xmm6 31057 pblendvb xmm5, xmm4, xmm0 31058 movdqa xmm0, xmm1 31059 pblendvb xmm6, xmm4, xmm0 31060 movd dword ptr [r8 + 2*rsi], xmm5 31061 movd dword ptr [r8 + 2*rsi + 4], xmm6 31062 movdqu xmm5, xmmword ptr [rcx + 8*rsi + 32] 31063 movdqu xmm6, xmmword ptr [rcx + 8*rsi + 48] 31064 movdqa xmm0, xmm5 31065 pcmpgtq xmm0, xmm2 31066 packssdw xmm0, xmm0 31067 packssdw xmm0, xmm0 31068 movdqa xmm1, xmm6 31069 pcmpgtq xmm1, xmm2 31070 packssdw xmm1, xmm1 31071 packssdw xmm1, xmm1 31072 pcmpeqq xmm5, xmm2 31073 pxor xmm5, xmm3 31074 packssdw xmm5, xmm5 31075 packssdw xmm5, xmm5 31076 pcmpeqq xmm6, xmm2 31077 pxor xmm6, xmm3 31078 packssdw xmm6, xmm6 31079 packssdw xmm6, xmm6 31080 pblendvb xmm5, xmm4, xmm0 31081 movdqa xmm0, xmm1 31082 pblendvb xmm6, xmm4, xmm0 31083 movd dword ptr [r8 + 2*rsi + 8], xmm5 31084 movd dword ptr [r8 + 2*rsi + 12], xmm6 31085 add rsi, 8 31086 add rdi, 2 31087 jne .LBB4_766 31088 jmp .LBB4_1038 31089 .LBB4_767: 31090 mov edx, r10d 31091 and edx, -4 31092 lea rsi, [rdx - 4] 31093 mov r9, rsi 31094 shr r9, 2 31095 add r9, 1 31096 test rsi, rsi 31097 je .LBB4_1159 31098 # %bb.768: 31099 mov rdi, r9 31100 and rdi, -2 31101 neg rdi 31102 xor esi, esi 31103 pxor xmm2, xmm2 31104 pcmpeqd xmm3, xmm3 31105 movdqa xmm4, xmmword ptr [rip + .LCPI4_17] # xmm4 = <1,1,u,u,u,u,u,u> 31106 .LBB4_769: # =>This Inner Loop Header: Depth=1 31107 movdqu xmm5, xmmword ptr [rcx + 8*rsi] 31108 movdqu xmm6, xmmword ptr [rcx + 8*rsi + 16] 31109 movdqa xmm0, xmm5 31110 pcmpgtq xmm0, xmm2 31111 packssdw xmm0, xmm0 31112 packssdw xmm0, xmm0 31113 movdqa xmm1, xmm6 31114 pcmpgtq xmm1, xmm2 31115 packssdw xmm1, xmm1 31116 packssdw xmm1, xmm1 31117 pcmpeqq xmm5, xmm2 31118 pxor xmm5, xmm3 31119 packssdw 
xmm5, xmm5 31120 packssdw xmm5, xmm5 31121 pcmpeqq xmm6, xmm2 31122 pxor xmm6, xmm3 31123 packssdw xmm6, xmm6 31124 packssdw xmm6, xmm6 31125 pblendvb xmm5, xmm4, xmm0 31126 movdqa xmm0, xmm1 31127 pblendvb xmm6, xmm4, xmm0 31128 movd dword ptr [r8 + 2*rsi], xmm5 31129 movd dword ptr [r8 + 2*rsi + 4], xmm6 31130 movdqu xmm5, xmmword ptr [rcx + 8*rsi + 32] 31131 movdqu xmm6, xmmword ptr [rcx + 8*rsi + 48] 31132 movdqa xmm0, xmm5 31133 pcmpgtq xmm0, xmm2 31134 packssdw xmm0, xmm0 31135 packssdw xmm0, xmm0 31136 movdqa xmm1, xmm6 31137 pcmpgtq xmm1, xmm2 31138 packssdw xmm1, xmm1 31139 packssdw xmm1, xmm1 31140 pcmpeqq xmm5, xmm2 31141 pxor xmm5, xmm3 31142 packssdw xmm5, xmm5 31143 packssdw xmm5, xmm5 31144 pcmpeqq xmm6, xmm2 31145 pxor xmm6, xmm3 31146 packssdw xmm6, xmm6 31147 packssdw xmm6, xmm6 31148 pblendvb xmm5, xmm4, xmm0 31149 movdqa xmm0, xmm1 31150 pblendvb xmm6, xmm4, xmm0 31151 movd dword ptr [r8 + 2*rsi + 8], xmm5 31152 movd dword ptr [r8 + 2*rsi + 12], xmm6 31153 add rsi, 8 31154 add rdi, 2 31155 jne .LBB4_769 31156 jmp .LBB4_1160 31157 .LBB4_770: 31158 mov esi, eax 31159 and esi, -8 31160 lea rdx, [rsi - 8] 31161 mov r9, rdx 31162 shr r9, 3 31163 add r9, 1 31164 test rdx, rdx 31165 je .LBB4_1165 31166 # %bb.771: 31167 mov rdx, r9 31168 and rdx, -2 31169 neg rdx 31170 xor edi, edi 31171 xorps xmm4, xmm4 31172 pcmpeqd xmm8, xmm8 31173 movdqa xmm6, xmmword ptr [rip + .LCPI4_11] # xmm6 = <1,1,1,1,u,u,u,u> 31174 .LBB4_772: # =>This Inner Loop Header: Depth=1 31175 movups xmm0, xmmword ptr [rcx + 4*rdi] 31176 movups xmm1, xmmword ptr [rcx + 4*rdi + 16] 31177 movaps xmm2, xmm0 31178 cmpeqps xmm2, xmm4 31179 packssdw xmm2, xmm2 31180 movaps xmm3, xmm1 31181 cmpeqps xmm3, xmm4 31182 packssdw xmm3, xmm3 31183 pcmpgtd xmm0, xmm8 31184 packssdw xmm0, xmm0 31185 pcmpgtd xmm1, xmm8 31186 packssdw xmm1, xmm1 31187 pcmpeqd xmm7, xmm7 31188 pblendvb xmm7, xmm6, xmm0 31189 pcmpeqd xmm5, xmm5 31190 movdqa xmm0, xmm1 31191 pblendvb xmm5, xmm6, xmm0 31192 movdqa xmm0, 
xmm2 31193 pblendvb xmm7, xmm4, xmm0 31194 movdqa xmm0, xmm3 31195 pblendvb xmm5, xmm4, xmm0 31196 punpcklqdq xmm7, xmm5 # xmm7 = xmm7[0],xmm5[0] 31197 movdqu xmmword ptr [r8 + 2*rdi], xmm7 31198 movups xmm0, xmmword ptr [rcx + 4*rdi + 32] 31199 movups xmm1, xmmword ptr [rcx + 4*rdi + 48] 31200 movaps xmm2, xmm0 31201 cmpeqps xmm2, xmm4 31202 packssdw xmm2, xmm2 31203 movaps xmm3, xmm1 31204 cmpeqps xmm3, xmm4 31205 packssdw xmm3, xmm3 31206 pcmpgtd xmm0, xmm8 31207 packssdw xmm0, xmm0 31208 pcmpgtd xmm1, xmm8 31209 pcmpeqd xmm5, xmm5 31210 pblendvb xmm5, xmm6, xmm0 31211 packssdw xmm1, xmm1 31212 pcmpeqd xmm7, xmm7 31213 movdqa xmm0, xmm1 31214 pblendvb xmm7, xmm6, xmm0 31215 movdqa xmm0, xmm2 31216 pblendvb xmm5, xmm4, xmm0 31217 movdqa xmm0, xmm3 31218 pblendvb xmm7, xmm4, xmm0 31219 punpcklqdq xmm5, xmm7 # xmm5 = xmm5[0],xmm7[0] 31220 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm5 31221 add rdi, 16 31222 add rdx, 2 31223 jne .LBB4_772 31224 jmp .LBB4_1166 31225 .LBB4_773: 31226 mov esi, eax 31227 and esi, -8 31228 lea rdx, [rsi - 8] 31229 mov r9, rdx 31230 shr r9, 3 31231 add r9, 1 31232 test rdx, rdx 31233 je .LBB4_1171 31234 # %bb.774: 31235 mov rdx, r9 31236 and rdx, -2 31237 neg rdx 31238 xor edi, edi 31239 xorps xmm4, xmm4 31240 pcmpeqd xmm8, xmm8 31241 movdqa xmm6, xmmword ptr [rip + .LCPI4_11] # xmm6 = <1,1,1,1,u,u,u,u> 31242 .LBB4_775: # =>This Inner Loop Header: Depth=1 31243 movups xmm0, xmmword ptr [rcx + 4*rdi] 31244 movups xmm1, xmmword ptr [rcx + 4*rdi + 16] 31245 movaps xmm2, xmm0 31246 cmpeqps xmm2, xmm4 31247 packssdw xmm2, xmm2 31248 movaps xmm3, xmm1 31249 cmpeqps xmm3, xmm4 31250 packssdw xmm3, xmm3 31251 pcmpgtd xmm0, xmm8 31252 packssdw xmm0, xmm0 31253 pcmpgtd xmm1, xmm8 31254 packssdw xmm1, xmm1 31255 pcmpeqd xmm7, xmm7 31256 pblendvb xmm7, xmm6, xmm0 31257 pcmpeqd xmm5, xmm5 31258 movdqa xmm0, xmm1 31259 pblendvb xmm5, xmm6, xmm0 31260 movdqa xmm0, xmm2 31261 pblendvb xmm7, xmm4, xmm0 31262 movdqa xmm0, xmm3 31263 pblendvb xmm5, xmm4, xmm0 
31264 punpcklqdq xmm7, xmm5 # xmm7 = xmm7[0],xmm5[0] 31265 movdqu xmmword ptr [r8 + 2*rdi], xmm7 31266 movups xmm0, xmmword ptr [rcx + 4*rdi + 32] 31267 movups xmm1, xmmword ptr [rcx + 4*rdi + 48] 31268 movaps xmm2, xmm0 31269 cmpeqps xmm2, xmm4 31270 packssdw xmm2, xmm2 31271 movaps xmm3, xmm1 31272 cmpeqps xmm3, xmm4 31273 packssdw xmm3, xmm3 31274 pcmpgtd xmm0, xmm8 31275 packssdw xmm0, xmm0 31276 pcmpgtd xmm1, xmm8 31277 pcmpeqd xmm5, xmm5 31278 pblendvb xmm5, xmm6, xmm0 31279 packssdw xmm1, xmm1 31280 pcmpeqd xmm7, xmm7 31281 movdqa xmm0, xmm1 31282 pblendvb xmm7, xmm6, xmm0 31283 movdqa xmm0, xmm2 31284 pblendvb xmm5, xmm4, xmm0 31285 movdqa xmm0, xmm3 31286 pblendvb xmm7, xmm4, xmm0 31287 punpcklqdq xmm5, xmm7 # xmm5 = xmm5[0],xmm7[0] 31288 movdqu xmmword ptr [r8 + 2*rdi + 16], xmm5 31289 add rdi, 16 31290 add rdx, 2 31291 jne .LBB4_775 31292 jmp .LBB4_1172 31293 .LBB4_786: 31294 mov edx, r10d 31295 and edx, -8 31296 lea rsi, [rdx - 8] 31297 mov r9, rsi 31298 shr r9, 3 31299 add r9, 1 31300 test rsi, rsi 31301 je .LBB4_1043 31302 # %bb.787: 31303 mov rdi, r9 31304 and rdi, -2 31305 neg rdi 31306 xor esi, esi 31307 pxor xmm2, xmm2 31308 pcmpeqd xmm3, xmm3 31309 movdqa xmm4, xmmword ptr [rip + .LCPI4_11] # xmm4 = <1,1,1,1,u,u,u,u> 31310 .LBB4_788: # =>This Inner Loop Header: Depth=1 31311 movdqu xmm5, xmmword ptr [rcx + 4*rsi] 31312 movdqu xmm6, xmmword ptr [rcx + 4*rsi + 16] 31313 movdqa xmm0, xmm5 31314 pcmpgtd xmm0, xmm2 31315 packssdw xmm0, xmm0 31316 movdqa xmm1, xmm6 31317 pcmpgtd xmm1, xmm2 31318 packssdw xmm1, xmm1 31319 pcmpeqd xmm5, xmm2 31320 pxor xmm5, xmm3 31321 packssdw xmm5, xmm5 31322 pcmpeqd xmm6, xmm2 31323 pxor xmm6, xmm3 31324 packssdw xmm6, xmm6 31325 pblendvb xmm5, xmm4, xmm0 31326 movdqa xmm0, xmm1 31327 pblendvb xmm6, xmm4, xmm0 31328 punpcklqdq xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 31329 movdqu xmmword ptr [r8 + 2*rsi], xmm5 31330 movdqu xmm5, xmmword ptr [rcx + 4*rsi + 32] 31331 movdqu xmm6, xmmword ptr [rcx + 4*rsi + 48] 31332 movdqa 
xmm0, xmm5 31333 pcmpgtd xmm0, xmm2 31334 packssdw xmm0, xmm0 31335 movdqa xmm1, xmm6 31336 pcmpgtd xmm1, xmm2 31337 packssdw xmm1, xmm1 31338 pcmpeqd xmm5, xmm2 31339 pxor xmm5, xmm3 31340 packssdw xmm5, xmm5 31341 pcmpeqd xmm6, xmm2 31342 pxor xmm6, xmm3 31343 packssdw xmm6, xmm6 31344 pblendvb xmm5, xmm4, xmm0 31345 movdqa xmm0, xmm1 31346 pblendvb xmm6, xmm4, xmm0 31347 punpcklqdq xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 31348 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm5 31349 add rsi, 16 31350 add rdi, 2 31351 jne .LBB4_788 31352 jmp .LBB4_1044 31353 .LBB4_789: 31354 mov edx, r10d 31355 and edx, -8 31356 lea rsi, [rdx - 8] 31357 mov r9, rsi 31358 shr r9, 3 31359 add r9, 1 31360 test rsi, rsi 31361 je .LBB4_1049 31362 # %bb.790: 31363 mov rdi, r9 31364 and rdi, -2 31365 neg rdi 31366 xor esi, esi 31367 pxor xmm2, xmm2 31368 pcmpeqd xmm3, xmm3 31369 movdqa xmm4, xmmword ptr [rip + .LCPI4_11] # xmm4 = <1,1,1,1,u,u,u,u> 31370 .LBB4_791: # =>This Inner Loop Header: Depth=1 31371 movdqu xmm5, xmmword ptr [rcx + 4*rsi] 31372 movdqu xmm6, xmmword ptr [rcx + 4*rsi + 16] 31373 movdqa xmm0, xmm5 31374 pcmpgtd xmm0, xmm2 31375 packssdw xmm0, xmm0 31376 movdqa xmm1, xmm6 31377 pcmpgtd xmm1, xmm2 31378 packssdw xmm1, xmm1 31379 pcmpeqd xmm5, xmm2 31380 pxor xmm5, xmm3 31381 packssdw xmm5, xmm5 31382 pcmpeqd xmm6, xmm2 31383 pxor xmm6, xmm3 31384 packssdw xmm6, xmm6 31385 pblendvb xmm5, xmm4, xmm0 31386 movdqa xmm0, xmm1 31387 pblendvb xmm6, xmm4, xmm0 31388 punpcklqdq xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 31389 movdqu xmmword ptr [r8 + 2*rsi], xmm5 31390 movdqu xmm5, xmmword ptr [rcx + 4*rsi + 32] 31391 movdqu xmm6, xmmword ptr [rcx + 4*rsi + 48] 31392 movdqa xmm0, xmm5 31393 pcmpgtd xmm0, xmm2 31394 packssdw xmm0, xmm0 31395 movdqa xmm1, xmm6 31396 pcmpgtd xmm1, xmm2 31397 packssdw xmm1, xmm1 31398 pcmpeqd xmm5, xmm2 31399 pxor xmm5, xmm3 31400 packssdw xmm5, xmm5 31401 pcmpeqd xmm6, xmm2 31402 pxor xmm6, xmm3 31403 packssdw xmm6, xmm6 31404 pblendvb xmm5, xmm4, xmm0 31405 movdqa 
xmm0, xmm1 31406 pblendvb xmm6, xmm4, xmm0 31407 punpcklqdq xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 31408 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm5 31409 add rsi, 16 31410 add rdi, 2 31411 jne .LBB4_791 31412 jmp .LBB4_1050 31413 .LBB4_792: 31414 mov edx, eax 31415 and edx, -4 31416 lea rsi, [rdx - 4] 31417 mov r9, rsi 31418 shr r9, 2 31419 add r9, 1 31420 test rsi, rsi 31421 je .LBB4_1177 31422 # %bb.793: 31423 mov rdi, r9 31424 and rdi, -2 31425 neg rdi 31426 xor esi, esi 31427 pxor xmm0, xmm0 31428 pcmpeqd xmm1, xmm1 31429 movdqa xmm2, xmmword ptr [rip + .LCPI4_15] # xmm2 = [1,1] 31430 .LBB4_794: # =>This Inner Loop Header: Depth=1 31431 movq xmm3, qword ptr [rcx + 4*rsi] # xmm3 = mem[0],zero 31432 movq xmm4, qword ptr [rcx + 4*rsi + 8] # xmm4 = mem[0],zero 31433 pcmpeqd xmm3, xmm0 31434 pxor xmm3, xmm1 31435 pmovzxdq xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero 31436 pand xmm3, xmm2 31437 pcmpeqd xmm4, xmm0 31438 pxor xmm4, xmm1 31439 pmovzxdq xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero 31440 pand xmm4, xmm2 31441 movdqu xmmword ptr [r8 + 8*rsi], xmm3 31442 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm4 31443 movq xmm3, qword ptr [rcx + 4*rsi + 16] # xmm3 = mem[0],zero 31444 movq xmm4, qword ptr [rcx + 4*rsi + 24] # xmm4 = mem[0],zero 31445 pcmpeqd xmm3, xmm0 31446 pxor xmm3, xmm1 31447 pmovzxdq xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero 31448 pand xmm3, xmm2 31449 pcmpeqd xmm4, xmm0 31450 pxor xmm4, xmm1 31451 pmovzxdq xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero 31452 pand xmm4, xmm2 31453 movdqu xmmword ptr [r8 + 8*rsi + 32], xmm3 31454 movdqu xmmword ptr [r8 + 8*rsi + 48], xmm4 31455 add rsi, 8 31456 add rdi, 2 31457 jne .LBB4_794 31458 jmp .LBB4_1178 31459 .LBB4_795: 31460 mov edx, eax 31461 and edx, -8 31462 lea rsi, [rdx - 8] 31463 mov r9, rsi 31464 shr r9, 3 31465 add r9, 1 31466 test rsi, rsi 31467 je .LBB4_1182 31468 # %bb.796: 31469 mov rdi, r9 31470 and rdi, -2 31471 neg rdi 31472 xor esi, esi 31473 pxor xmm0, xmm0 31474 movdqa xmm1, xmmword ptr 
[rip + .LCPI4_19] # xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 31475 .LBB4_797: # =>This Inner Loop Header: Depth=1 31476 movdqu xmm2, xmmword ptr [rcx + 4*rsi] 31477 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 16] 31478 pcmpeqd xmm2, xmm0 31479 pandn xmm2, xmm1 31480 pcmpeqd xmm3, xmm0 31481 pandn xmm3, xmm1 31482 movdqu xmmword ptr [r8 + 4*rsi], xmm2 31483 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm3 31484 movdqu xmm2, xmmword ptr [rcx + 4*rsi + 32] 31485 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 48] 31486 pcmpeqd xmm2, xmm0 31487 pandn xmm2, xmm1 31488 pcmpeqd xmm3, xmm0 31489 pandn xmm3, xmm1 31490 movdqu xmmword ptr [r8 + 4*rsi + 32], xmm2 31491 movdqu xmmword ptr [r8 + 4*rsi + 48], xmm3 31492 add rsi, 16 31493 add rdi, 2 31494 jne .LBB4_797 31495 jmp .LBB4_1183 31496 .LBB4_798: 31497 mov edx, eax 31498 and edx, -4 31499 lea rsi, [rdx - 4] 31500 mov r9, rsi 31501 shr r9, 2 31502 add r9, 1 31503 test rsi, rsi 31504 je .LBB4_1190 31505 # %bb.799: 31506 mov rdi, r9 31507 and rdi, -2 31508 neg rdi 31509 xor esi, esi 31510 xorpd xmm0, xmm0 31511 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 31512 movapd xmm2, xmmword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0] 31513 .LBB4_800: # =>This Inner Loop Header: Depth=1 31514 movupd xmm3, xmmword ptr [rcx + 8*rsi] 31515 movupd xmm4, xmmword ptr [rcx + 8*rsi + 16] 31516 movapd xmm5, xmm3 31517 andpd xmm5, xmm1 31518 orpd xmm5, xmm2 31519 movapd xmm6, xmm4 31520 andpd xmm6, xmm1 31521 orpd xmm6, xmm2 31522 cvttsd2si rbx, xmm5 31523 movq xmm7, rbx 31524 pshufd xmm5, xmm5, 238 # xmm5 = xmm5[2,3,2,3] 31525 cvttsd2si rbx, xmm5 31526 movq xmm5, rbx 31527 punpcklqdq xmm7, xmm5 # xmm7 = xmm7[0],xmm5[0] 31528 cvttsd2si rbx, xmm6 31529 movq xmm5, rbx 31530 pshufd xmm6, xmm6, 238 # xmm6 = xmm6[2,3,2,3] 31531 cvttsd2si rbx, xmm6 31532 movq xmm6, rbx 31533 punpcklqdq xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 31534 cmpneqpd xmm3, xmm0 31535 andpd xmm3, xmm7 31536 cmpneqpd xmm4, xmm0 31537 andpd xmm4, xmm5 31538 movupd xmmword ptr 
[r8 + 8*rsi], xmm3 31539 movupd xmmword ptr [r8 + 8*rsi + 16], xmm4 31540 movupd xmm3, xmmword ptr [rcx + 8*rsi + 32] 31541 movupd xmm4, xmmword ptr [rcx + 8*rsi + 48] 31542 movapd xmm5, xmm3 31543 andpd xmm5, xmm1 31544 orpd xmm5, xmm2 31545 movapd xmm6, xmm4 31546 andpd xmm6, xmm1 31547 orpd xmm6, xmm2 31548 cvttsd2si rbx, xmm5 31549 movq xmm7, rbx 31550 pshufd xmm5, xmm5, 238 # xmm5 = xmm5[2,3,2,3] 31551 cvttsd2si rbx, xmm5 31552 movq xmm5, rbx 31553 punpcklqdq xmm7, xmm5 # xmm7 = xmm7[0],xmm5[0] 31554 cvttsd2si rbx, xmm6 31555 movq xmm5, rbx 31556 pshufd xmm6, xmm6, 238 # xmm6 = xmm6[2,3,2,3] 31557 cvttsd2si rbx, xmm6 31558 movq xmm6, rbx 31559 punpcklqdq xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 31560 cmpneqpd xmm3, xmm0 31561 andpd xmm3, xmm7 31562 cmpneqpd xmm4, xmm0 31563 andpd xmm4, xmm5 31564 movupd xmmword ptr [r8 + 8*rsi + 32], xmm3 31565 movupd xmmword ptr [r8 + 8*rsi + 48], xmm4 31566 add rsi, 8 31567 add rdi, 2 31568 jne .LBB4_800 31569 jmp .LBB4_1191 31570 .LBB4_801: 31571 mov edx, eax 31572 and edx, -4 31573 lea rsi, [rdx - 4] 31574 mov r9, rsi 31575 shr r9, 2 31576 add r9, 1 31577 test rsi, rsi 31578 je .LBB4_1196 31579 # %bb.802: 31580 mov rdi, r9 31581 and rdi, -2 31582 neg rdi 31583 xor esi, esi 31584 xorpd xmm8, xmm8 31585 cvtpd2ps xmm1, xmmword ptr [rip + .LCPI4_1] 31586 movaps xmm9, xmmword ptr [rip + .LCPI4_3] # xmm9 = [NaN,NaN,NaN,NaN] 31587 movshdup xmm3, xmm1 # xmm3 = xmm1[1,1,3,3] 31588 andps xmm3, xmm9 31589 andps xmm1, xmm9 31590 .LBB4_803: # =>This Inner Loop Header: Depth=1 31591 movupd xmm4, xmmword ptr [rcx + 8*rsi] 31592 movupd xmm6, xmmword ptr [rcx + 8*rsi + 16] 31593 xorps xmm5, xmm5 31594 cvtsd2ss xmm5, xmm4 31595 cmpeqpd xmm4, xmm8 31596 shufps xmm4, xmm4, 232 # xmm4 = xmm4[0,2,2,3] 31597 xorps xmm7, xmm7 31598 cvtsd2ss xmm7, xmm6 31599 cmpeqpd xmm6, xmm8 31600 shufps xmm6, xmm6, 232 # xmm6 = xmm6[0,2,2,3] 31601 movsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero 31602 cvtsd2ss xmm0, xmm0 31603 movaps xmm2, xmm9 31604 
andnps xmm2, xmm0 31605 orps xmm2, xmm3 31606 movaps xmm0, xmm9 31607 andnps xmm0, xmm5 31608 orps xmm0, xmm1 31609 unpcklps xmm0, xmm2 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 31610 andnps xmm4, xmm0 31611 movsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero 31612 cvtsd2ss xmm0, xmm0 31613 movaps xmm2, xmm9 31614 andnps xmm2, xmm0 31615 orps xmm2, xmm3 31616 movaps xmm0, xmm9 31617 andnps xmm0, xmm7 31618 orps xmm0, xmm1 31619 unpcklps xmm0, xmm2 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 31620 andnps xmm6, xmm0 31621 movlhps xmm4, xmm6 # xmm4 = xmm4[0],xmm6[0] 31622 movups xmmword ptr [r8 + 4*rsi], xmm4 31623 movupd xmm4, xmmword ptr [rcx + 8*rsi + 32] 31624 movupd xmm0, xmmword ptr [rcx + 8*rsi + 48] 31625 xorps xmm2, xmm2 31626 cvtsd2ss xmm2, xmm4 31627 cmpeqpd xmm4, xmm8 31628 shufps xmm4, xmm4, 232 # xmm4 = xmm4[0,2,2,3] 31629 xorps xmm5, xmm5 31630 cvtsd2ss xmm5, xmm0 31631 cmpeqpd xmm0, xmm8 31632 movsd xmm6, qword ptr [rcx + 8*rsi + 40] # xmm6 = mem[0],zero 31633 cvtsd2ss xmm6, xmm6 31634 shufps xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] 31635 movaps xmm7, xmm9 31636 andnps xmm7, xmm6 31637 orps xmm7, xmm3 31638 movaps xmm6, xmm9 31639 andnps xmm6, xmm2 31640 orps xmm6, xmm1 31641 unpcklps xmm6, xmm7 # xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 31642 andnps xmm4, xmm6 31643 movsd xmm2, qword ptr [rcx + 8*rsi + 56] # xmm2 = mem[0],zero 31644 cvtsd2ss xmm2, xmm2 31645 movaps xmm6, xmm9 31646 andnps xmm6, xmm2 31647 orps xmm6, xmm3 31648 movaps xmm2, xmm9 31649 andnps xmm2, xmm5 31650 orps xmm2, xmm1 31651 unpcklps xmm2, xmm6 # xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] 31652 andnps xmm0, xmm2 31653 movlhps xmm4, xmm0 # xmm4 = xmm4[0],xmm0[0] 31654 movups xmmword ptr [r8 + 4*rsi + 16], xmm4 31655 add rsi, 8 31656 add rdi, 2 31657 jne .LBB4_803 31658 jmp .LBB4_1197 31659 .LBB4_819: 31660 and edx, -4 31661 xor esi, esi 31662 movss xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero 31663 jmp .LBB4_821 31664 .LBB4_820: # in Loop: Header=BB4_821 Depth=1 
31665 movss dword ptr [r8 + 4*rsi + 12], xmm1 31666 add rsi, 4 31667 cmp rdx, rsi 31668 je .LBB4_387 31669 .LBB4_821: # =>This Inner Loop Header: Depth=1 31670 cmp qword ptr [rcx + 8*rsi], 0 31671 movapd xmm1, xmm0 31672 jne .LBB4_822 31673 # %bb.825: # in Loop: Header=BB4_821 Depth=1 31674 xorpd xmm1, xmm1 31675 movss dword ptr [r8 + 4*rsi], xmm1 31676 cmp qword ptr [rcx + 8*rsi + 8], 0 31677 movapd xmm1, xmm0 31678 je .LBB4_826 31679 .LBB4_823: # in Loop: Header=BB4_821 Depth=1 31680 movss dword ptr [r8 + 4*rsi + 4], xmm1 31681 cmp qword ptr [rcx + 8*rsi + 16], 0 31682 movapd xmm1, xmm0 31683 jne .LBB4_824 31684 .LBB4_827: # in Loop: Header=BB4_821 Depth=1 31685 xorpd xmm1, xmm1 31686 movss dword ptr [r8 + 4*rsi + 8], xmm1 31687 cmp qword ptr [rcx + 8*rsi + 24], 0 31688 movapd xmm1, xmm0 31689 jne .LBB4_820 31690 jmp .LBB4_828 31691 .LBB4_822: # in Loop: Header=BB4_821 Depth=1 31692 movss dword ptr [r8 + 4*rsi], xmm1 31693 cmp qword ptr [rcx + 8*rsi + 8], 0 31694 movapd xmm1, xmm0 31695 jne .LBB4_823 31696 .LBB4_826: # in Loop: Header=BB4_821 Depth=1 31697 xorpd xmm1, xmm1 31698 movss dword ptr [r8 + 4*rsi + 4], xmm1 31699 cmp qword ptr [rcx + 8*rsi + 16], 0 31700 movapd xmm1, xmm0 31701 je .LBB4_827 31702 .LBB4_824: # in Loop: Header=BB4_821 Depth=1 31703 movss dword ptr [r8 + 4*rsi + 8], xmm1 31704 cmp qword ptr [rcx + 8*rsi + 24], 0 31705 movapd xmm1, xmm0 31706 jne .LBB4_820 31707 .LBB4_828: # in Loop: Header=BB4_821 Depth=1 31708 xorpd xmm1, xmm1 31709 jmp .LBB4_820 31710 .LBB4_829: 31711 mov edx, eax 31712 and edx, -4 31713 lea rsi, [rdx - 4] 31714 mov r9, rsi 31715 shr r9, 2 31716 add r9, 1 31717 test rsi, rsi 31718 je .LBB4_1055 31719 # %bb.830: 31720 mov rdi, r9 31721 and rdi, -2 31722 neg rdi 31723 xor esi, esi 31724 pxor xmm0, xmm0 31725 pcmpeqd xmm1, xmm1 31726 movdqa xmm2, xmmword ptr [rip + .LCPI4_15] # xmm2 = [1,1] 31727 .LBB4_831: # =>This Inner Loop Header: Depth=1 31728 movd xmm3, dword ptr [rcx + 2*rsi] # xmm3 = mem[0],zero,zero,zero 31729 movd 
xmm4, dword ptr [rcx + 2*rsi + 4] # xmm4 = mem[0],zero,zero,zero 31730 pcmpeqw xmm3, xmm0 31731 pxor xmm3, xmm1 31732 pmovzxwq xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 31733 pand xmm3, xmm2 31734 pcmpeqw xmm4, xmm0 31735 pxor xmm4, xmm1 31736 pmovzxwq xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 31737 pand xmm4, xmm2 31738 movdqu xmmword ptr [r8 + 8*rsi], xmm3 31739 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm4 31740 movd xmm3, dword ptr [rcx + 2*rsi + 8] # xmm3 = mem[0],zero,zero,zero 31741 movd xmm4, dword ptr [rcx + 2*rsi + 12] # xmm4 = mem[0],zero,zero,zero 31742 pcmpeqw xmm3, xmm0 31743 pxor xmm3, xmm1 31744 pmovzxwq xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 31745 pand xmm3, xmm2 31746 pcmpeqw xmm4, xmm0 31747 pxor xmm4, xmm1 31748 pmovzxwq xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 31749 pand xmm4, xmm2 31750 movdqu xmmword ptr [r8 + 8*rsi + 32], xmm3 31751 movdqu xmmword ptr [r8 + 8*rsi + 48], xmm4 31752 add rsi, 8 31753 add rdi, 2 31754 jne .LBB4_831 31755 jmp .LBB4_1056 31756 .LBB4_832: 31757 mov edx, eax 31758 and edx, -8 31759 lea rsi, [rdx - 8] 31760 mov r9, rsi 31761 shr r9, 3 31762 add r9, 1 31763 test rsi, rsi 31764 je .LBB4_1204 31765 # %bb.833: 31766 mov rdi, r9 31767 and rdi, -2 31768 neg rdi 31769 xor esi, esi 31770 pxor xmm0, xmm0 31771 pcmpeqd xmm1, xmm1 31772 movdqa xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1] 31773 .LBB4_834: # =>This Inner Loop Header: Depth=1 31774 movq xmm3, qword ptr [rcx + 2*rsi] # xmm3 = mem[0],zero 31775 movq xmm4, qword ptr [rcx + 2*rsi + 8] # xmm4 = mem[0],zero 31776 pcmpeqw xmm3, xmm0 31777 pxor xmm3, xmm1 31778 pmovzxwd xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 31779 pand xmm3, xmm2 31780 cvtdq2ps xmm3, xmm3 31781 pcmpeqw xmm4, xmm0 31782 pxor xmm4, xmm1 31783 pmovzxwd xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 31784 pand xmm4, xmm2 31785 cvtdq2ps xmm4, xmm4 
31786 movups xmmword ptr [r8 + 4*rsi], xmm3 31787 movups xmmword ptr [r8 + 4*rsi + 16], xmm4 31788 movq xmm3, qword ptr [rcx + 2*rsi + 16] # xmm3 = mem[0],zero 31789 movq xmm4, qword ptr [rcx + 2*rsi + 24] # xmm4 = mem[0],zero 31790 pcmpeqw xmm3, xmm0 31791 pxor xmm3, xmm1 31792 pmovzxwd xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 31793 pand xmm3, xmm2 31794 cvtdq2ps xmm3, xmm3 31795 pcmpeqw xmm4, xmm0 31796 pxor xmm4, xmm1 31797 pmovzxwd xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 31798 pand xmm4, xmm2 31799 cvtdq2ps xmm4, xmm4 31800 movups xmmword ptr [r8 + 4*rsi + 32], xmm3 31801 movups xmmword ptr [r8 + 4*rsi + 48], xmm4 31802 add rsi, 16 31803 add rdi, 2 31804 jne .LBB4_834 31805 jmp .LBB4_1205 31806 .LBB4_835: 31807 mov edx, r10d 31808 and edx, -4 31809 lea rsi, [rdx - 4] 31810 mov r9, rsi 31811 shr r9, 2 31812 add r9, 1 31813 test rsi, rsi 31814 je .LBB4_1212 31815 # %bb.836: 31816 mov rdi, r9 31817 and rdi, -2 31818 neg rdi 31819 xor esi, esi 31820 pxor xmm2, xmm2 31821 pcmpeqd xmm3, xmm3 31822 movapd xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 31823 .LBB4_837: # =>This Inner Loop Header: Depth=1 31824 movd xmm5, dword ptr [rcx + 2*rsi] # xmm5 = mem[0],zero,zero,zero 31825 movd xmm6, dword ptr [rcx + 2*rsi + 4] # xmm6 = mem[0],zero,zero,zero 31826 movdqa xmm0, xmm5 31827 pcmpgtw xmm0, xmm2 31828 pmovsxwq xmm0, xmm0 31829 movdqa xmm1, xmm6 31830 pcmpgtw xmm1, xmm2 31831 pmovsxwq xmm1, xmm1 31832 pcmpeqw xmm5, xmm2 31833 pxor xmm5, xmm3 31834 pmovsxwq xmm5, xmm5 31835 pcmpeqw xmm6, xmm2 31836 pxor xmm6, xmm3 31837 pmovsxwq xmm6, xmm6 31838 blendvpd xmm5, xmm4, xmm0 31839 movdqa xmm0, xmm1 31840 blendvpd xmm6, xmm4, xmm0 31841 movupd xmmword ptr [r8 + 8*rsi], xmm5 31842 movupd xmmword ptr [r8 + 8*rsi + 16], xmm6 31843 movd xmm5, dword ptr [rcx + 2*rsi + 8] # xmm5 = mem[0],zero,zero,zero 31844 movd xmm6, dword ptr [rcx + 2*rsi + 12] # xmm6 = mem[0],zero,zero,zero 31845 movdqa xmm0, xmm5 31846 
pcmpgtw xmm0, xmm2 31847 pmovsxwq xmm0, xmm0 31848 movdqa xmm1, xmm6 31849 pcmpgtw xmm1, xmm2 31850 pmovsxwq xmm1, xmm1 31851 pcmpeqw xmm5, xmm2 31852 pxor xmm5, xmm3 31853 pmovsxwq xmm5, xmm5 31854 pcmpeqw xmm6, xmm2 31855 pxor xmm6, xmm3 31856 pmovsxwq xmm6, xmm6 31857 blendvpd xmm5, xmm4, xmm0 31858 movdqa xmm0, xmm1 31859 blendvpd xmm6, xmm4, xmm0 31860 movupd xmmword ptr [r8 + 8*rsi + 32], xmm5 31861 movupd xmmword ptr [r8 + 8*rsi + 48], xmm6 31862 add rsi, 8 31863 add rdi, 2 31864 jne .LBB4_837 31865 jmp .LBB4_1213 31866 .LBB4_838: 31867 mov edx, eax 31868 and edx, -8 31869 lea rsi, [rdx - 8] 31870 mov r9, rsi 31871 shr r9, 3 31872 add r9, 1 31873 test rsi, rsi 31874 je .LBB4_1218 31875 # %bb.839: 31876 mov rdi, r9 31877 and rdi, -2 31878 neg rdi 31879 xor esi, esi 31880 pxor xmm2, xmm2 31881 pcmpeqd xmm3, xmm3 31882 movaps xmm4, xmmword ptr [rip + .LCPI4_19] # xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 31883 .LBB4_840: # =>This Inner Loop Header: Depth=1 31884 movq xmm5, qword ptr [rcx + 2*rsi] # xmm5 = mem[0],zero 31885 movq xmm6, qword ptr [rcx + 2*rsi + 8] # xmm6 = mem[0],zero 31886 movdqa xmm0, xmm5 31887 pcmpgtw xmm0, xmm2 31888 pmovsxwd xmm0, xmm0 31889 movdqa xmm1, xmm6 31890 pcmpgtw xmm1, xmm2 31891 pmovsxwd xmm1, xmm1 31892 pcmpeqw xmm5, xmm2 31893 pxor xmm5, xmm3 31894 pmovsxwd xmm5, xmm5 31895 cvtdq2ps xmm5, xmm5 31896 pcmpeqw xmm6, xmm2 31897 pxor xmm6, xmm3 31898 pmovsxwd xmm6, xmm6 31899 cvtdq2ps xmm6, xmm6 31900 blendvps xmm5, xmm4, xmm0 31901 movdqa xmm0, xmm1 31902 blendvps xmm6, xmm4, xmm0 31903 movups xmmword ptr [r8 + 4*rsi], xmm5 31904 movups xmmword ptr [r8 + 4*rsi + 16], xmm6 31905 movq xmm5, qword ptr [rcx + 2*rsi + 16] # xmm5 = mem[0],zero 31906 movq xmm6, qword ptr [rcx + 2*rsi + 24] # xmm6 = mem[0],zero 31907 movdqa xmm0, xmm5 31908 pcmpgtw xmm0, xmm2 31909 pmovsxwd xmm0, xmm0 31910 movdqa xmm1, xmm6 31911 pcmpgtw xmm1, xmm2 31912 pmovsxwd xmm1, xmm1 31913 pcmpeqw xmm5, xmm2 31914 pxor xmm5, xmm3 31915 pmovsxwd xmm5, xmm5 31916 cvtdq2ps 
xmm5, xmm5 31917 pcmpeqw xmm6, xmm2 31918 pxor xmm6, xmm3 31919 pmovsxwd xmm6, xmm6 31920 cvtdq2ps xmm6, xmm6 31921 blendvps xmm5, xmm4, xmm0 31922 movdqa xmm0, xmm1 31923 blendvps xmm6, xmm4, xmm0 31924 movups xmmword ptr [r8 + 4*rsi + 32], xmm5 31925 movups xmmword ptr [r8 + 4*rsi + 48], xmm6 31926 add rsi, 16 31927 add rdi, 2 31928 jne .LBB4_840 31929 jmp .LBB4_1219 31930 .LBB4_846: 31931 mov esi, edx 31932 and esi, -2 31933 xor eax, eax 31934 movss xmm0, dword ptr [rip + .LCPI4_14] # xmm0 = mem[0],zero,zero,zero 31935 movss xmm1, dword ptr [rip + .LCPI4_5] # xmm1 = mem[0],zero,zero,zero 31936 jmp .LBB4_848 31937 .LBB4_847: # in Loop: Header=BB4_848 Depth=1 31938 movss dword ptr [r8 + 4*rax + 4], xmm3 31939 add rax, 2 31940 cmp rsi, rax 31941 je .LBB4_410 31942 .LBB4_848: # =>This Inner Loop Header: Depth=1 31943 cmp qword ptr [rcx + 8*rax], 0 31944 movapd xmm2, xmm0 31945 jne .LBB4_849 31946 # %bb.852: # in Loop: Header=BB4_848 Depth=1 31947 xorpd xmm2, xmm2 31948 movapd xmm3, xmm1 31949 jle .LBB4_853 31950 .LBB4_850: # in Loop: Header=BB4_848 Depth=1 31951 movss dword ptr [r8 + 4*rax], xmm3 31952 cmp qword ptr [rcx + 8*rax + 8], 0 31953 movapd xmm2, xmm0 31954 jne .LBB4_851 31955 .LBB4_854: # in Loop: Header=BB4_848 Depth=1 31956 xorpd xmm2, xmm2 31957 movapd xmm3, xmm1 31958 jg .LBB4_847 31959 jmp .LBB4_855 31960 .LBB4_849: # in Loop: Header=BB4_848 Depth=1 31961 movapd xmm3, xmm1 31962 jg .LBB4_850 31963 .LBB4_853: # in Loop: Header=BB4_848 Depth=1 31964 movapd xmm3, xmm2 31965 movss dword ptr [r8 + 4*rax], xmm3 31966 cmp qword ptr [rcx + 8*rax + 8], 0 31967 movapd xmm2, xmm0 31968 je .LBB4_854 31969 .LBB4_851: # in Loop: Header=BB4_848 Depth=1 31970 movapd xmm3, xmm1 31971 jg .LBB4_847 31972 .LBB4_855: # in Loop: Header=BB4_848 Depth=1 31973 movapd xmm3, xmm2 31974 jmp .LBB4_847 31975 .LBB4_856: 31976 mov esi, edx 31977 and esi, -2 31978 xor eax, eax 31979 xorps xmm0, xmm0 31980 jmp .LBB4_859 31981 .LBB4_857: # in Loop: Header=BB4_859 Depth=1 31982 movmskps 
edi, xmm1 31983 and edi, 1 31984 neg edi 31985 or edi, 1 31986 xorps xmm1, xmm1 31987 cvtsi2ss xmm1, edi 31988 cvttss2si rdi, xmm1 31989 mov qword ptr [r8 + 8*rax + 8], rdi 31990 add rax, 2 31991 cmp rsi, rax 31992 je .LBB4_416 31993 .LBB4_859: # =>This Inner Loop Header: Depth=1 31994 movss xmm1, dword ptr [rcx + 4*rax] # xmm1 = mem[0],zero,zero,zero 31995 ucomiss xmm0, xmm1 31996 jne .LBB4_861 31997 # %bb.860: # in Loop: Header=BB4_859 Depth=1 31998 xor edi, edi 31999 jmp .LBB4_862 32000 .LBB4_861: # in Loop: Header=BB4_859 Depth=1 32001 movmskps edi, xmm1 32002 and edi, 1 32003 neg edi 32004 or edi, 1 32005 xorps xmm1, xmm1 32006 cvtsi2ss xmm1, edi 32007 cvttss2si rdi, xmm1 32008 .LBB4_862: # in Loop: Header=BB4_859 Depth=1 32009 mov qword ptr [r8 + 8*rax], rdi 32010 movss xmm1, dword ptr [rcx + 4*rax + 4] # xmm1 = mem[0],zero,zero,zero 32011 ucomiss xmm0, xmm1 32012 jne .LBB4_857 32013 # %bb.863: # in Loop: Header=BB4_859 Depth=1 32014 xor edi, edi 32015 mov qword ptr [r8 + 8*rax + 8], rdi 32016 add rax, 2 32017 cmp rsi, rax 32018 jne .LBB4_859 32019 .LBB4_416: 32020 test dl, 1 32021 je .LBB4_1655 32022 # %bb.417: 32023 movss xmm0, dword ptr [rcx + 4*rax] # xmm0 = mem[0],zero,zero,zero 32024 xorps xmm1, xmm1 32025 ucomiss xmm1, xmm0 32026 jne .LBB4_1104 32027 # %bb.418: 32028 xor ecx, ecx 32029 jmp .LBB4_1105 32030 .LBB4_884: 32031 mov edx, r10d 32032 and edx, -4 32033 lea rsi, [rdx - 4] 32034 mov r9, rsi 32035 shr r9, 2 32036 add r9, 1 32037 test rsi, rsi 32038 je .LBB4_1060 32039 # %bb.885: 32040 mov rdi, r9 32041 and rdi, -2 32042 neg rdi 32043 xor esi, esi 32044 pxor xmm2, xmm2 32045 pcmpeqd xmm3, xmm3 32046 movapd xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 32047 .LBB4_886: # =>This Inner Loop Header: Depth=1 32048 movq xmm5, qword ptr [rcx + 4*rsi] # xmm5 = mem[0],zero 32049 movq xmm6, qword ptr [rcx + 4*rsi + 8] # xmm6 = mem[0],zero 32050 movdqa xmm0, xmm5 32051 pcmpgtd xmm0, xmm2 32052 pmovsxdq xmm0, xmm0 32053 movdqa xmm1, xmm6 32054 pcmpgtd 
xmm1, xmm2 32055 pmovsxdq xmm1, xmm1 32056 pcmpeqd xmm5, xmm2 32057 pxor xmm5, xmm3 32058 pmovsxdq xmm5, xmm5 32059 pcmpeqd xmm6, xmm2 32060 pxor xmm6, xmm3 32061 pmovsxdq xmm6, xmm6 32062 blendvpd xmm5, xmm4, xmm0 32063 movdqa xmm0, xmm1 32064 blendvpd xmm6, xmm4, xmm0 32065 movupd xmmword ptr [r8 + 8*rsi], xmm5 32066 movupd xmmword ptr [r8 + 8*rsi + 16], xmm6 32067 movq xmm5, qword ptr [rcx + 4*rsi + 16] # xmm5 = mem[0],zero 32068 movq xmm6, qword ptr [rcx + 4*rsi + 24] # xmm6 = mem[0],zero 32069 movdqa xmm0, xmm5 32070 pcmpgtd xmm0, xmm2 32071 pmovsxdq xmm0, xmm0 32072 movdqa xmm1, xmm6 32073 pcmpgtd xmm1, xmm2 32074 pmovsxdq xmm1, xmm1 32075 pcmpeqd xmm5, xmm2 32076 pxor xmm5, xmm3 32077 pmovsxdq xmm5, xmm5 32078 pcmpeqd xmm6, xmm2 32079 pxor xmm6, xmm3 32080 pmovsxdq xmm6, xmm6 32081 blendvpd xmm5, xmm4, xmm0 32082 movdqa xmm0, xmm1 32083 blendvpd xmm6, xmm4, xmm0 32084 movupd xmmword ptr [r8 + 8*rsi + 32], xmm5 32085 movupd xmmword ptr [r8 + 8*rsi + 48], xmm6 32086 add rsi, 8 32087 add rdi, 2 32088 jne .LBB4_886 32089 jmp .LBB4_1061 32090 .LBB4_887: 32091 mov edx, eax 32092 and edx, -8 32093 lea rsi, [rdx - 8] 32094 mov r9, rsi 32095 shr r9, 3 32096 add r9, 1 32097 test rsi, rsi 32098 je .LBB4_1066 32099 # %bb.888: 32100 mov rdi, r9 32101 and rdi, -2 32102 neg rdi 32103 xor esi, esi 32104 pxor xmm2, xmm2 32105 pcmpeqd xmm3, xmm3 32106 movaps xmm4, xmmword ptr [rip + .LCPI4_19] # xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 32107 .LBB4_889: # =>This Inner Loop Header: Depth=1 32108 movdqu xmm5, xmmword ptr [rcx + 4*rsi] 32109 movdqu xmm6, xmmword ptr [rcx + 4*rsi + 16] 32110 movdqa xmm0, xmm5 32111 pcmpgtd xmm0, xmm2 32112 movdqa xmm1, xmm6 32113 pcmpgtd xmm1, xmm2 32114 pcmpeqd xmm5, xmm2 32115 pxor xmm5, xmm3 32116 cvtdq2ps xmm5, xmm5 32117 pcmpeqd xmm6, xmm2 32118 pxor xmm6, xmm3 32119 cvtdq2ps xmm6, xmm6 32120 blendvps xmm5, xmm4, xmm0 32121 movdqa xmm0, xmm1 32122 blendvps xmm6, xmm4, xmm0 32123 movups xmmword ptr [r8 + 4*rsi], xmm5 32124 movups xmmword ptr [r8 + 
4*rsi + 16], xmm6 32125 movdqu xmm5, xmmword ptr [rcx + 4*rsi + 32] 32126 movdqu xmm6, xmmword ptr [rcx + 4*rsi + 48] 32127 movdqa xmm0, xmm5 32128 pcmpgtd xmm0, xmm2 32129 movdqa xmm1, xmm6 32130 pcmpgtd xmm1, xmm2 32131 pcmpeqd xmm5, xmm2 32132 pxor xmm5, xmm3 32133 cvtdq2ps xmm5, xmm5 32134 pcmpeqd xmm6, xmm2 32135 pxor xmm6, xmm3 32136 cvtdq2ps xmm6, xmm6 32137 blendvps xmm5, xmm4, xmm0 32138 movdqa xmm0, xmm1 32139 blendvps xmm6, xmm4, xmm0 32140 movups xmmword ptr [r8 + 4*rsi + 32], xmm5 32141 movups xmmword ptr [r8 + 4*rsi + 48], xmm6 32142 add rsi, 16 32143 add rdi, 2 32144 jne .LBB4_889 32145 jmp .LBB4_1067 32146 .LBB4_945: 32147 mov esi, eax 32148 and esi, -4 32149 lea rdx, [rsi - 4] 32150 mov r9, rdx 32151 shr r9, 2 32152 add r9, 1 32153 test rdx, rdx 32154 je .LBB4_1076 32155 # %bb.946: 32156 mov rdx, r9 32157 and rdx, -2 32158 neg rdx 32159 xor edi, edi 32160 xorpd xmm0, xmm0 32161 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 32162 movapd xmm2, xmmword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0] 32163 .LBB4_947: # =>This Inner Loop Header: Depth=1 32164 movupd xmm3, xmmword ptr [rcx + 8*rdi] 32165 movupd xmm4, xmmword ptr [rcx + 8*rdi + 16] 32166 movapd xmm5, xmm3 32167 cmpeqpd xmm5, xmm0 32168 shufps xmm5, xmm5, 232 # xmm5 = xmm5[0,2,2,3] 32169 movapd xmm6, xmm4 32170 cmpeqpd xmm6, xmm0 32171 shufps xmm6, xmm6, 232 # xmm6 = xmm6[0,2,2,3] 32172 andpd xmm3, xmm1 32173 orpd xmm3, xmm2 32174 andpd xmm4, xmm1 32175 orpd xmm4, xmm2 32176 cvttpd2dq xmm3, xmm3 32177 cvttpd2dq xmm4, xmm4 32178 andnps xmm5, xmm3 32179 andnps xmm6, xmm4 32180 movlhps xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 32181 movups xmmword ptr [r8 + 4*rdi], xmm5 32182 movupd xmm3, xmmword ptr [rcx + 8*rdi + 32] 32183 movupd xmm4, xmmword ptr [rcx + 8*rdi + 48] 32184 movapd xmm5, xmm3 32185 cmpeqpd xmm5, xmm0 32186 shufps xmm5, xmm5, 232 # xmm5 = xmm5[0,2,2,3] 32187 movapd xmm6, xmm4 32188 cmpeqpd xmm6, xmm0 32189 shufps xmm6, xmm6, 232 # xmm6 = xmm6[0,2,2,3] 32190 andpd 
xmm3, xmm1 32191 orpd xmm3, xmm2 32192 andpd xmm4, xmm1 32193 orpd xmm4, xmm2 32194 cvttpd2dq xmm3, xmm3 32195 andnps xmm5, xmm3 32196 cvttpd2dq xmm3, xmm4 32197 andnps xmm6, xmm3 32198 movlhps xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 32199 movups xmmword ptr [r8 + 4*rdi + 16], xmm5 32200 add rdi, 8 32201 add rdx, 2 32202 jne .LBB4_947 32203 jmp .LBB4_1077 32204 .LBB4_953: 32205 mov edx, eax 32206 and edx, -4 32207 lea rsi, [rdx - 4] 32208 mov r9, rsi 32209 shr r9, 2 32210 add r9, 1 32211 test rsi, rsi 32212 je .LBB4_1082 32213 # %bb.954: 32214 mov rdi, r9 32215 and rdi, -2 32216 neg rdi 32217 xor esi, esi 32218 pxor xmm0, xmm0 32219 movdqa xmm1, xmmword ptr [rip + .LCPI4_16] # xmm1 = <1,1,u,u> 32220 .LBB4_955: # =>This Inner Loop Header: Depth=1 32221 movdqu xmm2, xmmword ptr [rcx + 8*rsi] 32222 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 16] 32223 pcmpeqq xmm2, xmm0 32224 pshufd xmm2, xmm2, 232 # xmm2 = xmm2[0,2,2,3] 32225 pandn xmm2, xmm1 32226 pcmpeqq xmm3, xmm0 32227 pshufd xmm3, xmm3, 232 # xmm3 = xmm3[0,2,2,3] 32228 pandn xmm3, xmm1 32229 punpcklqdq xmm2, xmm3 # xmm2 = xmm2[0],xmm3[0] 32230 movdqu xmmword ptr [r8 + 4*rsi], xmm2 32231 movdqu xmm2, xmmword ptr [rcx + 8*rsi + 32] 32232 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 48] 32233 pcmpeqq xmm2, xmm0 32234 pshufd xmm2, xmm2, 232 # xmm2 = xmm2[0,2,2,3] 32235 pandn xmm2, xmm1 32236 pcmpeqq xmm3, xmm0 32237 pshufd xmm3, xmm3, 232 # xmm3 = xmm3[0,2,2,3] 32238 pandn xmm3, xmm1 32239 punpcklqdq xmm2, xmm3 # xmm2 = xmm2[0],xmm3[0] 32240 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm2 32241 add rsi, 8 32242 add rdi, 2 32243 jne .LBB4_955 32244 jmp .LBB4_1083 32245 .LBB4_956: 32246 mov edx, eax 32247 and edx, -8 32248 lea rsi, [rdx - 8] 32249 mov r9, rsi 32250 shr r9, 3 32251 add r9, 1 32252 test rsi, rsi 32253 je .LBB4_1087 32254 # %bb.957: 32255 mov rdi, r9 32256 and rdi, -2 32257 neg rdi 32258 xor esi, esi 32259 pxor xmm0, xmm0 32260 pcmpeqd xmm1, xmm1 32261 movdqa xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1] 32262 
.LBB4_958: # =>This Inner Loop Header: Depth=1 32263 movq xmm3, qword ptr [rcx + 2*rsi] # xmm3 = mem[0],zero 32264 movq xmm4, qword ptr [rcx + 2*rsi + 8] # xmm4 = mem[0],zero 32265 pcmpeqw xmm3, xmm0 32266 pxor xmm3, xmm1 32267 pmovzxwd xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 32268 pand xmm3, xmm2 32269 pcmpeqw xmm4, xmm0 32270 pxor xmm4, xmm1 32271 pmovzxwd xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 32272 pand xmm4, xmm2 32273 movdqu xmmword ptr [r8 + 4*rsi], xmm3 32274 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm4 32275 movq xmm3, qword ptr [rcx + 2*rsi + 16] # xmm3 = mem[0],zero 32276 movq xmm4, qword ptr [rcx + 2*rsi + 24] # xmm4 = mem[0],zero 32277 pcmpeqw xmm3, xmm0 32278 pxor xmm3, xmm1 32279 pmovzxwd xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 32280 pand xmm3, xmm2 32281 pcmpeqw xmm4, xmm0 32282 pxor xmm4, xmm1 32283 pmovzxwd xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 32284 pand xmm4, xmm2 32285 movdqu xmmword ptr [r8 + 4*rsi + 32], xmm3 32286 movdqu xmmword ptr [r8 + 4*rsi + 48], xmm4 32287 add rsi, 16 32288 add rdi, 2 32289 jne .LBB4_958 32290 jmp .LBB4_1088 32291 .LBB4_959: 32292 mov edx, r10d 32293 and edx, -8 32294 lea rsi, [rdx - 8] 32295 mov r9, rsi 32296 shr r9, 3 32297 add r9, 1 32298 test rsi, rsi 32299 je .LBB4_1092 32300 # %bb.960: 32301 mov rdi, r9 32302 and rdi, -2 32303 neg rdi 32304 xor esi, esi 32305 pxor xmm2, xmm2 32306 pcmpeqd xmm3, xmm3 32307 movaps xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 32308 .LBB4_961: # =>This Inner Loop Header: Depth=1 32309 movq xmm5, qword ptr [rcx + 2*rsi] # xmm5 = mem[0],zero 32310 movq xmm6, qword ptr [rcx + 2*rsi + 8] # xmm6 = mem[0],zero 32311 movdqa xmm0, xmm5 32312 pcmpgtw xmm0, xmm2 32313 pmovsxwd xmm0, xmm0 32314 movdqa xmm1, xmm6 32315 pcmpgtw xmm1, xmm2 32316 pmovsxwd xmm1, xmm1 32317 pcmpeqw xmm5, xmm2 32318 pxor xmm5, xmm3 32319 pmovsxwd xmm5, xmm5 32320 pcmpeqw xmm6, xmm2 
32321 pxor xmm6, xmm3 32322 pmovsxwd xmm6, xmm6 32323 blendvps xmm5, xmm4, xmm0 32324 movdqa xmm0, xmm1 32325 blendvps xmm6, xmm4, xmm0 32326 movups xmmword ptr [r8 + 4*rsi], xmm5 32327 movups xmmword ptr [r8 + 4*rsi + 16], xmm6 32328 movq xmm5, qword ptr [rcx + 2*rsi + 16] # xmm5 = mem[0],zero 32329 movq xmm6, qword ptr [rcx + 2*rsi + 24] # xmm6 = mem[0],zero 32330 movdqa xmm0, xmm5 32331 pcmpgtw xmm0, xmm2 32332 pmovsxwd xmm0, xmm0 32333 movdqa xmm1, xmm6 32334 pcmpgtw xmm1, xmm2 32335 pmovsxwd xmm1, xmm1 32336 pcmpeqw xmm5, xmm2 32337 pxor xmm5, xmm3 32338 pmovsxwd xmm5, xmm5 32339 pcmpeqw xmm6, xmm2 32340 pxor xmm6, xmm3 32341 pmovsxwd xmm6, xmm6 32342 blendvps xmm5, xmm4, xmm0 32343 movdqa xmm0, xmm1 32344 blendvps xmm6, xmm4, xmm0 32345 movups xmmword ptr [r8 + 4*rsi + 32], xmm5 32346 movups xmmword ptr [r8 + 4*rsi + 48], xmm6 32347 add rsi, 16 32348 add rdi, 2 32349 jne .LBB4_961 32350 jmp .LBB4_1093 32351 .LBB4_962: 32352 mov edx, r10d 32353 and edx, -4 32354 lea rsi, [rdx - 4] 32355 mov r9, rsi 32356 shr r9, 2 32357 add r9, 1 32358 test rsi, rsi 32359 je .LBB4_1098 32360 # %bb.963: 32361 mov rdi, r9 32362 and rdi, -2 32363 neg rdi 32364 xor esi, esi 32365 pxor xmm2, xmm2 32366 pcmpeqd xmm3, xmm3 32367 movaps xmm4, xmmword ptr [rip + .LCPI4_16] # xmm4 = <1,1,u,u> 32368 .LBB4_964: # =>This Inner Loop Header: Depth=1 32369 movdqu xmm5, xmmword ptr [rcx + 8*rsi] 32370 movdqu xmm6, xmmword ptr [rcx + 8*rsi + 16] 32371 movdqa xmm0, xmm5 32372 pcmpgtq xmm0, xmm2 32373 pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] 32374 movdqa xmm1, xmm6 32375 pcmpgtq xmm1, xmm2 32376 pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] 32377 pcmpeqq xmm5, xmm2 32378 pshufd xmm5, xmm5, 232 # xmm5 = xmm5[0,2,2,3] 32379 pxor xmm5, xmm3 32380 pcmpeqq xmm6, xmm2 32381 pshufd xmm6, xmm6, 232 # xmm6 = xmm6[0,2,2,3] 32382 pxor xmm6, xmm3 32383 blendvps xmm5, xmm4, xmm0 32384 movdqa xmm0, xmm1 32385 blendvps xmm6, xmm4, xmm0 32386 movlhps xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 32387 movups xmmword 
ptr [r8 + 4*rsi], xmm5 32388 movdqu xmm5, xmmword ptr [rcx + 8*rsi + 32] 32389 movdqu xmm6, xmmword ptr [rcx + 8*rsi + 48] 32390 movdqa xmm0, xmm5 32391 pcmpgtq xmm0, xmm2 32392 pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] 32393 movdqa xmm1, xmm6 32394 pcmpgtq xmm1, xmm2 32395 pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] 32396 pcmpeqq xmm5, xmm2 32397 pshufd xmm5, xmm5, 232 # xmm5 = xmm5[0,2,2,3] 32398 pxor xmm5, xmm3 32399 pcmpeqq xmm6, xmm2 32400 pshufd xmm6, xmm6, 232 # xmm6 = xmm6[0,2,2,3] 32401 pxor xmm6, xmm3 32402 blendvps xmm5, xmm4, xmm0 32403 movdqa xmm0, xmm1 32404 blendvps xmm6, xmm4, xmm0 32405 movlhps xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 32406 movups xmmword ptr [r8 + 4*rsi + 16], xmm5 32407 add rsi, 8 32408 add rdi, 2 32409 jne .LBB4_964 32410 jmp .LBB4_1099 32411 .LBB4_965: 32412 mov edx, eax 32413 and edx, -8 32414 xor esi, esi 32415 xorps xmm0, xmm0 32416 movdqa xmm1, xmmword ptr [rip + .LCPI4_8] # xmm1 = [1,1,1,1] 32417 .LBB4_966: # =>This Inner Loop Header: Depth=1 32418 movdqu xmm2, xmmword ptr [rcx + 4*rsi] 32419 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 16] 32420 movdqa xmm4, xmm2 32421 psrad xmm4, 31 32422 por xmm4, xmm1 32423 movdqa xmm5, xmm3 32424 psrad xmm5, 31 32425 por xmm5, xmm1 32426 cvtdq2ps xmm4, xmm4 32427 cvtdq2ps xmm5, xmm5 32428 cvttps2dq xmm4, xmm4 32429 cvttps2dq xmm5, xmm5 32430 cmpneqps xmm2, xmm0 32431 andps xmm2, xmm4 32432 cmpneqps xmm3, xmm0 32433 andps xmm3, xmm5 32434 movups xmmword ptr [r8 + 4*rsi], xmm2 32435 movups xmmword ptr [r8 + 4*rsi + 16], xmm3 32436 add rsi, 8 32437 cmp rdx, rsi 32438 jne .LBB4_966 32439 # %bb.967: 32440 cmp rdx, rax 32441 je .LBB4_1655 32442 .LBB4_968: 32443 xorps xmm0, xmm0 32444 jmp .LBB4_970 32445 .LBB4_969: # in Loop: Header=BB4_970 Depth=1 32446 mov dword ptr [r8 + 4*rdx], esi 32447 add rdx, 1 32448 cmp rax, rdx 32449 je .LBB4_1655 32450 .LBB4_970: # =>This Inner Loop Header: Depth=1 32451 movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero 32452 xor esi, esi 32453 
ucomiss xmm0, xmm1 32454 je .LBB4_969 32455 # %bb.971: # in Loop: Header=BB4_970 Depth=1 32456 movmskps esi, xmm1 32457 and esi, 1 32458 neg esi 32459 or esi, 1 32460 xorps xmm1, xmm1 32461 cvtsi2ss xmm1, esi 32462 cvttss2si esi, xmm1 32463 jmp .LBB4_969 32464 .LBB4_496: 32465 mov edx, r10d 32466 and edx, -8 32467 lea rsi, [rdx - 8] 32468 mov r9, rsi 32469 shr r9, 3 32470 add r9, 1 32471 test rsi, rsi 32472 je .LBB4_1228 32473 # %bb.497: 32474 mov rdi, r9 32475 and rdi, -2 32476 neg rdi 32477 xor esi, esi 32478 pxor xmm0, xmm0 32479 movdqa xmm1, xmmword ptr [rip + .LCPI4_8] # xmm1 = [1,1,1,1] 32480 .LBB4_498: # =>This Inner Loop Header: Depth=1 32481 movdqu xmm2, xmmword ptr [rcx + 4*rsi] 32482 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 16] 32483 pcmpeqd xmm2, xmm0 32484 pandn xmm2, xmm1 32485 pcmpeqd xmm3, xmm0 32486 pandn xmm3, xmm1 32487 movdqu xmmword ptr [r8 + 4*rsi], xmm2 32488 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm3 32489 movdqu xmm2, xmmword ptr [rcx + 4*rsi + 32] 32490 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 48] 32491 pcmpeqd xmm2, xmm0 32492 pandn xmm2, xmm1 32493 pcmpeqd xmm3, xmm0 32494 pandn xmm3, xmm1 32495 movdqu xmmword ptr [r8 + 4*rsi + 32], xmm2 32496 movdqu xmmword ptr [r8 + 4*rsi + 48], xmm3 32497 add rsi, 16 32498 add rdi, 2 32499 jne .LBB4_498 32500 jmp .LBB4_1229 32501 .LBB4_504: 32502 mov edx, r10d 32503 and edx, -8 32504 lea rsi, [rdx - 8] 32505 mov r9, rsi 32506 shr r9, 3 32507 add r9, 1 32508 test rsi, rsi 32509 je .LBB4_1236 32510 # %bb.505: 32511 mov rdi, r9 32512 and rdi, -2 32513 neg rdi 32514 xor esi, esi 32515 pxor xmm2, xmm2 32516 pcmpeqd xmm3, xmm3 32517 movaps xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 32518 .LBB4_506: # =>This Inner Loop Header: Depth=1 32519 movd xmm5, dword ptr [rcx + rsi] # xmm5 = mem[0],zero,zero,zero 32520 movd xmm6, dword ptr [rcx + rsi + 4] # xmm6 = mem[0],zero,zero,zero 32521 movdqa xmm0, xmm5 32522 pcmpgtb xmm0, xmm2 32523 pmovsxbd xmm0, xmm0 32524 movdqa xmm1, xmm6 32525 pcmpgtb xmm1, xmm2 
32526 pmovsxbd xmm1, xmm1 32527 pcmpeqb xmm5, xmm2 32528 pxor xmm5, xmm3 32529 pmovsxbd xmm5, xmm5 32530 pcmpeqb xmm6, xmm2 32531 pxor xmm6, xmm3 32532 pmovsxbd xmm6, xmm6 32533 blendvps xmm5, xmm4, xmm0 32534 movdqa xmm0, xmm1 32535 blendvps xmm6, xmm4, xmm0 32536 movups xmmword ptr [r8 + 4*rsi], xmm5 32537 movups xmmword ptr [r8 + 4*rsi + 16], xmm6 32538 movd xmm5, dword ptr [rcx + rsi + 8] # xmm5 = mem[0],zero,zero,zero 32539 movd xmm6, dword ptr [rcx + rsi + 12] # xmm6 = mem[0],zero,zero,zero 32540 movdqa xmm0, xmm5 32541 pcmpgtb xmm0, xmm2 32542 pmovsxbd xmm0, xmm0 32543 movdqa xmm1, xmm6 32544 pcmpgtb xmm1, xmm2 32545 pmovsxbd xmm1, xmm1 32546 pcmpeqb xmm5, xmm2 32547 pxor xmm5, xmm3 32548 pmovsxbd xmm5, xmm5 32549 pcmpeqb xmm6, xmm2 32550 pxor xmm6, xmm3 32551 pmovsxbd xmm6, xmm6 32552 blendvps xmm5, xmm4, xmm0 32553 movdqa xmm0, xmm1 32554 blendvps xmm6, xmm4, xmm0 32555 movups xmmword ptr [r8 + 4*rsi + 32], xmm5 32556 movups xmmword ptr [r8 + 4*rsi + 48], xmm6 32557 add rsi, 16 32558 add rdi, 2 32559 jne .LBB4_506 32560 jmp .LBB4_1237 32561 .LBB4_524: 32562 mov edx, r10d 32563 and edx, -8 32564 lea rsi, [rdx - 8] 32565 mov r9, rsi 32566 shr r9, 3 32567 add r9, 1 32568 test rsi, rsi 32569 je .LBB4_1245 32570 # %bb.525: 32571 mov rdi, r9 32572 and rdi, -2 32573 neg rdi 32574 xor esi, esi 32575 pxor xmm0, xmm0 32576 pcmpeqd xmm1, xmm1 32577 movdqa xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1] 32578 .LBB4_526: # =>This Inner Loop Header: Depth=1 32579 movd xmm3, dword ptr [rcx + rsi] # xmm3 = mem[0],zero,zero,zero 32580 movd xmm4, dword ptr [rcx + rsi + 4] # xmm4 = mem[0],zero,zero,zero 32581 pcmpeqb xmm3, xmm0 32582 pxor xmm3, xmm1 32583 pmovzxbd xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 32584 pand xmm3, xmm2 32585 pcmpeqb xmm4, xmm0 32586 pxor xmm4, xmm1 32587 pmovzxbd xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero 
32588 pand xmm4, xmm2 32589 movdqu xmmword ptr [r8 + 4*rsi], xmm3 32590 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm4 32591 movd xmm3, dword ptr [rcx + rsi + 8] # xmm3 = mem[0],zero,zero,zero 32592 movd xmm4, dword ptr [rcx + rsi + 12] # xmm4 = mem[0],zero,zero,zero 32593 pcmpeqb xmm3, xmm0 32594 pxor xmm3, xmm1 32595 pmovzxbd xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 32596 pand xmm3, xmm2 32597 pcmpeqb xmm4, xmm0 32598 pxor xmm4, xmm1 32599 pmovzxbd xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero 32600 pand xmm4, xmm2 32601 movdqu xmmword ptr [r8 + 4*rsi + 32], xmm3 32602 movdqu xmmword ptr [r8 + 4*rsi + 48], xmm4 32603 add rsi, 16 32604 add rdi, 2 32605 jne .LBB4_526 32606 jmp .LBB4_1246 32607 .LBB4_529: 32608 mov edx, r11d 32609 and edx, -8 32610 lea rsi, [rdx - 8] 32611 mov r9, rsi 32612 shr r9, 3 32613 add r9, 1 32614 test rsi, rsi 32615 je .LBB4_1253 32616 # %bb.530: 32617 mov rdi, r9 32618 and rdi, -2 32619 neg rdi 32620 xor esi, esi 32621 pxor xmm2, xmm2 32622 pcmpeqd xmm3, xmm3 32623 movdqa xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 32624 .LBB4_531: # =>This Inner Loop Header: Depth=1 32625 movdqu xmm5, xmmword ptr [rcx + 4*rsi] 32626 movdqu xmm6, xmmword ptr [rcx + 4*rsi + 16] 32627 movdqa xmm0, xmm4 32628 pcmpgtd xmm0, xmm5 32629 pcmpeqd xmm5, xmm2 32630 pxor xmm5, xmm3 32631 movdqa xmm1, xmm4 32632 pcmpgtd xmm1, xmm6 32633 pcmpeqd xmm6, xmm2 32634 pxor xmm6, xmm3 32635 movdqa xmm7, xmm4 32636 blendvps xmm7, xmm5, xmm0 32637 movdqa xmm5, xmm4 32638 movdqa xmm0, xmm1 32639 blendvps xmm5, xmm6, xmm0 32640 movups xmmword ptr [r8 + 4*rsi], xmm7 32641 movups xmmword ptr [r8 + 4*rsi + 16], xmm5 32642 movdqu xmm5, xmmword ptr [rcx + 4*rsi + 32] 32643 movdqu xmm6, xmmword ptr [rcx + 4*rsi + 48] 32644 movdqa xmm0, xmm4 32645 pcmpgtd xmm0, xmm5 32646 pcmpeqd xmm5, xmm2 32647 pxor xmm5, xmm3 32648 movdqa xmm1, xmm4 32649 
pcmpgtd xmm1, xmm6 32650 pcmpeqd xmm6, xmm2 32651 pxor xmm6, xmm3 32652 movdqa xmm7, xmm4 32653 blendvps xmm7, xmm5, xmm0 32654 movdqa xmm5, xmm4 32655 movdqa xmm0, xmm1 32656 blendvps xmm5, xmm6, xmm0 32657 movups xmmword ptr [r8 + 4*rsi + 32], xmm7 32658 movups xmmword ptr [r8 + 4*rsi + 48], xmm5 32659 add rsi, 16 32660 add rdi, 2 32661 jne .LBB4_531 32662 jmp .LBB4_1254 32663 .LBB4_544: 32664 mov edx, eax 32665 and edx, -4 32666 lea rsi, [rdx - 4] 32667 mov r9, rsi 32668 shr r9, 2 32669 add r9, 1 32670 test rsi, rsi 32671 je .LBB4_1262 32672 # %bb.545: 32673 mov rdi, r9 32674 and rdi, -2 32675 neg rdi 32676 xor esi, esi 32677 xorpd xmm0, xmm0 32678 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 32679 movapd xmm2, xmmword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0] 32680 .LBB4_546: # =>This Inner Loop Header: Depth=1 32681 movupd xmm3, xmmword ptr [rcx + 8*rsi] 32682 movupd xmm4, xmmword ptr [rcx + 8*rsi + 16] 32683 movapd xmm5, xmm3 32684 andpd xmm5, xmm1 32685 orpd xmm5, xmm2 32686 movapd xmm6, xmm4 32687 andpd xmm6, xmm1 32688 orpd xmm6, xmm2 32689 cmpneqpd xmm3, xmm0 32690 andpd xmm3, xmm5 32691 cmpneqpd xmm4, xmm0 32692 andpd xmm4, xmm6 32693 movupd xmmword ptr [r8 + 8*rsi], xmm3 32694 movupd xmmword ptr [r8 + 8*rsi + 16], xmm4 32695 movupd xmm3, xmmword ptr [rcx + 8*rsi + 32] 32696 movupd xmm4, xmmword ptr [rcx + 8*rsi + 48] 32697 movapd xmm5, xmm3 32698 andpd xmm5, xmm1 32699 orpd xmm5, xmm2 32700 movapd xmm6, xmm4 32701 andpd xmm6, xmm1 32702 orpd xmm6, xmm2 32703 cmpneqpd xmm3, xmm0 32704 andpd xmm3, xmm5 32705 cmpneqpd xmm4, xmm0 32706 andpd xmm4, xmm6 32707 movupd xmmword ptr [r8 + 8*rsi + 32], xmm3 32708 movupd xmmword ptr [r8 + 8*rsi + 48], xmm4 32709 add rsi, 8 32710 add rdi, 2 32711 jne .LBB4_546 32712 jmp .LBB4_1263 32713 .LBB4_625: 32714 mov edx, eax 32715 and edx, -8 32716 lea rsi, [rdx - 8] 32717 mov r9, rsi 32718 shr r9, 3 32719 add r9, 1 32720 test rsi, rsi 32721 je .LBB4_1271 32722 # %bb.626: 32723 mov rdi, r9 32724 
and rdi, -2 32725 neg rdi 32726 xor esi, esi 32727 pxor xmm0, xmm0 32728 pcmpeqd xmm1, xmm1 32729 movdqa xmm2, xmmword ptr [rip + .LCPI4_12] # xmm2 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u> 32730 .LBB4_627: # =>This Inner Loop Header: Depth=1 32731 movdqu xmm3, xmmword ptr [rcx + 4*rsi] 32732 movdqu xmm4, xmmword ptr [rcx + 4*rsi + 16] 32733 pcmpeqd xmm3, xmm0 32734 pxor xmm3, xmm1 32735 packssdw xmm3, xmm3 32736 packsswb xmm3, xmm3 32737 pand xmm3, xmm2 32738 pcmpeqd xmm4, xmm0 32739 pxor xmm4, xmm1 32740 packssdw xmm4, xmm4 32741 packsswb xmm4, xmm4 32742 pand xmm4, xmm2 32743 movd dword ptr [r8 + rsi], xmm3 32744 movd dword ptr [r8 + rsi + 4], xmm4 32745 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 32] 32746 movdqu xmm4, xmmword ptr [rcx + 4*rsi + 48] 32747 pcmpeqd xmm3, xmm0 32748 pxor xmm3, xmm1 32749 packssdw xmm3, xmm3 32750 packsswb xmm3, xmm3 32751 pand xmm3, xmm2 32752 pcmpeqd xmm4, xmm0 32753 pxor xmm4, xmm1 32754 packssdw xmm4, xmm4 32755 packsswb xmm4, xmm4 32756 pand xmm4, xmm2 32757 movd dword ptr [r8 + rsi + 8], xmm3 32758 movd dword ptr [r8 + rsi + 12], xmm4 32759 add rsi, 16 32760 add rdi, 2 32761 jne .LBB4_627 32762 jmp .LBB4_1272 32763 .LBB4_630: 32764 mov edx, eax 32765 and edx, -4 32766 lea rsi, [rdx - 4] 32767 mov r9, rsi 32768 shr r9, 2 32769 add r9, 1 32770 test rsi, rsi 32771 je .LBB4_1279 32772 # %bb.631: 32773 mov rdi, r9 32774 and rdi, -2 32775 neg rdi 32776 xor esi, esi 32777 xorpd xmm2, xmm2 32778 movapd xmm3, xmmword ptr [rip + .LCPI4_0] # xmm3 = [-0.0E+0,-0.0E+0] 32779 movapd xmm4, xmmword ptr [rip + .LCPI4_1] # xmm4 = [1.0E+0,1.0E+0] 32780 movdqa xmm5, xmmword ptr [rip + .LCPI4_7] # xmm5 = <0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 32781 .LBB4_632: # =>This Inner Loop Header: Depth=1 32782 movupd xmm6, xmmword ptr [rcx + 8*rsi] 32783 movupd xmm7, xmmword ptr [rcx + 8*rsi + 16] 32784 movapd xmm0, xmm6 32785 cmpeqpd xmm0, xmm2 32786 packssdw xmm0, xmm0 32787 packssdw xmm0, xmm0 32788 packsswb xmm0, xmm0 32789 movapd xmm1, xmm7 32790 cmpeqpd xmm1, xmm2 
32791 packssdw xmm1, xmm1 32792 packssdw xmm1, xmm1 32793 packsswb xmm1, xmm1 32794 andpd xmm6, xmm3 32795 orpd xmm6, xmm4 32796 andpd xmm7, xmm3 32797 orpd xmm7, xmm4 32798 cvttpd2dq xmm6, xmm6 32799 pshufb xmm6, xmm5 32800 cvttpd2dq xmm7, xmm7 32801 pshufb xmm7, xmm5 32802 pblendvb xmm6, xmm2, xmm0 32803 movdqa xmm0, xmm1 32804 pblendvb xmm7, xmm2, xmm0 32805 pextrw word ptr [r8 + rsi], xmm6, 0 32806 pextrw word ptr [r8 + rsi + 2], xmm7, 0 32807 movupd xmm6, xmmword ptr [rcx + 8*rsi + 32] 32808 movupd xmm7, xmmword ptr [rcx + 8*rsi + 48] 32809 movapd xmm0, xmm6 32810 cmpeqpd xmm0, xmm2 32811 packssdw xmm0, xmm0 32812 packssdw xmm0, xmm0 32813 packsswb xmm0, xmm0 32814 movapd xmm1, xmm7 32815 cmpeqpd xmm1, xmm2 32816 packssdw xmm1, xmm1 32817 packssdw xmm1, xmm1 32818 packsswb xmm1, xmm1 32819 andpd xmm6, xmm3 32820 orpd xmm6, xmm4 32821 andpd xmm7, xmm3 32822 orpd xmm7, xmm4 32823 cvttpd2dq xmm6, xmm6 32824 pshufb xmm6, xmm5 32825 cvttpd2dq xmm7, xmm7 32826 pshufb xmm7, xmm5 32827 pblendvb xmm6, xmm2, xmm0 32828 movdqa xmm0, xmm1 32829 pblendvb xmm7, xmm2, xmm0 32830 pextrw word ptr [r8 + rsi + 4], xmm6, 0 32831 pextrw word ptr [r8 + rsi + 6], xmm7, 0 32832 add rsi, 8 32833 add rdi, 2 32834 jne .LBB4_632 32835 jmp .LBB4_1280 32836 .LBB4_635: 32837 mov esi, r10d 32838 and esi, -32 32839 lea rax, [rsi - 32] 32840 mov r9, rax 32841 shr r9, 5 32842 add r9, 1 32843 test rax, rax 32844 je .LBB4_1288 32845 # %bb.636: 32846 mov rdi, r9 32847 and rdi, -2 32848 neg rdi 32849 xor eax, eax 32850 pxor xmm2, xmm2 32851 pcmpeqd xmm3, xmm3 32852 movdqa xmm4, xmmword ptr [rip + .LCPI4_22] # xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 32853 .LBB4_637: # =>This Inner Loop Header: Depth=1 32854 movdqu xmm5, xmmword ptr [rcx + rax] 32855 movdqu xmm6, xmmword ptr [rcx + rax + 16] 32856 movdqa xmm0, xmm4 32857 pcmpgtb xmm0, xmm5 32858 pcmpeqb xmm5, xmm2 32859 pxor xmm5, xmm3 32860 movdqa xmm1, xmm4 32861 pcmpgtb xmm1, xmm6 32862 pcmpeqb xmm6, xmm2 32863 pxor xmm6, xmm3 32864 movdqa xmm7, 
xmm4 32865 pblendvb xmm7, xmm5, xmm0 32866 movdqa xmm5, xmm4 32867 movdqa xmm0, xmm1 32868 pblendvb xmm5, xmm6, xmm0 32869 movdqu xmmword ptr [r8 + rax], xmm7 32870 movdqu xmmword ptr [r8 + rax + 16], xmm5 32871 movdqu xmm5, xmmword ptr [rcx + rax + 32] 32872 movdqu xmm6, xmmword ptr [rcx + rax + 48] 32873 movdqa xmm0, xmm4 32874 pcmpgtb xmm0, xmm5 32875 pcmpeqb xmm5, xmm2 32876 pxor xmm5, xmm3 32877 movdqa xmm1, xmm4 32878 pcmpgtb xmm1, xmm6 32879 pcmpeqb xmm6, xmm2 32880 pxor xmm6, xmm3 32881 movdqa xmm7, xmm4 32882 pblendvb xmm7, xmm5, xmm0 32883 movdqa xmm5, xmm4 32884 movdqa xmm0, xmm1 32885 pblendvb xmm5, xmm6, xmm0 32886 movdqu xmmword ptr [r8 + rax + 32], xmm7 32887 movdqu xmmword ptr [r8 + rax + 48], xmm5 32888 add rax, 64 32889 add rdi, 2 32890 jne .LBB4_637 32891 jmp .LBB4_1289 32892 .LBB4_640: 32893 mov edx, eax 32894 and edx, -4 32895 lea rsi, [rdx - 4] 32896 mov r9, rsi 32897 shr r9, 2 32898 add r9, 1 32899 test rsi, rsi 32900 je .LBB4_1297 32901 # %bb.641: 32902 mov rdi, r9 32903 and rdi, -2 32904 neg rdi 32905 xor esi, esi 32906 pxor xmm0, xmm0 32907 pcmpeqd xmm1, xmm1 32908 movdqa xmm2, xmmword ptr [rip + .LCPI4_18] # xmm2 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 32909 .LBB4_642: # =>This Inner Loop Header: Depth=1 32910 movdqu xmm3, xmmword ptr [rcx + 8*rsi] 32911 movdqu xmm4, xmmword ptr [rcx + 8*rsi + 16] 32912 pcmpeqq xmm3, xmm0 32913 pxor xmm3, xmm1 32914 packssdw xmm3, xmm3 32915 packssdw xmm3, xmm3 32916 packsswb xmm3, xmm3 32917 pand xmm3, xmm2 32918 pcmpeqq xmm4, xmm0 32919 pxor xmm4, xmm1 32920 packssdw xmm4, xmm4 32921 packssdw xmm4, xmm4 32922 packsswb xmm4, xmm4 32923 pextrw word ptr [r8 + rsi], xmm3, 0 32924 pand xmm4, xmm2 32925 pextrw word ptr [r8 + rsi + 2], xmm4, 0 32926 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 32] 32927 movdqu xmm4, xmmword ptr [rcx + 8*rsi + 48] 32928 pcmpeqq xmm3, xmm0 32929 pxor xmm3, xmm1 32930 packssdw xmm3, xmm3 32931 packssdw xmm3, xmm3 32932 packsswb xmm3, xmm3 32933 pand xmm3, xmm2 32934 pcmpeqq xmm4, xmm0 
32935 pxor xmm4, xmm1 32936 packssdw xmm4, xmm4 32937 packssdw xmm4, xmm4 32938 packsswb xmm4, xmm4 32939 pextrw word ptr [r8 + rsi + 4], xmm3, 0 32940 pand xmm4, xmm2 32941 pextrw word ptr [r8 + rsi + 6], xmm4, 0 32942 add rsi, 8 32943 add rdi, 2 32944 jne .LBB4_642 32945 jmp .LBB4_1298 32946 .LBB4_645: 32947 mov edx, eax 32948 and edx, -16 32949 lea rsi, [rdx - 16] 32950 mov r9, rsi 32951 shr r9, 4 32952 add r9, 1 32953 test rsi, rsi 32954 je .LBB4_1305 32955 # %bb.646: 32956 mov rdi, r9 32957 and rdi, -2 32958 neg rdi 32959 xor esi, esi 32960 pxor xmm0, xmm0 32961 pcmpeqd xmm1, xmm1 32962 movdqa xmm2, xmmword ptr [rip + .LCPI4_21] # xmm2 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u> 32963 .LBB4_647: # =>This Inner Loop Header: Depth=1 32964 movdqu xmm3, xmmword ptr [rcx + 2*rsi] 32965 movdqu xmm4, xmmword ptr [rcx + 2*rsi + 16] 32966 pcmpeqw xmm3, xmm0 32967 pxor xmm3, xmm1 32968 packsswb xmm3, xmm3 32969 pand xmm3, xmm2 32970 pcmpeqw xmm4, xmm0 32971 pxor xmm4, xmm1 32972 packsswb xmm4, xmm4 32973 pand xmm4, xmm2 32974 punpcklqdq xmm3, xmm4 # xmm3 = xmm3[0],xmm4[0] 32975 movdqu xmmword ptr [r8 + rsi], xmm3 32976 movdqu xmm3, xmmword ptr [rcx + 2*rsi + 32] 32977 movdqu xmm4, xmmword ptr [rcx + 2*rsi + 48] 32978 pcmpeqw xmm3, xmm0 32979 pxor xmm3, xmm1 32980 packsswb xmm3, xmm3 32981 pand xmm3, xmm2 32982 pcmpeqw xmm4, xmm0 32983 pxor xmm4, xmm1 32984 packsswb xmm4, xmm4 32985 pand xmm4, xmm2 32986 punpcklqdq xmm3, xmm4 # xmm3 = xmm3[0],xmm4[0] 32987 movdqu xmmword ptr [r8 + rsi + 16], xmm3 32988 add rsi, 32 32989 add rdi, 2 32990 jne .LBB4_647 32991 jmp .LBB4_1306 32992 .LBB4_650: 32993 mov esi, r10d 32994 and esi, -16 32995 lea rax, [rsi - 16] 32996 mov r9, rax 32997 shr r9, 4 32998 add r9, 1 32999 test rax, rax 33000 je .LBB4_1313 33001 # %bb.651: 33002 mov rdi, r9 33003 and rdi, -2 33004 neg rdi 33005 xor eax, eax 33006 pxor xmm2, xmm2 33007 pcmpeqd xmm3, xmm3 33008 movdqa xmm4, xmmword ptr [rip + .LCPI4_21] # xmm4 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u> 33009 .LBB4_652: # 
=>This Inner Loop Header: Depth=1 33010 movdqu xmm5, xmmword ptr [rcx + 2*rax] 33011 movdqu xmm6, xmmword ptr [rcx + 2*rax + 16] 33012 movdqa xmm0, xmm5 33013 pcmpgtw xmm0, xmm2 33014 packsswb xmm0, xmm0 33015 movdqa xmm1, xmm6 33016 pcmpgtw xmm1, xmm2 33017 packsswb xmm1, xmm1 33018 pcmpeqw xmm5, xmm2 33019 pxor xmm5, xmm3 33020 packsswb xmm5, xmm5 33021 pcmpeqw xmm6, xmm2 33022 pxor xmm6, xmm3 33023 packsswb xmm6, xmm6 33024 pblendvb xmm5, xmm4, xmm0 33025 movdqa xmm0, xmm1 33026 pblendvb xmm6, xmm4, xmm0 33027 punpcklqdq xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 33028 movdqu xmmword ptr [r8 + rax], xmm5 33029 movdqu xmm5, xmmword ptr [rcx + 2*rax + 32] 33030 movdqu xmm6, xmmword ptr [rcx + 2*rax + 48] 33031 movdqa xmm0, xmm5 33032 pcmpgtw xmm0, xmm2 33033 packsswb xmm0, xmm0 33034 movdqa xmm1, xmm6 33035 pcmpgtw xmm1, xmm2 33036 packsswb xmm1, xmm1 33037 pcmpeqw xmm5, xmm2 33038 pxor xmm5, xmm3 33039 packsswb xmm5, xmm5 33040 pcmpeqw xmm6, xmm2 33041 pxor xmm6, xmm3 33042 packsswb xmm6, xmm6 33043 pblendvb xmm5, xmm4, xmm0 33044 movdqa xmm0, xmm1 33045 pblendvb xmm6, xmm4, xmm0 33046 punpcklqdq xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 33047 movdqu xmmword ptr [r8 + rax + 16], xmm5 33048 add rax, 32 33049 add rdi, 2 33050 jne .LBB4_652 33051 jmp .LBB4_1314 33052 .LBB4_655: 33053 mov esi, r10d 33054 and esi, -4 33055 lea rax, [rsi - 4] 33056 mov r9, rax 33057 shr r9, 2 33058 add r9, 1 33059 test rax, rax 33060 je .LBB4_1322 33061 # %bb.656: 33062 mov rdi, r9 33063 and rdi, -2 33064 neg rdi 33065 xor eax, eax 33066 pxor xmm2, xmm2 33067 pcmpeqd xmm3, xmm3 33068 movdqa xmm4, xmmword ptr [rip + .LCPI4_18] # xmm4 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 33069 .LBB4_657: # =>This Inner Loop Header: Depth=1 33070 movdqu xmm5, xmmword ptr [rcx + 8*rax] 33071 movdqu xmm6, xmmword ptr [rcx + 8*rax + 16] 33072 movdqa xmm0, xmm5 33073 pcmpgtq xmm0, xmm2 33074 packssdw xmm0, xmm0 33075 packssdw xmm0, xmm0 33076 packsswb xmm0, xmm0 33077 movdqa xmm1, xmm6 33078 pcmpgtq xmm1, xmm2 33079 
packssdw xmm1, xmm1 33080 packssdw xmm1, xmm1 33081 packsswb xmm1, xmm1 33082 pcmpeqq xmm5, xmm2 33083 pxor xmm5, xmm3 33084 packssdw xmm5, xmm5 33085 packssdw xmm5, xmm5 33086 packsswb xmm5, xmm5 33087 pcmpeqq xmm6, xmm2 33088 pxor xmm6, xmm3 33089 packssdw xmm6, xmm6 33090 packssdw xmm6, xmm6 33091 packsswb xmm6, xmm6 33092 pblendvb xmm5, xmm4, xmm0 33093 movdqa xmm0, xmm1 33094 pblendvb xmm6, xmm4, xmm0 33095 pextrw word ptr [r8 + rax], xmm5, 0 33096 pextrw word ptr [r8 + rax + 2], xmm6, 0 33097 movdqu xmm5, xmmword ptr [rcx + 8*rax + 32] 33098 movdqu xmm6, xmmword ptr [rcx + 8*rax + 48] 33099 movdqa xmm0, xmm5 33100 pcmpgtq xmm0, xmm2 33101 packssdw xmm0, xmm0 33102 packssdw xmm0, xmm0 33103 packsswb xmm0, xmm0 33104 movdqa xmm1, xmm6 33105 pcmpgtq xmm1, xmm2 33106 packssdw xmm1, xmm1 33107 packssdw xmm1, xmm1 33108 packsswb xmm1, xmm1 33109 pcmpeqq xmm5, xmm2 33110 pxor xmm5, xmm3 33111 packssdw xmm5, xmm5 33112 packssdw xmm5, xmm5 33113 packsswb xmm5, xmm5 33114 pcmpeqq xmm6, xmm2 33115 pxor xmm6, xmm3 33116 packssdw xmm6, xmm6 33117 packssdw xmm6, xmm6 33118 packsswb xmm6, xmm6 33119 pblendvb xmm5, xmm4, xmm0 33120 movdqa xmm0, xmm1 33121 pblendvb xmm6, xmm4, xmm0 33122 pextrw word ptr [r8 + rax + 4], xmm5, 0 33123 pextrw word ptr [r8 + rax + 6], xmm6, 0 33124 add rax, 8 33125 add rdi, 2 33126 jne .LBB4_657 33127 jmp .LBB4_1323 33128 .LBB4_660: 33129 mov edx, r10d 33130 and edx, -8 33131 lea rsi, [rdx - 8] 33132 mov r9, rsi 33133 shr r9, 3 33134 add r9, 1 33135 test rsi, rsi 33136 je .LBB4_1331 33137 # %bb.661: 33138 mov rdi, r9 33139 and rdi, -2 33140 neg rdi 33141 xor esi, esi 33142 xorps xmm4, xmm4 33143 pcmpeqd xmm8, xmm8 33144 movdqa xmm6, xmmword ptr [rip + .LCPI4_12] # xmm6 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u> 33145 .LBB4_662: # =>This Inner Loop Header: Depth=1 33146 movups xmm0, xmmword ptr [rcx + 4*rsi] 33147 movups xmm1, xmmword ptr [rcx + 4*rsi + 16] 33148 movaps xmm2, xmm0 33149 cmpeqps xmm2, xmm4 33150 packssdw xmm2, xmm2 33151 packsswb xmm2, 
xmm2 33152 movaps xmm3, xmm1 33153 cmpeqps xmm3, xmm4 33154 packssdw xmm3, xmm3 33155 packsswb xmm3, xmm3 33156 pcmpgtd xmm0, xmm8 33157 packssdw xmm0, xmm0 33158 packsswb xmm0, xmm0 33159 pcmpgtd xmm1, xmm8 33160 packssdw xmm1, xmm1 33161 packsswb xmm1, xmm1 33162 pcmpeqd xmm7, xmm7 33163 pblendvb xmm7, xmm6, xmm0 33164 pcmpeqd xmm5, xmm5 33165 movdqa xmm0, xmm1 33166 pblendvb xmm5, xmm6, xmm0 33167 movdqa xmm0, xmm2 33168 pblendvb xmm7, xmm4, xmm0 33169 movdqa xmm0, xmm3 33170 pblendvb xmm5, xmm4, xmm0 33171 movd dword ptr [r8 + rsi], xmm7 33172 movd dword ptr [r8 + rsi + 4], xmm5 33173 movups xmm0, xmmword ptr [rcx + 4*rsi + 32] 33174 movups xmm1, xmmword ptr [rcx + 4*rsi + 48] 33175 movaps xmm2, xmm0 33176 cmpeqps xmm2, xmm4 33177 packssdw xmm2, xmm2 33178 packsswb xmm2, xmm2 33179 movaps xmm3, xmm1 33180 cmpeqps xmm3, xmm4 33181 packssdw xmm3, xmm3 33182 packsswb xmm3, xmm3 33183 pcmpgtd xmm0, xmm8 33184 packssdw xmm0, xmm0 33185 packsswb xmm0, xmm0 33186 pcmpgtd xmm1, xmm8 33187 packssdw xmm1, xmm1 33188 pcmpeqd xmm5, xmm5 33189 pblendvb xmm5, xmm6, xmm0 33190 packsswb xmm1, xmm1 33191 pcmpeqd xmm7, xmm7 33192 movdqa xmm0, xmm1 33193 pblendvb xmm7, xmm6, xmm0 33194 movdqa xmm0, xmm2 33195 pblendvb xmm5, xmm4, xmm0 33196 movdqa xmm0, xmm3 33197 pblendvb xmm7, xmm4, xmm0 33198 movd dword ptr [r8 + rsi + 8], xmm5 33199 movd dword ptr [r8 + rsi + 12], xmm7 33200 add rsi, 16 33201 add rdi, 2 33202 jne .LBB4_662 33203 jmp .LBB4_1332 33204 .LBB4_665: 33205 mov edx, eax 33206 and edx, -32 33207 lea rsi, [rdx - 32] 33208 mov r9, rsi 33209 shr r9, 5 33210 add r9, 1 33211 test rsi, rsi 33212 je .LBB4_1340 33213 # %bb.666: 33214 mov rdi, r9 33215 and rdi, -2 33216 neg rdi 33217 xor esi, esi 33218 pxor xmm0, xmm0 33219 movdqa xmm1, xmmword ptr [rip + .LCPI4_22] # xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 33220 .LBB4_667: # =>This Inner Loop Header: Depth=1 33221 movdqu xmm2, xmmword ptr [rcx + rsi] 33222 movdqu xmm3, xmmword ptr [rcx + rsi + 16] 33223 pcmpeqb xmm2, xmm0 
33224 pandn xmm2, xmm1 33225 pcmpeqb xmm3, xmm0 33226 pandn xmm3, xmm1 33227 movdqu xmmword ptr [r8 + rsi], xmm2 33228 movdqu xmmword ptr [r8 + rsi + 16], xmm3 33229 movdqu xmm2, xmmword ptr [rcx + rsi + 32] 33230 movdqu xmm3, xmmword ptr [rcx + rsi + 48] 33231 pcmpeqb xmm2, xmm0 33232 pandn xmm2, xmm1 33233 pcmpeqb xmm3, xmm0 33234 pandn xmm3, xmm1 33235 movdqu xmmword ptr [r8 + rsi + 32], xmm2 33236 movdqu xmmword ptr [r8 + rsi + 48], xmm3 33237 add rsi, 64 33238 add rdi, 2 33239 jne .LBB4_667 33240 jmp .LBB4_1341 33241 .LBB4_670: 33242 mov esi, r10d 33243 and esi, -8 33244 lea rax, [rsi - 8] 33245 mov r9, rax 33246 shr r9, 3 33247 add r9, 1 33248 test rax, rax 33249 je .LBB4_1348 33250 # %bb.671: 33251 mov rdi, r9 33252 and rdi, -2 33253 neg rdi 33254 xor eax, eax 33255 pxor xmm2, xmm2 33256 pcmpeqd xmm3, xmm3 33257 movdqa xmm4, xmmword ptr [rip + .LCPI4_12] # xmm4 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u> 33258 .LBB4_672: # =>This Inner Loop Header: Depth=1 33259 movdqu xmm5, xmmword ptr [rcx + 4*rax] 33260 movdqu xmm6, xmmword ptr [rcx + 4*rax + 16] 33261 movdqa xmm0, xmm5 33262 pcmpgtd xmm0, xmm2 33263 packssdw xmm0, xmm0 33264 packsswb xmm0, xmm0 33265 movdqa xmm1, xmm6 33266 pcmpgtd xmm1, xmm2 33267 packssdw xmm1, xmm1 33268 packsswb xmm1, xmm1 33269 pcmpeqd xmm5, xmm2 33270 pxor xmm5, xmm3 33271 packssdw xmm5, xmm5 33272 packsswb xmm5, xmm5 33273 pcmpeqd xmm6, xmm2 33274 pxor xmm6, xmm3 33275 packssdw xmm6, xmm6 33276 packsswb xmm6, xmm6 33277 pblendvb xmm5, xmm4, xmm0 33278 movdqa xmm0, xmm1 33279 pblendvb xmm6, xmm4, xmm0 33280 movd dword ptr [r8 + rax], xmm5 33281 movd dword ptr [r8 + rax + 4], xmm6 33282 movdqu xmm5, xmmword ptr [rcx + 4*rax + 32] 33283 movdqu xmm6, xmmword ptr [rcx + 4*rax + 48] 33284 movdqa xmm0, xmm5 33285 pcmpgtd xmm0, xmm2 33286 packssdw xmm0, xmm0 33287 packsswb xmm0, xmm0 33288 movdqa xmm1, xmm6 33289 pcmpgtd xmm1, xmm2 33290 packssdw xmm1, xmm1 33291 packsswb xmm1, xmm1 33292 pcmpeqd xmm5, xmm2 33293 pxor xmm5, xmm3 33294 packssdw 
xmm5, xmm5 33295 packsswb xmm5, xmm5 33296 pcmpeqd xmm6, xmm2 33297 pxor xmm6, xmm3 33298 packssdw xmm6, xmm6 33299 packsswb xmm6, xmm6 33300 pblendvb xmm5, xmm4, xmm0 33301 movdqa xmm0, xmm1 33302 pblendvb xmm6, xmm4, xmm0 33303 movd dword ptr [r8 + rax + 8], xmm5 33304 movd dword ptr [r8 + rax + 12], xmm6 33305 add rax, 16 33306 add rdi, 2 33307 jne .LBB4_672 33308 jmp .LBB4_1349 33309 .LBB4_681: 33310 mov edx, r10d 33311 and edx, -4 33312 lea rsi, [rdx - 4] 33313 mov r9, rsi 33314 shr r9, 2 33315 add r9, 1 33316 test rsi, rsi 33317 je .LBB4_1357 33318 # %bb.682: 33319 mov rdi, r9 33320 and rdi, -2 33321 neg rdi 33322 xor esi, esi 33323 pxor xmm2, xmm2 33324 pcmpeqd xmm3, xmm3 33325 movapd xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 33326 .LBB4_683: # =>This Inner Loop Header: Depth=1 33327 movzx eax, word ptr [rcx + rsi] 33328 movd xmm5, eax 33329 movzx eax, word ptr [rcx + rsi + 2] 33330 movd xmm6, eax 33331 movdqa xmm0, xmm5 33332 pcmpgtb xmm0, xmm2 33333 pmovsxbq xmm0, xmm0 33334 movdqa xmm1, xmm6 33335 pcmpgtb xmm1, xmm2 33336 pmovsxbq xmm1, xmm1 33337 pcmpeqb xmm5, xmm2 33338 pxor xmm5, xmm3 33339 pmovsxbq xmm5, xmm5 33340 pcmpeqb xmm6, xmm2 33341 pxor xmm6, xmm3 33342 pmovsxbq xmm6, xmm6 33343 blendvpd xmm5, xmm4, xmm0 33344 movdqa xmm0, xmm1 33345 blendvpd xmm6, xmm4, xmm0 33346 movupd xmmword ptr [r8 + 8*rsi], xmm5 33347 movupd xmmword ptr [r8 + 8*rsi + 16], xmm6 33348 movzx eax, word ptr [rcx + rsi + 4] 33349 movd xmm5, eax 33350 movzx eax, word ptr [rcx + rsi + 6] 33351 movd xmm6, eax 33352 movdqa xmm0, xmm5 33353 pcmpgtb xmm0, xmm2 33354 pmovsxbq xmm0, xmm0 33355 movdqa xmm1, xmm6 33356 pcmpgtb xmm1, xmm2 33357 pmovsxbq xmm1, xmm1 33358 pcmpeqb xmm5, xmm2 33359 pxor xmm5, xmm3 33360 pmovsxbq xmm5, xmm5 33361 pcmpeqb xmm6, xmm2 33362 pxor xmm6, xmm3 33363 pmovsxbq xmm6, xmm6 33364 blendvpd xmm5, xmm4, xmm0 33365 movdqa xmm0, xmm1 33366 blendvpd xmm6, xmm4, xmm0 33367 movupd xmmword ptr [r8 + 8*rsi + 32], xmm5 33368 movupd xmmword ptr [r8 + 8*rsi 
+ 48], xmm6 33369 add rsi, 8 33370 add rdi, 2 33371 jne .LBB4_683 33372 jmp .LBB4_1358 33373 .LBB4_686: 33374 mov edx, r10d 33375 and edx, -4 33376 lea rsi, [rdx - 4] 33377 mov r9, rsi 33378 shr r9, 2 33379 add r9, 1 33380 test rsi, rsi 33381 je .LBB4_1366 33382 # %bb.687: 33383 mov rdi, r9 33384 and rdi, -2 33385 neg rdi 33386 xor esi, esi 33387 pxor xmm0, xmm0 33388 movdqa xmm1, xmmword ptr [rip + .LCPI4_15] # xmm1 = [1,1] 33389 .LBB4_688: # =>This Inner Loop Header: Depth=1 33390 movdqu xmm2, xmmword ptr [rcx + 8*rsi] 33391 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 16] 33392 pcmpeqq xmm2, xmm0 33393 pandn xmm2, xmm1 33394 pcmpeqq xmm3, xmm0 33395 pandn xmm3, xmm1 33396 movdqu xmmword ptr [r8 + 8*rsi], xmm2 33397 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm3 33398 movdqu xmm2, xmmword ptr [rcx + 8*rsi + 32] 33399 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 48] 33400 pcmpeqq xmm2, xmm0 33401 pandn xmm2, xmm1 33402 pcmpeqq xmm3, xmm0 33403 pandn xmm3, xmm1 33404 movdqu xmmword ptr [r8 + 8*rsi + 32], xmm2 33405 movdqu xmmword ptr [r8 + 8*rsi + 48], xmm3 33406 add rsi, 8 33407 add rdi, 2 33408 jne .LBB4_688 33409 jmp .LBB4_1367 33410 .LBB4_697: 33411 mov edx, r11d 33412 and edx, -4 33413 lea rsi, [rdx - 4] 33414 mov r9, rsi 33415 shr r9, 2 33416 add r9, 1 33417 test rsi, rsi 33418 je .LBB4_1374 33419 # %bb.698: 33420 mov rdi, r9 33421 and rdi, -2 33422 neg rdi 33423 xor esi, esi 33424 pxor xmm2, xmm2 33425 pcmpeqd xmm3, xmm3 33426 movdqa xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 33427 .LBB4_699: # =>This Inner Loop Header: Depth=1 33428 movdqu xmm5, xmmword ptr [rcx + 8*rsi] 33429 movdqu xmm6, xmmword ptr [rcx + 8*rsi + 16] 33430 movdqa xmm0, xmm4 33431 pcmpgtq xmm0, xmm5 33432 pcmpeqq xmm5, xmm2 33433 pxor xmm5, xmm3 33434 movdqa xmm1, xmm4 33435 pcmpgtq xmm1, xmm6 33436 pcmpeqq xmm6, xmm2 33437 pxor xmm6, xmm3 33438 movdqa xmm7, xmm4 33439 blendvpd xmm7, xmm5, xmm0 33440 movdqa xmm5, xmm4 33441 movdqa xmm0, xmm1 33442 blendvpd xmm5, xmm6, xmm0 33443 movupd xmmword 
ptr [r8 + 8*rsi], xmm7 33444 movupd xmmword ptr [r8 + 8*rsi + 16], xmm5 33445 movdqu xmm5, xmmword ptr [rcx + 8*rsi + 32] 33446 movdqu xmm6, xmmword ptr [rcx + 8*rsi + 48] 33447 movdqa xmm0, xmm4 33448 pcmpgtq xmm0, xmm5 33449 pcmpeqq xmm5, xmm2 33450 pxor xmm5, xmm3 33451 movdqa xmm1, xmm4 33452 pcmpgtq xmm1, xmm6 33453 pcmpeqq xmm6, xmm2 33454 pxor xmm6, xmm3 33455 movdqa xmm7, xmm4 33456 blendvpd xmm7, xmm5, xmm0 33457 movdqa xmm5, xmm4 33458 movdqa xmm0, xmm1 33459 blendvpd xmm5, xmm6, xmm0 33460 movupd xmmword ptr [r8 + 8*rsi + 32], xmm7 33461 movupd xmmword ptr [r8 + 8*rsi + 48], xmm5 33462 add rsi, 8 33463 add rdi, 2 33464 jne .LBB4_699 33465 jmp .LBB4_1375 33466 .LBB4_710: 33467 mov edx, r10d 33468 and edx, -4 33469 lea rsi, [rdx - 4] 33470 mov r9, rsi 33471 shr r9, 2 33472 add r9, 1 33473 test rsi, rsi 33474 je .LBB4_1383 33475 # %bb.711: 33476 mov rdi, r9 33477 and rdi, -2 33478 neg rdi 33479 xor esi, esi 33480 pxor xmm0, xmm0 33481 pcmpeqd xmm1, xmm1 33482 movdqa xmm2, xmmword ptr [rip + .LCPI4_15] # xmm2 = [1,1] 33483 .LBB4_712: # =>This Inner Loop Header: Depth=1 33484 movzx eax, word ptr [rcx + rsi] 33485 movd xmm3, eax 33486 movzx eax, word ptr [rcx + rsi + 2] 33487 movd xmm4, eax 33488 pcmpeqb xmm3, xmm0 33489 pxor xmm3, xmm1 33490 pmovzxbq xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 33491 pand xmm3, xmm2 33492 pcmpeqb xmm4, xmm0 33493 pxor xmm4, xmm1 33494 pmovzxbq xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero 33495 pand xmm4, xmm2 33496 movdqu xmmword ptr [r8 + 8*rsi], xmm3 33497 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm4 33498 movzx eax, word ptr [rcx + rsi + 4] 33499 movd xmm3, eax 33500 movzx eax, word ptr [rcx + rsi + 6] 33501 movd xmm4, eax 33502 pcmpeqb xmm3, xmm0 33503 pxor xmm3, xmm1 33504 pmovzxbq xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 33505 pand xmm3, 
xmm2 33506 pcmpeqb xmm4, xmm0 33507 pxor xmm4, xmm1 33508 pmovzxbq xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero 33509 pand xmm4, xmm2 33510 movdqu xmmword ptr [r8 + 8*rsi + 32], xmm3 33511 movdqu xmmword ptr [r8 + 8*rsi + 48], xmm4 33512 add rsi, 8 33513 add rdi, 2 33514 jne .LBB4_712 33515 jmp .LBB4_1384 33516 .LBB4_730: 33517 mov edx, r10d 33518 and edx, -16 33519 lea rsi, [rdx - 16] 33520 mov r9, rsi 33521 shr r9, 4 33522 add r9, 1 33523 test rsi, rsi 33524 je .LBB4_1391 33525 # %bb.731: 33526 mov rdi, r9 33527 and rdi, -2 33528 neg rdi 33529 xor esi, esi 33530 pxor xmm2, xmm2 33531 pcmpeqd xmm3, xmm3 33532 movdqa xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1] 33533 .LBB4_732: # =>This Inner Loop Header: Depth=1 33534 movq xmm5, qword ptr [rcx + rsi] # xmm5 = mem[0],zero 33535 movq xmm6, qword ptr [rcx + rsi + 8] # xmm6 = mem[0],zero 33536 movdqa xmm0, xmm5 33537 pcmpgtb xmm0, xmm2 33538 pmovsxbw xmm0, xmm0 33539 movdqa xmm1, xmm6 33540 pcmpgtb xmm1, xmm2 33541 pmovsxbw xmm1, xmm1 33542 pcmpeqb xmm5, xmm2 33543 pxor xmm5, xmm3 33544 pmovsxbw xmm5, xmm5 33545 pcmpeqb xmm6, xmm2 33546 pxor xmm6, xmm3 33547 pmovsxbw xmm6, xmm6 33548 pblendvb xmm5, xmm4, xmm0 33549 movdqa xmm0, xmm1 33550 pblendvb xmm6, xmm4, xmm0 33551 movdqu xmmword ptr [r8 + 2*rsi], xmm5 33552 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm6 33553 movq xmm5, qword ptr [rcx + rsi + 16] # xmm5 = mem[0],zero 33554 movq xmm6, qword ptr [rcx + rsi + 24] # xmm6 = mem[0],zero 33555 movdqa xmm0, xmm5 33556 pcmpgtb xmm0, xmm2 33557 pmovsxbw xmm0, xmm0 33558 movdqa xmm1, xmm6 33559 pcmpgtb xmm1, xmm2 33560 pmovsxbw xmm1, xmm1 33561 pcmpeqb xmm5, xmm2 33562 pxor xmm5, xmm3 33563 pmovsxbw xmm5, xmm5 33564 pcmpeqb xmm6, xmm2 33565 pxor xmm6, xmm3 33566 pmovsxbw xmm6, xmm6 33567 pblendvb xmm5, xmm4, xmm0 33568 movdqa xmm0, xmm1 33569 pblendvb xmm6, xmm4, xmm0 33570 movdqu xmmword ptr [r8 + 2*rsi + 32], xmm5 33571 movdqu xmmword ptr [r8 + 
2*rsi + 48], xmm6 33572 add rsi, 32 33573 add rdi, 2 33574 jne .LBB4_732 33575 jmp .LBB4_1392 33576 .LBB4_735: 33577 mov edx, r10d 33578 and edx, -16 33579 lea rsi, [rdx - 16] 33580 mov r9, rsi 33581 shr r9, 4 33582 add r9, 1 33583 test rsi, rsi 33584 je .LBB4_1400 33585 # %bb.736: 33586 mov rdi, r9 33587 and rdi, -2 33588 neg rdi 33589 xor esi, esi 33590 pxor xmm2, xmm2 33591 pcmpeqd xmm3, xmm3 33592 movdqa xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1] 33593 .LBB4_737: # =>This Inner Loop Header: Depth=1 33594 movq xmm5, qword ptr [rcx + rsi] # xmm5 = mem[0],zero 33595 movq xmm6, qword ptr [rcx + rsi + 8] # xmm6 = mem[0],zero 33596 movdqa xmm0, xmm5 33597 pcmpgtb xmm0, xmm2 33598 pmovsxbw xmm0, xmm0 33599 movdqa xmm1, xmm6 33600 pcmpgtb xmm1, xmm2 33601 pmovsxbw xmm1, xmm1 33602 pcmpeqb xmm5, xmm2 33603 pxor xmm5, xmm3 33604 pmovsxbw xmm5, xmm5 33605 pcmpeqb xmm6, xmm2 33606 pxor xmm6, xmm3 33607 pmovsxbw xmm6, xmm6 33608 pblendvb xmm5, xmm4, xmm0 33609 movdqa xmm0, xmm1 33610 pblendvb xmm6, xmm4, xmm0 33611 movdqu xmmword ptr [r8 + 2*rsi], xmm5 33612 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm6 33613 movq xmm5, qword ptr [rcx + rsi + 16] # xmm5 = mem[0],zero 33614 movq xmm6, qword ptr [rcx + rsi + 24] # xmm6 = mem[0],zero 33615 movdqa xmm0, xmm5 33616 pcmpgtb xmm0, xmm2 33617 pmovsxbw xmm0, xmm0 33618 movdqa xmm1, xmm6 33619 pcmpgtb xmm1, xmm2 33620 pmovsxbw xmm1, xmm1 33621 pcmpeqb xmm5, xmm2 33622 pxor xmm5, xmm3 33623 pmovsxbw xmm5, xmm5 33624 pcmpeqb xmm6, xmm2 33625 pxor xmm6, xmm3 33626 pmovsxbw xmm6, xmm6 33627 pblendvb xmm5, xmm4, xmm0 33628 movdqa xmm0, xmm1 33629 pblendvb xmm6, xmm4, xmm0 33630 movdqu xmmword ptr [r8 + 2*rsi + 32], xmm5 33631 movdqu xmmword ptr [r8 + 2*rsi + 48], xmm6 33632 add rsi, 32 33633 add rdi, 2 33634 jne .LBB4_737 33635 jmp .LBB4_1401 33636 .LBB4_746: 33637 mov edx, r10d 33638 and edx, -16 33639 lea rsi, [rdx - 16] 33640 mov r9, rsi 33641 shr r9, 4 33642 add r9, 1 33643 test rsi, rsi 33644 je .LBB4_1409 33645 # 
%bb.747: 33646 mov rdi, r9 33647 and rdi, -2 33648 neg rdi 33649 xor esi, esi 33650 pxor xmm0, xmm0 33651 movdqa xmm1, xmmword ptr [rip + .LCPI4_20] # xmm1 = [1,1,1,1,1,1,1,1] 33652 .LBB4_748: # =>This Inner Loop Header: Depth=1 33653 movdqu xmm2, xmmword ptr [rcx + 2*rsi] 33654 movdqu xmm3, xmmword ptr [rcx + 2*rsi + 16] 33655 pcmpeqw xmm2, xmm0 33656 pandn xmm2, xmm1 33657 pcmpeqw xmm3, xmm0 33658 pandn xmm3, xmm1 33659 movdqu xmmword ptr [r8 + 2*rsi], xmm2 33660 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm3 33661 movdqu xmm2, xmmword ptr [rcx + 2*rsi + 32] 33662 movdqu xmm3, xmmword ptr [rcx + 2*rsi + 48] 33663 pcmpeqw xmm2, xmm0 33664 pandn xmm2, xmm1 33665 pcmpeqw xmm3, xmm0 33666 pandn xmm3, xmm1 33667 movdqu xmmword ptr [r8 + 2*rsi + 32], xmm2 33668 movdqu xmmword ptr [r8 + 2*rsi + 48], xmm3 33669 add rsi, 32 33670 add rdi, 2 33671 jne .LBB4_748 33672 jmp .LBB4_1410 33673 .LBB4_751: 33674 mov edx, r10d 33675 and edx, -16 33676 lea rsi, [rdx - 16] 33677 mov r9, rsi 33678 shr r9, 4 33679 add r9, 1 33680 test rsi, rsi 33681 je .LBB4_1417 33682 # %bb.752: 33683 mov rdi, r9 33684 and rdi, -2 33685 neg rdi 33686 xor esi, esi 33687 pxor xmm0, xmm0 33688 movdqa xmm1, xmmword ptr [rip + .LCPI4_20] # xmm1 = [1,1,1,1,1,1,1,1] 33689 .LBB4_753: # =>This Inner Loop Header: Depth=1 33690 movdqu xmm2, xmmword ptr [rcx + 2*rsi] 33691 movdqu xmm3, xmmword ptr [rcx + 2*rsi + 16] 33692 pcmpeqw xmm2, xmm0 33693 pandn xmm2, xmm1 33694 pcmpeqw xmm3, xmm0 33695 pandn xmm3, xmm1 33696 movdqu xmmword ptr [r8 + 2*rsi], xmm2 33697 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm3 33698 movdqu xmm2, xmmword ptr [rcx + 2*rsi + 32] 33699 movdqu xmm3, xmmword ptr [rcx + 2*rsi + 48] 33700 pcmpeqw xmm2, xmm0 33701 pandn xmm2, xmm1 33702 pcmpeqw xmm3, xmm0 33703 pandn xmm3, xmm1 33704 movdqu xmmword ptr [r8 + 2*rsi + 32], xmm2 33705 movdqu xmmword ptr [r8 + 2*rsi + 48], xmm3 33706 add rsi, 32 33707 add rdi, 2 33708 jne .LBB4_753 33709 jmp .LBB4_1418 33710 .LBB4_756: 33711 mov edx, r11d 33712 and edx, 
-16 33713 lea rsi, [rdx - 16] 33714 mov r9, rsi 33715 shr r9, 4 33716 add r9, 1 33717 test rsi, rsi 33718 je .LBB4_1425 33719 # %bb.757: 33720 mov rdi, r9 33721 and rdi, -2 33722 neg rdi 33723 xor esi, esi 33724 pxor xmm2, xmm2 33725 pcmpeqd xmm3, xmm3 33726 movdqa xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1] 33727 .LBB4_758: # =>This Inner Loop Header: Depth=1 33728 movdqu xmm5, xmmword ptr [rcx + 2*rsi] 33729 movdqu xmm6, xmmword ptr [rcx + 2*rsi + 16] 33730 movdqa xmm0, xmm4 33731 pcmpgtw xmm0, xmm5 33732 pcmpeqw xmm5, xmm2 33733 pxor xmm5, xmm3 33734 movdqa xmm1, xmm4 33735 pcmpgtw xmm1, xmm6 33736 pcmpeqw xmm6, xmm2 33737 pxor xmm6, xmm3 33738 movdqa xmm7, xmm4 33739 pblendvb xmm7, xmm5, xmm0 33740 movdqa xmm5, xmm4 33741 movdqa xmm0, xmm1 33742 pblendvb xmm5, xmm6, xmm0 33743 movdqu xmmword ptr [r8 + 2*rsi], xmm7 33744 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm5 33745 movdqu xmm5, xmmword ptr [rcx + 2*rsi + 32] 33746 movdqu xmm6, xmmword ptr [rcx + 2*rsi + 48] 33747 movdqa xmm0, xmm4 33748 pcmpgtw xmm0, xmm5 33749 pcmpeqw xmm5, xmm2 33750 pxor xmm5, xmm3 33751 movdqa xmm1, xmm4 33752 pcmpgtw xmm1, xmm6 33753 pcmpeqw xmm6, xmm2 33754 pxor xmm6, xmm3 33755 movdqa xmm7, xmm4 33756 pblendvb xmm7, xmm5, xmm0 33757 movdqa xmm5, xmm4 33758 movdqa xmm0, xmm1 33759 pblendvb xmm5, xmm6, xmm0 33760 movdqu xmmword ptr [r8 + 2*rsi + 32], xmm7 33761 movdqu xmmword ptr [r8 + 2*rsi + 48], xmm5 33762 add rsi, 32 33763 add rdi, 2 33764 jne .LBB4_758 33765 jmp .LBB4_1426 33766 .LBB4_761: 33767 mov edx, r11d 33768 and edx, -16 33769 lea rsi, [rdx - 16] 33770 mov r9, rsi 33771 shr r9, 4 33772 add r9, 1 33773 test rsi, rsi 33774 je .LBB4_1434 33775 # %bb.762: 33776 mov rdi, r9 33777 and rdi, -2 33778 neg rdi 33779 xor esi, esi 33780 pxor xmm2, xmm2 33781 pcmpeqd xmm3, xmm3 33782 movdqa xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1] 33783 .LBB4_763: # =>This Inner Loop Header: Depth=1 33784 movdqu xmm5, xmmword ptr [rcx + 2*rsi] 33785 movdqu xmm6, 
xmmword ptr [rcx + 2*rsi + 16] 33786 movdqa xmm0, xmm4 33787 pcmpgtw xmm0, xmm5 33788 pcmpeqw xmm5, xmm2 33789 pxor xmm5, xmm3 33790 movdqa xmm1, xmm4 33791 pcmpgtw xmm1, xmm6 33792 pcmpeqw xmm6, xmm2 33793 pxor xmm6, xmm3 33794 movdqa xmm7, xmm4 33795 pblendvb xmm7, xmm5, xmm0 33796 movdqa xmm5, xmm4 33797 movdqa xmm0, xmm1 33798 pblendvb xmm5, xmm6, xmm0 33799 movdqu xmmword ptr [r8 + 2*rsi], xmm7 33800 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm5 33801 movdqu xmm5, xmmword ptr [rcx + 2*rsi + 32] 33802 movdqu xmm6, xmmword ptr [rcx + 2*rsi + 48] 33803 movdqa xmm0, xmm4 33804 pcmpgtw xmm0, xmm5 33805 pcmpeqw xmm5, xmm2 33806 pxor xmm5, xmm3 33807 movdqa xmm1, xmm4 33808 pcmpgtw xmm1, xmm6 33809 pcmpeqw xmm6, xmm2 33810 pxor xmm6, xmm3 33811 movdqa xmm7, xmm4 33812 pblendvb xmm7, xmm5, xmm0 33813 movdqa xmm5, xmm4 33814 movdqa xmm0, xmm1 33815 pblendvb xmm5, xmm6, xmm0 33816 movdqu xmmword ptr [r8 + 2*rsi + 32], xmm7 33817 movdqu xmmword ptr [r8 + 2*rsi + 48], xmm5 33818 add rsi, 32 33819 add rdi, 2 33820 jne .LBB4_763 33821 jmp .LBB4_1435 33822 .LBB4_778: 33823 mov edx, r10d 33824 and edx, -16 33825 lea rsi, [rdx - 16] 33826 mov r9, rsi 33827 shr r9, 4 33828 add r9, 1 33829 test rsi, rsi 33830 je .LBB4_1443 33831 # %bb.779: 33832 mov rdi, r9 33833 and rdi, -2 33834 neg rdi 33835 xor esi, esi 33836 pxor xmm0, xmm0 33837 pcmpeqd xmm1, xmm1 33838 movdqa xmm2, xmmword ptr [rip + .LCPI4_20] # xmm2 = [1,1,1,1,1,1,1,1] 33839 .LBB4_780: # =>This Inner Loop Header: Depth=1 33840 movq xmm3, qword ptr [rcx + rsi] # xmm3 = mem[0],zero 33841 movq xmm4, qword ptr [rcx + rsi + 8] # xmm4 = mem[0],zero 33842 pcmpeqb xmm3, xmm0 33843 pxor xmm3, xmm1 33844 pmovzxbw xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 33845 pand xmm3, xmm2 33846 pcmpeqb xmm4, xmm0 33847 pxor xmm4, xmm1 33848 pmovzxbw xmm4, xmm4 # xmm4 = 
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 33849 pand xmm4, xmm2 33850 movdqu xmmword ptr [r8 + 2*rsi], xmm3 33851 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm4 33852 movq xmm3, qword ptr [rcx + rsi + 16] # xmm3 = mem[0],zero 33853 movq xmm4, qword ptr [rcx + rsi + 24] # xmm4 = mem[0],zero 33854 pcmpeqb xmm3, xmm0 33855 pxor xmm3, xmm1 33856 pmovzxbw xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 33857 pand xmm3, xmm2 33858 pcmpeqb xmm4, xmm0 33859 pxor xmm4, xmm1 33860 pmovzxbw xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 33861 pand xmm4, xmm2 33862 movdqu xmmword ptr [r8 + 2*rsi + 32], xmm3 33863 movdqu xmmword ptr [r8 + 2*rsi + 48], xmm4 33864 add rsi, 32 33865 add rdi, 2 33866 jne .LBB4_780 33867 jmp .LBB4_1444 33868 .LBB4_783: 33869 mov edx, r10d 33870 and edx, -16 33871 lea rsi, [rdx - 16] 33872 mov r9, rsi 33873 shr r9, 4 33874 add r9, 1 33875 test rsi, rsi 33876 je .LBB4_1451 33877 # %bb.784: 33878 mov rdi, r9 33879 and rdi, -2 33880 neg rdi 33881 xor esi, esi 33882 pxor xmm0, xmm0 33883 pcmpeqd xmm1, xmm1 33884 movdqa xmm2, xmmword ptr [rip + .LCPI4_20] # xmm2 = [1,1,1,1,1,1,1,1] 33885 .LBB4_785: # =>This Inner Loop Header: Depth=1 33886 movq xmm3, qword ptr [rcx + rsi] # xmm3 = mem[0],zero 33887 movq xmm4, qword ptr [rcx + rsi + 8] # xmm4 = mem[0],zero 33888 pcmpeqb xmm3, xmm0 33889 pxor xmm3, xmm1 33890 pmovzxbw xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 33891 pand xmm3, xmm2 33892 pcmpeqb xmm4, xmm0 33893 pxor xmm4, xmm1 33894 pmovzxbw xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 33895 pand xmm4, xmm2 33896 movdqu xmmword ptr [r8 + 2*rsi], xmm3 33897 movdqu xmmword ptr [r8 + 2*rsi + 
16], xmm4 33898 movq xmm3, qword ptr [rcx + rsi + 16] # xmm3 = mem[0],zero 33899 movq xmm4, qword ptr [rcx + rsi + 24] # xmm4 = mem[0],zero 33900 pcmpeqb xmm3, xmm0 33901 pxor xmm3, xmm1 33902 pmovzxbw xmm3, xmm3 # xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 33903 pand xmm3, xmm2 33904 pcmpeqb xmm4, xmm0 33905 pxor xmm4, xmm1 33906 pmovzxbw xmm4, xmm4 # xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 33907 pand xmm4, xmm2 33908 movdqu xmmword ptr [r8 + 2*rsi + 32], xmm3 33909 movdqu xmmword ptr [r8 + 2*rsi + 48], xmm4 33910 add rsi, 32 33911 add rdi, 2 33912 jne .LBB4_785 33913 jmp .LBB4_1452 33914 .LBB4_806: 33915 mov edx, r10d 33916 and edx, -4 33917 lea rsi, [rdx - 4] 33918 mov r9, rsi 33919 shr r9, 2 33920 add r9, 1 33921 test rsi, rsi 33922 je .LBB4_1459 33923 # %bb.807: 33924 mov rdi, r9 33925 and rdi, -2 33926 neg rdi 33927 xor esi, esi 33928 pxor xmm2, xmm2 33929 pcmpeqd xmm3, xmm3 33930 movapd xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 33931 .LBB4_808: # =>This Inner Loop Header: Depth=1 33932 movzx eax, word ptr [rcx + rsi] 33933 movd xmm5, eax 33934 movzx eax, word ptr [rcx + rsi + 2] 33935 movd xmm6, eax 33936 movdqa xmm0, xmm5 33937 pcmpgtb xmm0, xmm2 33938 pmovsxbq xmm0, xmm0 33939 movdqa xmm1, xmm6 33940 pcmpgtb xmm1, xmm2 33941 pmovsxbq xmm1, xmm1 33942 pcmpeqb xmm5, xmm2 33943 pxor xmm5, xmm3 33944 pmovsxbq xmm5, xmm5 33945 pcmpeqb xmm6, xmm2 33946 pxor xmm6, xmm3 33947 pmovsxbq xmm6, xmm6 33948 blendvpd xmm5, xmm4, xmm0 33949 movdqa xmm0, xmm1 33950 blendvpd xmm6, xmm4, xmm0 33951 movupd xmmword ptr [r8 + 8*rsi], xmm5 33952 movupd xmmword ptr [r8 + 8*rsi + 16], xmm6 33953 movzx eax, word ptr [rcx + rsi + 4] 33954 movd xmm5, eax 33955 movzx eax, word ptr [rcx + rsi + 6] 33956 movd xmm6, eax 33957 movdqa xmm0, xmm5 33958 pcmpgtb xmm0, xmm2 33959 pmovsxbq xmm0, xmm0 33960 movdqa xmm1, xmm6 33961 pcmpgtb xmm1, xmm2 
33962 pmovsxbq xmm1, xmm1 33963 pcmpeqb xmm5, xmm2 33964 pxor xmm5, xmm3 33965 pmovsxbq xmm5, xmm5 33966 pcmpeqb xmm6, xmm2 33967 pxor xmm6, xmm3 33968 pmovsxbq xmm6, xmm6 33969 blendvpd xmm5, xmm4, xmm0 33970 movdqa xmm0, xmm1 33971 blendvpd xmm6, xmm4, xmm0 33972 movupd xmmword ptr [r8 + 8*rsi + 32], xmm5 33973 movupd xmmword ptr [r8 + 8*rsi + 48], xmm6 33974 add rsi, 8 33975 add rdi, 2 33976 jne .LBB4_808 33977 jmp .LBB4_1460 33978 .LBB4_811: 33979 mov edx, eax 33980 and edx, -8 33981 lea rsi, [rdx - 8] 33982 mov r9, rsi 33983 shr r9, 3 33984 add r9, 1 33985 test rsi, rsi 33986 je .LBB4_1468 33987 # %bb.812: 33988 mov rdi, r9 33989 and rdi, -2 33990 neg rdi 33991 xor esi, esi 33992 pxor xmm2, xmm2 33993 pcmpeqd xmm3, xmm3 33994 movaps xmm4, xmmword ptr [rip + .LCPI4_19] # xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 33995 .LBB4_813: # =>This Inner Loop Header: Depth=1 33996 movd xmm5, dword ptr [rcx + rsi] # xmm5 = mem[0],zero,zero,zero 33997 movd xmm6, dword ptr [rcx + rsi + 4] # xmm6 = mem[0],zero,zero,zero 33998 movdqa xmm0, xmm5 33999 pcmpgtb xmm0, xmm2 34000 pmovsxbd xmm0, xmm0 34001 movdqa xmm1, xmm6 34002 pcmpgtb xmm1, xmm2 34003 pmovsxbd xmm1, xmm1 34004 pcmpeqb xmm5, xmm2 34005 pxor xmm5, xmm3 34006 pmovsxbd xmm5, xmm5 34007 cvtdq2ps xmm5, xmm5 34008 pcmpeqb xmm6, xmm2 34009 pxor xmm6, xmm3 34010 pmovsxbd xmm6, xmm6 34011 cvtdq2ps xmm6, xmm6 34012 blendvps xmm5, xmm4, xmm0 34013 movdqa xmm0, xmm1 34014 blendvps xmm6, xmm4, xmm0 34015 movups xmmword ptr [r8 + 4*rsi], xmm5 34016 movups xmmword ptr [r8 + 4*rsi + 16], xmm6 34017 movd xmm5, dword ptr [rcx + rsi + 8] # xmm5 = mem[0],zero,zero,zero 34018 movd xmm6, dword ptr [rcx + rsi + 12] # xmm6 = mem[0],zero,zero,zero 34019 movdqa xmm0, xmm5 34020 pcmpgtb xmm0, xmm2 34021 pmovsxbd xmm0, xmm0 34022 movdqa xmm1, xmm6 34023 pcmpgtb xmm1, xmm2 34024 pmovsxbd xmm1, xmm1 34025 pcmpeqb xmm5, xmm2 34026 pxor xmm5, xmm3 34027 pmovsxbd xmm5, xmm5 34028 cvtdq2ps xmm5, xmm5 34029 pcmpeqb xmm6, xmm2 34030 pxor xmm6, xmm3 34031 
pmovsxbd xmm6, xmm6 34032 cvtdq2ps xmm6, xmm6 34033 blendvps xmm5, xmm4, xmm0 34034 movdqa xmm0, xmm1 34035 blendvps xmm6, xmm4, xmm0 34036 movups xmmword ptr [r8 + 4*rsi + 32], xmm5 34037 movups xmmword ptr [r8 + 4*rsi + 48], xmm6 34038 add rsi, 16 34039 add rdi, 2 34040 jne .LBB4_813 34041 jmp .LBB4_1469 34042 .LBB4_816: 34043 mov edx, r10d 34044 and edx, -4 34045 lea rsi, [rdx - 4] 34046 mov r9, rsi 34047 shr r9, 2 34048 add r9, 1 34049 test rsi, rsi 34050 je .LBB4_1490 34051 # %bb.817: 34052 mov rdi, r9 34053 and rdi, -2 34054 neg rdi 34055 xor esi, esi 34056 pxor xmm0, xmm0 34057 movdqa xmm1, xmmword ptr [rip + .LCPI4_15] # xmm1 = [1,1] 34058 .LBB4_818: # =>This Inner Loop Header: Depth=1 34059 movdqu xmm2, xmmword ptr [rcx + 8*rsi] 34060 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 16] 34061 pcmpeqq xmm2, xmm0 34062 pandn xmm2, xmm1 34063 pcmpeqq xmm3, xmm0 34064 pandn xmm3, xmm1 34065 movdqu xmmword ptr [r8 + 8*rsi], xmm2 34066 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm3 34067 movdqu xmm2, xmmword ptr [rcx + 8*rsi + 32] 34068 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 48] 34069 pcmpeqq xmm2, xmm0 34070 pandn xmm2, xmm1 34071 pcmpeqq xmm3, xmm0 34072 pandn xmm3, xmm1 34073 movdqu xmmword ptr [r8 + 8*rsi + 32], xmm2 34074 movdqu xmmword ptr [r8 + 8*rsi + 48], xmm3 34075 add rsi, 8 34076 add rdi, 2 34077 jne .LBB4_818 34078 jmp .LBB4_1491 34079 .LBB4_843: 34080 mov edx, r11d 34081 and edx, -4 34082 lea rsi, [rdx - 4] 34083 mov r9, rsi 34084 shr r9, 2 34085 add r9, 1 34086 test rsi, rsi 34087 je .LBB4_1498 34088 # %bb.844: 34089 mov rdi, r9 34090 and rdi, -2 34091 neg rdi 34092 xor esi, esi 34093 pxor xmm2, xmm2 34094 pcmpeqd xmm3, xmm3 34095 movdqa xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 34096 .LBB4_845: # =>This Inner Loop Header: Depth=1 34097 movdqu xmm5, xmmword ptr [rcx + 8*rsi] 34098 movdqu xmm6, xmmword ptr [rcx + 8*rsi + 16] 34099 movdqa xmm0, xmm4 34100 pcmpgtq xmm0, xmm5 34101 pcmpeqq xmm5, xmm2 34102 pxor xmm5, xmm3 34103 movdqa xmm1, xmm4 34104 
pcmpgtq xmm1, xmm6 34105 pcmpeqq xmm6, xmm2 34106 pxor xmm6, xmm3 34107 movdqa xmm7, xmm4 34108 blendvpd xmm7, xmm5, xmm0 34109 movdqa xmm5, xmm4 34110 movdqa xmm0, xmm1 34111 blendvpd xmm5, xmm6, xmm0 34112 movupd xmmword ptr [r8 + 8*rsi], xmm7 34113 movupd xmmword ptr [r8 + 8*rsi + 16], xmm5 34114 movdqu xmm5, xmmword ptr [rcx + 8*rsi + 32] 34115 movdqu xmm6, xmmword ptr [rcx + 8*rsi + 48] 34116 movdqa xmm0, xmm4 34117 pcmpgtq xmm0, xmm5 34118 pcmpeqq xmm5, xmm2 34119 pxor xmm5, xmm3 34120 movdqa xmm1, xmm4 34121 pcmpgtq xmm1, xmm6 34122 pcmpeqq xmm6, xmm2 34123 pxor xmm6, xmm3 34124 movdqa xmm7, xmm4 34125 blendvpd xmm7, xmm5, xmm0 34126 movdqa xmm5, xmm4 34127 movdqa xmm0, xmm1 34128 blendvpd xmm5, xmm6, xmm0 34129 movupd xmmword ptr [r8 + 8*rsi + 32], xmm7 34130 movupd xmmword ptr [r8 + 8*rsi + 48], xmm5 34131 add rsi, 8 34132 add rdi, 2 34133 jne .LBB4_845 34134 jmp .LBB4_1499 34135 .LBB4_989: 34136 movss xmm0, dword ptr [rip + .LCPI4_14] # xmm0 = mem[0],zero,zero,zero 34137 .LBB4_990: 34138 jle .LBB4_992 34139 # %bb.991: 34140 movss xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero 34141 .LBB4_992: 34142 movss dword ptr [r8 + 4*rax], xmm0 34143 jmp .LBB4_1655 34144 .LBB4_866: 34145 mov edx, eax 34146 and edx, -8 34147 xor esi, esi 34148 xorps xmm0, xmm0 34149 movdqa xmm1, xmmword ptr [rip + .LCPI4_8] # xmm1 = [1,1,1,1] 34150 .LBB4_867: # =>This Inner Loop Header: Depth=1 34151 movdqu xmm2, xmmword ptr [rcx + 4*rsi] 34152 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 16] 34153 movdqa xmm4, xmm2 34154 psrad xmm4, 31 34155 por xmm4, xmm1 34156 movdqa xmm5, xmm3 34157 psrad xmm5, 31 34158 por xmm5, xmm1 34159 cvtdq2ps xmm4, xmm4 34160 cvtdq2ps xmm5, xmm5 34161 cmpneqps xmm2, xmm0 34162 andps xmm2, xmm4 34163 cmpneqps xmm3, xmm0 34164 andps xmm3, xmm5 34165 movups xmmword ptr [r8 + 4*rsi], xmm2 34166 movups xmmword ptr [r8 + 4*rsi + 16], xmm3 34167 add rsi, 8 34168 cmp rdx, rsi 34169 jne .LBB4_867 34170 # %bb.868: 34171 cmp rdx, rax 34172 je .LBB4_1655 
34173 jmp .LBB4_869 34174 .LBB4_876: 34175 mov edx, r10d 34176 and edx, -4 34177 lea rsi, [rdx - 4] 34178 mov r9, rsi 34179 shr r9, 2 34180 add r9, 1 34181 test rsi, rsi 34182 je .LBB4_1507 34183 # %bb.877: 34184 mov rdi, r9 34185 and rdi, -2 34186 neg rdi 34187 xor esi, esi 34188 pxor xmm0, xmm0 34189 pcmpeqd xmm1, xmm1 34190 movdqa xmm2, xmmword ptr [rip + .LCPI4_15] # xmm2 = [1,1] 34191 .LBB4_878: # =>This Inner Loop Header: Depth=1 34192 movzx eax, word ptr [rcx + rsi] 34193 movd xmm3, eax 34194 movzx eax, word ptr [rcx + rsi + 2] 34195 movd xmm4, eax 34196 pcmpeqb xmm3, xmm0 34197 pxor xmm3, xmm1 34198 pmovzxbq xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 34199 pand xmm3, xmm2 34200 pcmpeqb xmm4, xmm0 34201 pxor xmm4, xmm1 34202 pmovzxbq xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero 34203 pand xmm4, xmm2 34204 movdqu xmmword ptr [r8 + 8*rsi], xmm3 34205 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm4 34206 movzx eax, word ptr [rcx + rsi + 4] 34207 movd xmm3, eax 34208 movzx eax, word ptr [rcx + rsi + 6] 34209 movd xmm4, eax 34210 pcmpeqb xmm3, xmm0 34211 pxor xmm3, xmm1 34212 pmovzxbq xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 34213 pand xmm3, xmm2 34214 pcmpeqb xmm4, xmm0 34215 pxor xmm4, xmm1 34216 pmovzxbq xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero 34217 pand xmm4, xmm2 34218 movdqu xmmword ptr [r8 + 8*rsi + 32], xmm3 34219 movdqu xmmword ptr [r8 + 8*rsi + 48], xmm4 34220 add rsi, 8 34221 add rdi, 2 34222 jne .LBB4_878 34223 jmp .LBB4_1508 34224 .LBB4_881: 34225 mov edx, eax 34226 and edx, -8 34227 lea rsi, [rdx - 8] 34228 mov r9, rsi 34229 shr r9, 3 34230 add r9, 1 34231 test rsi, rsi 34232 je .LBB4_1515 34233 # %bb.882: 34234 mov rdi, r9 34235 and rdi, -2 34236 neg rdi 34237 xor esi, esi 34238 pxor xmm0, xmm0 34239 
pcmpeqd xmm1, xmm1 34240 movdqa xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1] 34241 .LBB4_883: # =>This Inner Loop Header: Depth=1 34242 movd xmm3, dword ptr [rcx + rsi] # xmm3 = mem[0],zero,zero,zero 34243 movd xmm4, dword ptr [rcx + rsi + 4] # xmm4 = mem[0],zero,zero,zero 34244 pcmpeqb xmm3, xmm0 34245 pxor xmm3, xmm1 34246 pmovzxbd xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 34247 pand xmm3, xmm2 34248 cvtdq2ps xmm3, xmm3 34249 pcmpeqb xmm4, xmm0 34250 pxor xmm4, xmm1 34251 pmovzxbd xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero 34252 pand xmm4, xmm2 34253 cvtdq2ps xmm4, xmm4 34254 movups xmmword ptr [r8 + 4*rsi], xmm3 34255 movups xmmword ptr [r8 + 4*rsi + 16], xmm4 34256 movd xmm3, dword ptr [rcx + rsi + 8] # xmm3 = mem[0],zero,zero,zero 34257 movd xmm4, dword ptr [rcx + rsi + 12] # xmm4 = mem[0],zero,zero,zero 34258 pcmpeqb xmm3, xmm0 34259 pxor xmm3, xmm1 34260 pmovzxbd xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 34261 pand xmm3, xmm2 34262 cvtdq2ps xmm3, xmm3 34263 pcmpeqb xmm4, xmm0 34264 pxor xmm4, xmm1 34265 pmovzxbd xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero 34266 pand xmm4, xmm2 34267 cvtdq2ps xmm4, xmm4 34268 movups xmmword ptr [r8 + 4*rsi + 32], xmm3 34269 movups xmmword ptr [r8 + 4*rsi + 48], xmm4 34270 add rsi, 16 34271 add rdi, 2 34272 jne .LBB4_883 34273 jmp .LBB4_1516 34274 .LBB4_892: 34275 mov edx, eax 34276 and edx, -8 34277 lea rsi, [rdx - 8] 34278 mov r9, rsi 34279 shr r9, 3 34280 add r9, 1 34281 test rsi, rsi 34282 je .LBB4_1535 34283 # %bb.893: 34284 mov rdi, r9 34285 and rdi, -2 34286 neg rdi 34287 xor esi, esi 34288 pxor xmm0, xmm0 34289 pcmpeqd xmm1, xmm1 34290 movdqa xmm2, xmmword ptr [rip + .LCPI4_12] # xmm2 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u> 34291 .LBB4_894: # 
=>This Inner Loop Header: Depth=1 34292 movdqu xmm3, xmmword ptr [rcx + 4*rsi] 34293 movdqu xmm4, xmmword ptr [rcx + 4*rsi + 16] 34294 pcmpeqd xmm3, xmm0 34295 pxor xmm3, xmm1 34296 packssdw xmm3, xmm3 34297 packsswb xmm3, xmm3 34298 pand xmm3, xmm2 34299 pcmpeqd xmm4, xmm0 34300 pxor xmm4, xmm1 34301 packssdw xmm4, xmm4 34302 packsswb xmm4, xmm4 34303 pand xmm4, xmm2 34304 movd dword ptr [r8 + rsi], xmm3 34305 movd dword ptr [r8 + rsi + 4], xmm4 34306 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 32] 34307 movdqu xmm4, xmmword ptr [rcx + 4*rsi + 48] 34308 pcmpeqd xmm3, xmm0 34309 pxor xmm3, xmm1 34310 packssdw xmm3, xmm3 34311 packsswb xmm3, xmm3 34312 pand xmm3, xmm2 34313 pcmpeqd xmm4, xmm0 34314 pxor xmm4, xmm1 34315 packssdw xmm4, xmm4 34316 packsswb xmm4, xmm4 34317 pand xmm4, xmm2 34318 movd dword ptr [r8 + rsi + 8], xmm3 34319 movd dword ptr [r8 + rsi + 12], xmm4 34320 add rsi, 16 34321 add rdi, 2 34322 jne .LBB4_894 34323 jmp .LBB4_1536 34324 .LBB4_897: 34325 mov edx, eax 34326 and edx, -4 34327 lea rsi, [rdx - 4] 34328 mov r9, rsi 34329 shr r9, 2 34330 add r9, 1 34331 test rsi, rsi 34332 je .LBB4_1543 34333 # %bb.898: 34334 mov rdi, r9 34335 and rdi, -2 34336 neg rdi 34337 xor esi, esi 34338 xorpd xmm2, xmm2 34339 movapd xmm3, xmmword ptr [rip + .LCPI4_0] # xmm3 = [-0.0E+0,-0.0E+0] 34340 movapd xmm4, xmmword ptr [rip + .LCPI4_1] # xmm4 = [1.0E+0,1.0E+0] 34341 movdqa xmm5, xmmword ptr [rip + .LCPI4_7] # xmm5 = <0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 34342 .LBB4_899: # =>This Inner Loop Header: Depth=1 34343 movupd xmm6, xmmword ptr [rcx + 8*rsi] 34344 movupd xmm7, xmmword ptr [rcx + 8*rsi + 16] 34345 movapd xmm0, xmm6 34346 cmpeqpd xmm0, xmm2 34347 packssdw xmm0, xmm0 34348 packssdw xmm0, xmm0 34349 packsswb xmm0, xmm0 34350 movapd xmm1, xmm7 34351 cmpeqpd xmm1, xmm2 34352 packssdw xmm1, xmm1 34353 packssdw xmm1, xmm1 34354 packsswb xmm1, xmm1 34355 andpd xmm6, xmm3 34356 orpd xmm6, xmm4 34357 andpd xmm7, xmm3 34358 orpd xmm7, xmm4 34359 cvttpd2dq xmm6, xmm6 34360 
pshufb xmm6, xmm5 34361 cvttpd2dq xmm7, xmm7 34362 pshufb xmm7, xmm5 34363 pblendvb xmm6, xmm2, xmm0 34364 movdqa xmm0, xmm1 34365 pblendvb xmm7, xmm2, xmm0 34366 pextrw word ptr [r8 + rsi], xmm6, 0 34367 pextrw word ptr [r8 + rsi + 2], xmm7, 0 34368 movupd xmm6, xmmword ptr [rcx + 8*rsi + 32] 34369 movupd xmm7, xmmword ptr [rcx + 8*rsi + 48] 34370 movapd xmm0, xmm6 34371 cmpeqpd xmm0, xmm2 34372 packssdw xmm0, xmm0 34373 packssdw xmm0, xmm0 34374 packsswb xmm0, xmm0 34375 movapd xmm1, xmm7 34376 cmpeqpd xmm1, xmm2 34377 packssdw xmm1, xmm1 34378 packssdw xmm1, xmm1 34379 packsswb xmm1, xmm1 34380 andpd xmm6, xmm3 34381 orpd xmm6, xmm4 34382 andpd xmm7, xmm3 34383 orpd xmm7, xmm4 34384 cvttpd2dq xmm6, xmm6 34385 pshufb xmm6, xmm5 34386 cvttpd2dq xmm7, xmm7 34387 pshufb xmm7, xmm5 34388 pblendvb xmm6, xmm2, xmm0 34389 movdqa xmm0, xmm1 34390 pblendvb xmm7, xmm2, xmm0 34391 pextrw word ptr [r8 + rsi + 4], xmm6, 0 34392 pextrw word ptr [r8 + rsi + 6], xmm7, 0 34393 add rsi, 8 34394 add rdi, 2 34395 jne .LBB4_899 34396 jmp .LBB4_1544 34397 .LBB4_902: 34398 mov esi, r10d 34399 and esi, -32 34400 lea rax, [rsi - 32] 34401 mov r9, rax 34402 shr r9, 5 34403 add r9, 1 34404 test rax, rax 34405 je .LBB4_1552 34406 # %bb.903: 34407 mov rdi, r9 34408 and rdi, -2 34409 neg rdi 34410 xor eax, eax 34411 pxor xmm2, xmm2 34412 pcmpeqd xmm3, xmm3 34413 movdqa xmm4, xmmword ptr [rip + .LCPI4_22] # xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 34414 .LBB4_904: # =>This Inner Loop Header: Depth=1 34415 movdqu xmm5, xmmword ptr [rcx + rax] 34416 movdqu xmm6, xmmword ptr [rcx + rax + 16] 34417 movdqa xmm0, xmm4 34418 pcmpgtb xmm0, xmm5 34419 pcmpeqb xmm5, xmm2 34420 pxor xmm5, xmm3 34421 movdqa xmm1, xmm4 34422 pcmpgtb xmm1, xmm6 34423 pcmpeqb xmm6, xmm2 34424 pxor xmm6, xmm3 34425 movdqa xmm7, xmm4 34426 pblendvb xmm7, xmm5, xmm0 34427 movdqa xmm5, xmm4 34428 movdqa xmm0, xmm1 34429 pblendvb xmm5, xmm6, xmm0 34430 movdqu xmmword ptr [r8 + rax], xmm7 34431 movdqu xmmword ptr [r8 + rax + 16], 
xmm5 34432 movdqu xmm5, xmmword ptr [rcx + rax + 32] 34433 movdqu xmm6, xmmword ptr [rcx + rax + 48] 34434 movdqa xmm0, xmm4 34435 pcmpgtb xmm0, xmm5 34436 pcmpeqb xmm5, xmm2 34437 pxor xmm5, xmm3 34438 movdqa xmm1, xmm4 34439 pcmpgtb xmm1, xmm6 34440 pcmpeqb xmm6, xmm2 34441 pxor xmm6, xmm3 34442 movdqa xmm7, xmm4 34443 pblendvb xmm7, xmm5, xmm0 34444 movdqa xmm5, xmm4 34445 movdqa xmm0, xmm1 34446 pblendvb xmm5, xmm6, xmm0 34447 movdqu xmmword ptr [r8 + rax + 32], xmm7 34448 movdqu xmmword ptr [r8 + rax + 48], xmm5 34449 add rax, 64 34450 add rdi, 2 34451 jne .LBB4_904 34452 jmp .LBB4_1553 34453 .LBB4_907: 34454 mov edx, eax 34455 and edx, -4 34456 lea rsi, [rdx - 4] 34457 mov r9, rsi 34458 shr r9, 2 34459 add r9, 1 34460 test rsi, rsi 34461 je .LBB4_1561 34462 # %bb.908: 34463 mov rdi, r9 34464 and rdi, -2 34465 neg rdi 34466 xor esi, esi 34467 pxor xmm0, xmm0 34468 pcmpeqd xmm1, xmm1 34469 movdqa xmm2, xmmword ptr [rip + .LCPI4_18] # xmm2 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 34470 .LBB4_909: # =>This Inner Loop Header: Depth=1 34471 movdqu xmm3, xmmword ptr [rcx + 8*rsi] 34472 movdqu xmm4, xmmword ptr [rcx + 8*rsi + 16] 34473 pcmpeqq xmm3, xmm0 34474 pxor xmm3, xmm1 34475 packssdw xmm3, xmm3 34476 packssdw xmm3, xmm3 34477 packsswb xmm3, xmm3 34478 pand xmm3, xmm2 34479 pcmpeqq xmm4, xmm0 34480 pxor xmm4, xmm1 34481 packssdw xmm4, xmm4 34482 packssdw xmm4, xmm4 34483 packsswb xmm4, xmm4 34484 pextrw word ptr [r8 + rsi], xmm3, 0 34485 pand xmm4, xmm2 34486 pextrw word ptr [r8 + rsi + 2], xmm4, 0 34487 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 32] 34488 movdqu xmm4, xmmword ptr [rcx + 8*rsi + 48] 34489 pcmpeqq xmm3, xmm0 34490 pxor xmm3, xmm1 34491 packssdw xmm3, xmm3 34492 packssdw xmm3, xmm3 34493 packsswb xmm3, xmm3 34494 pand xmm3, xmm2 34495 pcmpeqq xmm4, xmm0 34496 pxor xmm4, xmm1 34497 packssdw xmm4, xmm4 34498 packssdw xmm4, xmm4 34499 packsswb xmm4, xmm4 34500 pextrw word ptr [r8 + rsi + 4], xmm3, 0 34501 pand xmm4, xmm2 34502 pextrw word ptr [r8 + rsi + 
6], xmm4, 0 34503 add rsi, 8 34504 add rdi, 2 34505 jne .LBB4_909 34506 jmp .LBB4_1562 34507 .LBB4_912: 34508 mov edx, eax 34509 and edx, -16 34510 lea rsi, [rdx - 16] 34511 mov r9, rsi 34512 shr r9, 4 34513 add r9, 1 34514 test rsi, rsi 34515 je .LBB4_1569 34516 # %bb.913: 34517 mov rdi, r9 34518 and rdi, -2 34519 neg rdi 34520 xor esi, esi 34521 pxor xmm0, xmm0 34522 pcmpeqd xmm1, xmm1 34523 movdqa xmm2, xmmword ptr [rip + .LCPI4_21] # xmm2 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u> 34524 .LBB4_914: # =>This Inner Loop Header: Depth=1 34525 movdqu xmm3, xmmword ptr [rcx + 2*rsi] 34526 movdqu xmm4, xmmword ptr [rcx + 2*rsi + 16] 34527 pcmpeqw xmm3, xmm0 34528 pxor xmm3, xmm1 34529 packsswb xmm3, xmm3 34530 pand xmm3, xmm2 34531 pcmpeqw xmm4, xmm0 34532 pxor xmm4, xmm1 34533 packsswb xmm4, xmm4 34534 pand xmm4, xmm2 34535 punpcklqdq xmm3, xmm4 # xmm3 = xmm3[0],xmm4[0] 34536 movdqu xmmword ptr [r8 + rsi], xmm3 34537 movdqu xmm3, xmmword ptr [rcx + 2*rsi + 32] 34538 movdqu xmm4, xmmword ptr [rcx + 2*rsi + 48] 34539 pcmpeqw xmm3, xmm0 34540 pxor xmm3, xmm1 34541 packsswb xmm3, xmm3 34542 pand xmm3, xmm2 34543 pcmpeqw xmm4, xmm0 34544 pxor xmm4, xmm1 34545 packsswb xmm4, xmm4 34546 pand xmm4, xmm2 34547 punpcklqdq xmm3, xmm4 # xmm3 = xmm3[0],xmm4[0] 34548 movdqu xmmword ptr [r8 + rsi + 16], xmm3 34549 add rsi, 32 34550 add rdi, 2 34551 jne .LBB4_914 34552 jmp .LBB4_1570 34553 .LBB4_917: 34554 mov esi, r10d 34555 and esi, -16 34556 lea rax, [rsi - 16] 34557 mov r9, rax 34558 shr r9, 4 34559 add r9, 1 34560 test rax, rax 34561 je .LBB4_1577 34562 # %bb.918: 34563 mov rdi, r9 34564 and rdi, -2 34565 neg rdi 34566 xor eax, eax 34567 pxor xmm2, xmm2 34568 pcmpeqd xmm3, xmm3 34569 movdqa xmm4, xmmword ptr [rip + .LCPI4_21] # xmm4 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u> 34570 .LBB4_919: # =>This Inner Loop Header: Depth=1 34571 movdqu xmm5, xmmword ptr [rcx + 2*rax] 34572 movdqu xmm6, xmmword ptr [rcx + 2*rax + 16] 34573 movdqa xmm0, xmm5 34574 pcmpgtw xmm0, xmm2 34575 packsswb xmm0, 
xmm0 34576 movdqa xmm1, xmm6 34577 pcmpgtw xmm1, xmm2 34578 packsswb xmm1, xmm1 34579 pcmpeqw xmm5, xmm2 34580 pxor xmm5, xmm3 34581 packsswb xmm5, xmm5 34582 pcmpeqw xmm6, xmm2 34583 pxor xmm6, xmm3 34584 packsswb xmm6, xmm6 34585 pblendvb xmm5, xmm4, xmm0 34586 movdqa xmm0, xmm1 34587 pblendvb xmm6, xmm4, xmm0 34588 punpcklqdq xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 34589 movdqu xmmword ptr [r8 + rax], xmm5 34590 movdqu xmm5, xmmword ptr [rcx + 2*rax + 32] 34591 movdqu xmm6, xmmword ptr [rcx + 2*rax + 48] 34592 movdqa xmm0, xmm5 34593 pcmpgtw xmm0, xmm2 34594 packsswb xmm0, xmm0 34595 movdqa xmm1, xmm6 34596 pcmpgtw xmm1, xmm2 34597 packsswb xmm1, xmm1 34598 pcmpeqw xmm5, xmm2 34599 pxor xmm5, xmm3 34600 packsswb xmm5, xmm5 34601 pcmpeqw xmm6, xmm2 34602 pxor xmm6, xmm3 34603 packsswb xmm6, xmm6 34604 pblendvb xmm5, xmm4, xmm0 34605 movdqa xmm0, xmm1 34606 pblendvb xmm6, xmm4, xmm0 34607 punpcklqdq xmm5, xmm6 # xmm5 = xmm5[0],xmm6[0] 34608 movdqu xmmword ptr [r8 + rax + 16], xmm5 34609 add rax, 32 34610 add rdi, 2 34611 jne .LBB4_919 34612 jmp .LBB4_1578 34613 .LBB4_922: 34614 mov esi, r10d 34615 and esi, -4 34616 lea rax, [rsi - 4] 34617 mov r9, rax 34618 shr r9, 2 34619 add r9, 1 34620 test rax, rax 34621 je .LBB4_1586 34622 # %bb.923: 34623 mov rdi, r9 34624 and rdi, -2 34625 neg rdi 34626 xor eax, eax 34627 pxor xmm2, xmm2 34628 pcmpeqd xmm3, xmm3 34629 movdqa xmm4, xmmword ptr [rip + .LCPI4_18] # xmm4 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 34630 .LBB4_924: # =>This Inner Loop Header: Depth=1 34631 movdqu xmm5, xmmword ptr [rcx + 8*rax] 34632 movdqu xmm6, xmmword ptr [rcx + 8*rax + 16] 34633 movdqa xmm0, xmm5 34634 pcmpgtq xmm0, xmm2 34635 packssdw xmm0, xmm0 34636 packssdw xmm0, xmm0 34637 packsswb xmm0, xmm0 34638 movdqa xmm1, xmm6 34639 pcmpgtq xmm1, xmm2 34640 packssdw xmm1, xmm1 34641 packssdw xmm1, xmm1 34642 packsswb xmm1, xmm1 34643 pcmpeqq xmm5, xmm2 34644 pxor xmm5, xmm3 34645 packssdw xmm5, xmm5 34646 packssdw xmm5, xmm5 34647 packsswb xmm5, xmm5 34648 
pcmpeqq xmm6, xmm2 34649 pxor xmm6, xmm3 34650 packssdw xmm6, xmm6 34651 packssdw xmm6, xmm6 34652 packsswb xmm6, xmm6 34653 pblendvb xmm5, xmm4, xmm0 34654 movdqa xmm0, xmm1 34655 pblendvb xmm6, xmm4, xmm0 34656 pextrw word ptr [r8 + rax], xmm5, 0 34657 pextrw word ptr [r8 + rax + 2], xmm6, 0 34658 movdqu xmm5, xmmword ptr [rcx + 8*rax + 32] 34659 movdqu xmm6, xmmword ptr [rcx + 8*rax + 48] 34660 movdqa xmm0, xmm5 34661 pcmpgtq xmm0, xmm2 34662 packssdw xmm0, xmm0 34663 packssdw xmm0, xmm0 34664 packsswb xmm0, xmm0 34665 movdqa xmm1, xmm6 34666 pcmpgtq xmm1, xmm2 34667 packssdw xmm1, xmm1 34668 packssdw xmm1, xmm1 34669 packsswb xmm1, xmm1 34670 pcmpeqq xmm5, xmm2 34671 pxor xmm5, xmm3 34672 packssdw xmm5, xmm5 34673 packssdw xmm5, xmm5 34674 packsswb xmm5, xmm5 34675 pcmpeqq xmm6, xmm2 34676 pxor xmm6, xmm3 34677 packssdw xmm6, xmm6 34678 packssdw xmm6, xmm6 34679 packsswb xmm6, xmm6 34680 pblendvb xmm5, xmm4, xmm0 34681 movdqa xmm0, xmm1 34682 pblendvb xmm6, xmm4, xmm0 34683 pextrw word ptr [r8 + rax + 4], xmm5, 0 34684 pextrw word ptr [r8 + rax + 6], xmm6, 0 34685 add rax, 8 34686 add rdi, 2 34687 jne .LBB4_924 34688 jmp .LBB4_1587 34689 .LBB4_927: 34690 mov edx, r10d 34691 and edx, -8 34692 lea rsi, [rdx - 8] 34693 mov r9, rsi 34694 shr r9, 3 34695 add r9, 1 34696 test rsi, rsi 34697 je .LBB4_1595 34698 # %bb.928: 34699 mov rdi, r9 34700 and rdi, -2 34701 neg rdi 34702 xor esi, esi 34703 xorps xmm4, xmm4 34704 pcmpeqd xmm8, xmm8 34705 movdqa xmm6, xmmword ptr [rip + .LCPI4_12] # xmm6 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u> 34706 .LBB4_929: # =>This Inner Loop Header: Depth=1 34707 movups xmm0, xmmword ptr [rcx + 4*rsi] 34708 movups xmm1, xmmword ptr [rcx + 4*rsi + 16] 34709 movaps xmm2, xmm0 34710 cmpeqps xmm2, xmm4 34711 packssdw xmm2, xmm2 34712 packsswb xmm2, xmm2 34713 movaps xmm3, xmm1 34714 cmpeqps xmm3, xmm4 34715 packssdw xmm3, xmm3 34716 packsswb xmm3, xmm3 34717 pcmpgtd xmm0, xmm8 34718 packssdw xmm0, xmm0 34719 packsswb xmm0, xmm0 34720 pcmpgtd xmm1, 
xmm8 34721 packssdw xmm1, xmm1 34722 packsswb xmm1, xmm1 34723 pcmpeqd xmm7, xmm7 34724 pblendvb xmm7, xmm6, xmm0 34725 pcmpeqd xmm5, xmm5 34726 movdqa xmm0, xmm1 34727 pblendvb xmm5, xmm6, xmm0 34728 movdqa xmm0, xmm2 34729 pblendvb xmm7, xmm4, xmm0 34730 movdqa xmm0, xmm3 34731 pblendvb xmm5, xmm4, xmm0 34732 movd dword ptr [r8 + rsi], xmm7 34733 movd dword ptr [r8 + rsi + 4], xmm5 34734 movups xmm0, xmmword ptr [rcx + 4*rsi + 32] 34735 movups xmm1, xmmword ptr [rcx + 4*rsi + 48] 34736 movaps xmm2, xmm0 34737 cmpeqps xmm2, xmm4 34738 packssdw xmm2, xmm2 34739 packsswb xmm2, xmm2 34740 movaps xmm3, xmm1 34741 cmpeqps xmm3, xmm4 34742 packssdw xmm3, xmm3 34743 packsswb xmm3, xmm3 34744 pcmpgtd xmm0, xmm8 34745 packssdw xmm0, xmm0 34746 packsswb xmm0, xmm0 34747 pcmpgtd xmm1, xmm8 34748 packssdw xmm1, xmm1 34749 pcmpeqd xmm5, xmm5 34750 pblendvb xmm5, xmm6, xmm0 34751 packsswb xmm1, xmm1 34752 pcmpeqd xmm7, xmm7 34753 movdqa xmm0, xmm1 34754 pblendvb xmm7, xmm6, xmm0 34755 movdqa xmm0, xmm2 34756 pblendvb xmm5, xmm4, xmm0 34757 movdqa xmm0, xmm3 34758 pblendvb xmm7, xmm4, xmm0 34759 movd dword ptr [r8 + rsi + 8], xmm5 34760 movd dword ptr [r8 + rsi + 12], xmm7 34761 add rsi, 16 34762 add rdi, 2 34763 jne .LBB4_929 34764 jmp .LBB4_1596 34765 .LBB4_932: 34766 mov edx, eax 34767 and edx, -32 34768 lea rsi, [rdx - 32] 34769 mov r9, rsi 34770 shr r9, 5 34771 add r9, 1 34772 test rsi, rsi 34773 je .LBB4_1604 34774 # %bb.933: 34775 mov rdi, r9 34776 and rdi, -2 34777 neg rdi 34778 xor esi, esi 34779 pxor xmm0, xmm0 34780 movdqa xmm1, xmmword ptr [rip + .LCPI4_22] # xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 34781 .LBB4_934: # =>This Inner Loop Header: Depth=1 34782 movdqu xmm2, xmmword ptr [rcx + rsi] 34783 movdqu xmm3, xmmword ptr [rcx + rsi + 16] 34784 pcmpeqb xmm2, xmm0 34785 pandn xmm2, xmm1 34786 pcmpeqb xmm3, xmm0 34787 pandn xmm3, xmm1 34788 movdqu xmmword ptr [r8 + rsi], xmm2 34789 movdqu xmmword ptr [r8 + rsi + 16], xmm3 34790 movdqu xmm2, xmmword ptr [rcx + rsi + 
32] 34791 movdqu xmm3, xmmword ptr [rcx + rsi + 48] 34792 pcmpeqb xmm2, xmm0 34793 pandn xmm2, xmm1 34794 pcmpeqb xmm3, xmm0 34795 pandn xmm3, xmm1 34796 movdqu xmmword ptr [r8 + rsi + 32], xmm2 34797 movdqu xmmword ptr [r8 + rsi + 48], xmm3 34798 add rsi, 64 34799 add rdi, 2 34800 jne .LBB4_934 34801 jmp .LBB4_1605 34802 .LBB4_937: 34803 mov esi, r10d 34804 and esi, -8 34805 lea rax, [rsi - 8] 34806 mov r9, rax 34807 shr r9, 3 34808 add r9, 1 34809 test rax, rax 34810 je .LBB4_1612 34811 # %bb.938: 34812 mov rdi, r9 34813 and rdi, -2 34814 neg rdi 34815 xor eax, eax 34816 pxor xmm2, xmm2 34817 pcmpeqd xmm3, xmm3 34818 movdqa xmm4, xmmword ptr [rip + .LCPI4_12] # xmm4 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u> 34819 .LBB4_939: # =>This Inner Loop Header: Depth=1 34820 movdqu xmm5, xmmword ptr [rcx + 4*rax] 34821 movdqu xmm6, xmmword ptr [rcx + 4*rax + 16] 34822 movdqa xmm0, xmm5 34823 pcmpgtd xmm0, xmm2 34824 packssdw xmm0, xmm0 34825 packsswb xmm0, xmm0 34826 movdqa xmm1, xmm6 34827 pcmpgtd xmm1, xmm2 34828 packssdw xmm1, xmm1 34829 packsswb xmm1, xmm1 34830 pcmpeqd xmm5, xmm2 34831 pxor xmm5, xmm3 34832 packssdw xmm5, xmm5 34833 packsswb xmm5, xmm5 34834 pcmpeqd xmm6, xmm2 34835 pxor xmm6, xmm3 34836 packssdw xmm6, xmm6 34837 packsswb xmm6, xmm6 34838 pblendvb xmm5, xmm4, xmm0 34839 movdqa xmm0, xmm1 34840 pblendvb xmm6, xmm4, xmm0 34841 movd dword ptr [r8 + rax], xmm5 34842 movd dword ptr [r8 + rax + 4], xmm6 34843 movdqu xmm5, xmmword ptr [rcx + 4*rax + 32] 34844 movdqu xmm6, xmmword ptr [rcx + 4*rax + 48] 34845 movdqa xmm0, xmm5 34846 pcmpgtd xmm0, xmm2 34847 packssdw xmm0, xmm0 34848 packsswb xmm0, xmm0 34849 movdqa xmm1, xmm6 34850 pcmpgtd xmm1, xmm2 34851 packssdw xmm1, xmm1 34852 packsswb xmm1, xmm1 34853 pcmpeqd xmm5, xmm2 34854 pxor xmm5, xmm3 34855 packssdw xmm5, xmm5 34856 packsswb xmm5, xmm5 34857 pcmpeqd xmm6, xmm2 34858 pxor xmm6, xmm3 34859 packssdw xmm6, xmm6 34860 packsswb xmm6, xmm6 34861 pblendvb xmm5, xmm4, xmm0 34862 movdqa xmm0, xmm1 34863 pblendvb 
xmm6, xmm4, xmm0 34864 movd dword ptr [r8 + rax + 8], xmm5 34865 movd dword ptr [r8 + rax + 12], xmm6 34866 add rax, 16 34867 add rdi, 2 34868 jne .LBB4_939 34869 jmp .LBB4_1613 34870 .LBB4_942: 34871 mov edx, r10d 34872 and edx, -8 34873 lea rsi, [rdx - 8] 34874 mov r9, rsi 34875 shr r9, 3 34876 add r9, 1 34877 test rsi, rsi 34878 je .LBB4_1621 34879 # %bb.943: 34880 mov rdi, r9 34881 and rdi, -2 34882 neg rdi 34883 xor esi, esi 34884 pxor xmm0, xmm0 34885 movdqa xmm1, xmmword ptr [rip + .LCPI4_8] # xmm1 = [1,1,1,1] 34886 .LBB4_944: # =>This Inner Loop Header: Depth=1 34887 movdqu xmm2, xmmword ptr [rcx + 4*rsi] 34888 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 16] 34889 pcmpeqd xmm2, xmm0 34890 pandn xmm2, xmm1 34891 pcmpeqd xmm3, xmm0 34892 pandn xmm3, xmm1 34893 movdqu xmmword ptr [r8 + 4*rsi], xmm2 34894 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm3 34895 movdqu xmm2, xmmword ptr [rcx + 4*rsi + 32] 34896 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 48] 34897 pcmpeqd xmm2, xmm0 34898 pandn xmm2, xmm1 34899 pcmpeqd xmm3, xmm0 34900 pandn xmm3, xmm1 34901 movdqu xmmword ptr [r8 + 4*rsi + 32], xmm2 34902 movdqu xmmword ptr [r8 + 4*rsi + 48], xmm3 34903 add rsi, 16 34904 add rdi, 2 34905 jne .LBB4_944 34906 jmp .LBB4_1622 34907 .LBB4_950: 34908 mov edx, r10d 34909 and edx, -8 34910 lea rsi, [rdx - 8] 34911 mov r9, rsi 34912 shr r9, 3 34913 add r9, 1 34914 test rsi, rsi 34915 je .LBB4_1629 34916 # %bb.951: 34917 mov rdi, r9 34918 and rdi, -2 34919 neg rdi 34920 xor esi, esi 34921 pxor xmm2, xmm2 34922 pcmpeqd xmm3, xmm3 34923 movaps xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 34924 .LBB4_952: # =>This Inner Loop Header: Depth=1 34925 movd xmm5, dword ptr [rcx + rsi] # xmm5 = mem[0],zero,zero,zero 34926 movd xmm6, dword ptr [rcx + rsi + 4] # xmm6 = mem[0],zero,zero,zero 34927 movdqa xmm0, xmm5 34928 pcmpgtb xmm0, xmm2 34929 pmovsxbd xmm0, xmm0 34930 movdqa xmm1, xmm6 34931 pcmpgtb xmm1, xmm2 34932 pmovsxbd xmm1, xmm1 34933 pcmpeqb xmm5, xmm2 34934 pxor xmm5, xmm3 34935 
pmovsxbd xmm5, xmm5 34936 pcmpeqb xmm6, xmm2 34937 pxor xmm6, xmm3 34938 pmovsxbd xmm6, xmm6 34939 blendvps xmm5, xmm4, xmm0 34940 movdqa xmm0, xmm1 34941 blendvps xmm6, xmm4, xmm0 34942 movups xmmword ptr [r8 + 4*rsi], xmm5 34943 movups xmmword ptr [r8 + 4*rsi + 16], xmm6 34944 movd xmm5, dword ptr [rcx + rsi + 8] # xmm5 = mem[0],zero,zero,zero 34945 movd xmm6, dword ptr [rcx + rsi + 12] # xmm6 = mem[0],zero,zero,zero 34946 movdqa xmm0, xmm5 34947 pcmpgtb xmm0, xmm2 34948 pmovsxbd xmm0, xmm0 34949 movdqa xmm1, xmm6 34950 pcmpgtb xmm1, xmm2 34951 pmovsxbd xmm1, xmm1 34952 pcmpeqb xmm5, xmm2 34953 pxor xmm5, xmm3 34954 pmovsxbd xmm5, xmm5 34955 pcmpeqb xmm6, xmm2 34956 pxor xmm6, xmm3 34957 pmovsxbd xmm6, xmm6 34958 blendvps xmm5, xmm4, xmm0 34959 movdqa xmm0, xmm1 34960 blendvps xmm6, xmm4, xmm0 34961 movups xmmword ptr [r8 + 4*rsi + 32], xmm5 34962 movups xmmword ptr [r8 + 4*rsi + 48], xmm6 34963 add rsi, 16 34964 add rdi, 2 34965 jne .LBB4_952 34966 jmp .LBB4_1630 34967 .LBB4_974: 34968 mov edx, r10d 34969 and edx, -8 34970 lea rsi, [rdx - 8] 34971 mov r9, rsi 34972 shr r9, 3 34973 add r9, 1 34974 test rsi, rsi 34975 je .LBB4_1638 34976 # %bb.975: 34977 mov rdi, r9 34978 and rdi, -2 34979 neg rdi 34980 xor esi, esi 34981 pxor xmm0, xmm0 34982 pcmpeqd xmm1, xmm1 34983 movdqa xmm2, xmmword ptr [rip + .LCPI4_8] # xmm2 = [1,1,1,1] 34984 .LBB4_976: # =>This Inner Loop Header: Depth=1 34985 movd xmm3, dword ptr [rcx + rsi] # xmm3 = mem[0],zero,zero,zero 34986 movd xmm4, dword ptr [rcx + rsi + 4] # xmm4 = mem[0],zero,zero,zero 34987 pcmpeqb xmm3, xmm0 34988 pxor xmm3, xmm1 34989 pmovzxbd xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 34990 pand xmm3, xmm2 34991 pcmpeqb xmm4, xmm0 34992 pxor xmm4, xmm1 34993 pmovzxbd xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero 34994 pand xmm4, xmm2 34995 movdqu xmmword ptr [r8 + 4*rsi], xmm3 34996 movdqu 
xmmword ptr [r8 + 4*rsi + 16], xmm4 34997 movd xmm3, dword ptr [rcx + rsi + 8] # xmm3 = mem[0],zero,zero,zero 34998 movd xmm4, dword ptr [rcx + rsi + 12] # xmm4 = mem[0],zero,zero,zero 34999 pcmpeqb xmm3, xmm0 35000 pxor xmm3, xmm1 35001 pmovzxbd xmm3, xmm3 # xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 35002 pand xmm3, xmm2 35003 pcmpeqb xmm4, xmm0 35004 pxor xmm4, xmm1 35005 pmovzxbd xmm4, xmm4 # xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero 35006 pand xmm4, xmm2 35007 movdqu xmmword ptr [r8 + 4*rsi + 32], xmm3 35008 movdqu xmmword ptr [r8 + 4*rsi + 48], xmm4 35009 add rsi, 16 35010 add rdi, 2 35011 jne .LBB4_976 35012 jmp .LBB4_1639 35013 .LBB4_979: 35014 mov edx, r11d 35015 and edx, -8 35016 lea rsi, [rdx - 8] 35017 mov r9, rsi 35018 shr r9, 3 35019 add r9, 1 35020 test rsi, rsi 35021 je .LBB4_1646 35022 # %bb.980: 35023 mov rdi, r9 35024 and rdi, -2 35025 neg rdi 35026 xor esi, esi 35027 pxor xmm2, xmm2 35028 pcmpeqd xmm3, xmm3 35029 movdqa xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 35030 .LBB4_981: # =>This Inner Loop Header: Depth=1 35031 movdqu xmm5, xmmword ptr [rcx + 4*rsi] 35032 movdqu xmm6, xmmword ptr [rcx + 4*rsi + 16] 35033 movdqa xmm0, xmm4 35034 pcmpgtd xmm0, xmm5 35035 pcmpeqd xmm5, xmm2 35036 pxor xmm5, xmm3 35037 movdqa xmm1, xmm4 35038 pcmpgtd xmm1, xmm6 35039 pcmpeqd xmm6, xmm2 35040 pxor xmm6, xmm3 35041 movdqa xmm7, xmm4 35042 blendvps xmm7, xmm5, xmm0 35043 movdqa xmm5, xmm4 35044 movdqa xmm0, xmm1 35045 blendvps xmm5, xmm6, xmm0 35046 movups xmmword ptr [r8 + 4*rsi], xmm7 35047 movups xmmword ptr [r8 + 4*rsi + 16], xmm5 35048 movdqu xmm5, xmmword ptr [rcx + 4*rsi + 32] 35049 movdqu xmm6, xmmword ptr [rcx + 4*rsi + 48] 35050 movdqa xmm0, xmm4 35051 pcmpgtd xmm0, xmm5 35052 pcmpeqd xmm5, xmm2 35053 pxor xmm5, xmm3 35054 movdqa xmm1, xmm4 35055 pcmpgtd xmm1, xmm6 35056 pcmpeqd xmm6, xmm2 35057 pxor xmm6, xmm3 35058 movdqa 
xmm7, xmm4 35059 blendvps xmm7, xmm5, xmm0 35060 movdqa xmm5, xmm4 35061 movdqa xmm0, xmm1 35062 blendvps xmm5, xmm6, xmm0 35063 movups xmmword ptr [r8 + 4*rsi + 32], xmm7 35064 movups xmmword ptr [r8 + 4*rsi + 48], xmm5 35065 add rsi, 16 35066 add rdi, 2 35067 jne .LBB4_981 35068 jmp .LBB4_1647 35069 .LBB4_1475: 35070 movd xmm0, dword ptr [rip + .LCPI4_14] # xmm0 = mem[0],zero,zero,zero 35071 .LBB4_1476: 35072 jle .LBB4_1478 35073 # %bb.1477: 35074 movd xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero 35075 .LBB4_1478: 35076 movd dword ptr [r8 + 4*rdx], xmm0 35077 or rdx, 1 35078 .LBB4_1479: 35079 add rsi, rax 35080 je .LBB4_1655 35081 # %bb.1480: 35082 movd xmm0, dword ptr [rip + .LCPI4_14] # xmm0 = mem[0],zero,zero,zero 35083 movd xmm1, dword ptr [rip + .LCPI4_5] # xmm1 = mem[0],zero,zero,zero 35084 jmp .LBB4_1482 35085 .LBB4_1481: # in Loop: Header=BB4_1482 Depth=1 35086 movd dword ptr [r8 + 4*rdx + 4], xmm3 35087 add rdx, 2 35088 cmp rax, rdx 35089 je .LBB4_1655 35090 .LBB4_1482: # =>This Inner Loop Header: Depth=1 35091 cmp byte ptr [rcx + rdx], 0 35092 movdqa xmm2, xmm0 35093 jne .LBB4_1483 35094 # %bb.1486: # in Loop: Header=BB4_1482 Depth=1 35095 pxor xmm2, xmm2 35096 movdqa xmm3, xmm1 35097 jle .LBB4_1487 35098 .LBB4_1484: # in Loop: Header=BB4_1482 Depth=1 35099 movd dword ptr [r8 + 4*rdx], xmm3 35100 cmp byte ptr [rcx + rdx + 1], 0 35101 movdqa xmm2, xmm0 35102 jne .LBB4_1485 35103 .LBB4_1488: # in Loop: Header=BB4_1482 Depth=1 35104 pxor xmm2, xmm2 35105 movdqa xmm3, xmm1 35106 jg .LBB4_1481 35107 jmp .LBB4_1489 35108 .LBB4_1483: # in Loop: Header=BB4_1482 Depth=1 35109 movdqa xmm3, xmm1 35110 jg .LBB4_1484 35111 .LBB4_1487: # in Loop: Header=BB4_1482 Depth=1 35112 movdqa xmm3, xmm2 35113 movd dword ptr [r8 + 4*rdx], xmm3 35114 cmp byte ptr [rcx + rdx + 1], 0 35115 movdqa xmm2, xmm0 35116 je .LBB4_1488 35117 .LBB4_1485: # in Loop: Header=BB4_1482 Depth=1 35118 movdqa xmm3, xmm1 35119 jg .LBB4_1481 35120 .LBB4_1489: # in Loop: 
Header=BB4_1482 Depth=1 35121 movdqa xmm3, xmm2 35122 jmp .LBB4_1481 35123 .LBB4_994: 35124 xor esi, esi 35125 .LBB4_995: 35126 test r9b, 1 35127 je .LBB4_997 35128 # %bb.996: 35129 movdqu xmm0, xmmword ptr [rcx + 8*rsi] 35130 movdqu xmm1, xmmword ptr [rcx + 8*rsi + 16] 35131 pxor xmm2, xmm2 35132 pcmpeqq xmm0, xmm2 35133 pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] 35134 movdqa xmm3, xmmword ptr [rip + .LCPI4_16] # xmm3 = <1,1,u,u> 35135 pandn xmm0, xmm3 35136 pcmpeqq xmm1, xmm2 35137 pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] 35138 pandn xmm1, xmm3 35139 punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] 35140 movdqu xmmword ptr [r8 + 4*rsi], xmm0 35141 .LBB4_997: 35142 cmp rdx, rax 35143 je .LBB4_1655 35144 .LBB4_998: # =>This Inner Loop Header: Depth=1 35145 xor esi, esi 35146 cmp qword ptr [rcx + 8*rdx], 0 35147 setne sil 35148 mov dword ptr [r8 + 4*rdx], esi 35149 add rdx, 1 35150 cmp rax, rdx 35151 jne .LBB4_998 35152 jmp .LBB4_1655 35153 .LBB4_999: 35154 xor esi, esi 35155 .LBB4_1000: 35156 test r9b, 1 35157 je .LBB4_1002 35158 # %bb.1001: 35159 movq xmm0, qword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero 35160 movq xmm1, qword ptr [rcx + 4*rsi + 8] # xmm1 = mem[0],zero 35161 pxor xmm2, xmm2 35162 pcmpeqd xmm0, xmm2 35163 pcmpeqd xmm3, xmm3 35164 pxor xmm0, xmm3 35165 pmovzxdq xmm0, xmm0 # xmm0 = xmm0[0],zero,xmm0[1],zero 35166 movdqa xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 35167 pand xmm0, xmm4 35168 pcmpeqd xmm1, xmm2 35169 pxor xmm1, xmm3 35170 pmovzxdq xmm1, xmm1 # xmm1 = xmm1[0],zero,xmm1[1],zero 35171 pand xmm1, xmm4 35172 movdqu xmmword ptr [r8 + 8*rsi], xmm0 35173 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm1 35174 .LBB4_1002: 35175 cmp rdx, rax 35176 je .LBB4_1655 35177 .LBB4_1003: # =>This Inner Loop Header: Depth=1 35178 xor esi, esi 35179 cmp dword ptr [rcx + 4*rdx], 0 35180 setne sil 35181 mov qword ptr [r8 + 8*rdx], rsi 35182 add rdx, 1 35183 cmp rax, rdx 35184 jne .LBB4_1003 35185 jmp .LBB4_1655 35186 .LBB4_1004: 35187 xor edi, edi 35188 
.LBB4_1005: 35189 test r9b, 1 35190 je .LBB4_1007 35191 # %bb.1006: 35192 movupd xmm0, xmmword ptr [rcx + 8*rdi] 35193 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 35194 andpd xmm1, xmm0 35195 orpd xmm1, xmmword ptr [rip + .LCPI4_1] 35196 movsd xmm2, qword ptr [rip + .LCPI4_6] # xmm2 = mem[0],zero 35197 movapd xmm3, xmm1 35198 subsd xmm3, xmm2 35199 cvttsd2si rax, xmm3 35200 xor rax, r11 35201 cvttsd2si rdx, xmm1 35202 ucomisd xmm1, xmm2 35203 cmovae rdx, rax 35204 movq xmm3, rdx 35205 pshufd xmm1, xmm1, 238 # xmm1 = xmm1[2,3,2,3] 35206 movdqa xmm4, xmm1 35207 subsd xmm4, xmm2 35208 cvttsd2si rax, xmm4 35209 xor rax, r11 35210 cvttsd2si rdx, xmm1 35211 ucomisd xmm1, xmm2 35212 xorpd xmm1, xmm1 35213 cmovae rdx, rax 35214 movq xmm2, rdx 35215 punpcklqdq xmm3, xmm2 # xmm3 = xmm3[0],xmm2[0] 35216 cmpneqpd xmm1, xmm0 35217 andpd xmm1, xmm3 35218 movupd xmmword ptr [r8 + 8*rdi], xmm1 35219 .LBB4_1007: 35220 cmp rsi, r10 35221 je .LBB4_1655 35222 .LBB4_1008: 35223 movapd xmm0, xmmword ptr [rip + .LCPI4_0] # xmm0 = [-0.0E+0,-0.0E+0] 35224 movsd xmm1, qword ptr [rip + .LCPI4_2] # xmm1 = mem[0],zero 35225 movsd xmm2, qword ptr [rip + .LCPI4_6] # xmm2 = mem[0],zero 35226 xor eax, eax 35227 xorpd xmm3, xmm3 35228 .LBB4_1009: # =>This Inner Loop Header: Depth=1 35229 movsd xmm4, qword ptr [rcx + 8*rsi] # xmm4 = mem[0],zero 35230 movapd xmm5, xmm4 35231 andpd xmm5, xmm0 35232 orpd xmm5, xmm1 35233 movapd xmm6, xmm5 35234 subsd xmm6, xmm2 35235 cvttsd2si rdx, xmm6 35236 xor rdx, r11 35237 cvttsd2si rdi, xmm5 35238 ucomisd xmm5, xmm2 35239 cmovae rdi, rdx 35240 ucomisd xmm3, xmm4 35241 cmove rdi, rax 35242 mov qword ptr [r8 + 8*rsi], rdi 35243 add rsi, 1 35244 cmp r10, rsi 35245 jne .LBB4_1009 35246 jmp .LBB4_1655 35247 .LBB4_1010: 35248 xor esi, esi 35249 .LBB4_1011: 35250 test r9b, 1 35251 je .LBB4_1013 35252 # %bb.1012: 35253 movd xmm0, dword ptr [rcx + 2*rsi] # xmm0 = mem[0],zero,zero,zero 35254 movd xmm1, dword ptr [rcx + 2*rsi + 4] # xmm1 = 
mem[0],zero,zero,zero 35255 pxor xmm2, xmm2 35256 pcmpeqw xmm0, xmm2 35257 pcmpeqd xmm3, xmm3 35258 pxor xmm0, xmm3 35259 pmovzxwq xmm0, xmm0 # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 35260 movdqa xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 35261 pand xmm0, xmm4 35262 pcmpeqw xmm1, xmm2 35263 pxor xmm1, xmm3 35264 pmovzxwq xmm1, xmm1 # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 35265 pand xmm1, xmm4 35266 movdqu xmmword ptr [r8 + 8*rsi], xmm0 35267 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm1 35268 .LBB4_1013: 35269 cmp rdx, rax 35270 je .LBB4_1655 35271 .LBB4_1014: # =>This Inner Loop Header: Depth=1 35272 xor esi, esi 35273 cmp word ptr [rcx + 2*rdx], 0 35274 setne sil 35275 mov qword ptr [r8 + 8*rdx], rsi 35276 add rdx, 1 35277 cmp rax, rdx 35278 jne .LBB4_1014 35279 jmp .LBB4_1655 35280 .LBB4_1015: 35281 xor esi, esi 35282 .LBB4_1016: 35283 test r9b, 1 35284 je .LBB4_1018 35285 # %bb.1017: 35286 movd xmm2, dword ptr [rcx + 2*rsi] # xmm2 = mem[0],zero,zero,zero 35287 movd xmm3, dword ptr [rcx + 2*rsi + 4] # xmm3 = mem[0],zero,zero,zero 35288 xorpd xmm4, xmm4 35289 movdqa xmm0, xmm2 35290 pcmpgtw xmm0, xmm4 35291 pmovsxwq xmm0, xmm0 35292 movdqa xmm1, xmm3 35293 pcmpgtw xmm1, xmm4 35294 pmovsxwq xmm1, xmm1 35295 pcmpeqw xmm2, xmm4 35296 pcmpeqd xmm5, xmm5 35297 pxor xmm2, xmm5 35298 pmovsxwq xmm2, xmm2 35299 pcmpeqw xmm3, xmm4 35300 pxor xmm3, xmm5 35301 pmovsxwq xmm3, xmm3 35302 movapd xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 35303 blendvpd xmm2, xmm4, xmm0 35304 movdqa xmm0, xmm1 35305 blendvpd xmm3, xmm4, xmm0 35306 movupd xmmword ptr [r8 + 8*rsi], xmm2 35307 movupd xmmword ptr [r8 + 8*rsi + 16], xmm3 35308 .LBB4_1018: 35309 cmp rdx, r10 35310 je .LBB4_1655 35311 .LBB4_1019: 35312 mov esi, 1 35313 .LBB4_1020: # =>This Inner Loop Header: Depth=1 35314 movzx edi, word ptr [rcx + 2*rdx] 35315 xor eax, eax 35316 test di, di 35317 setne al 35318 neg rax 35319 test di, di 35320 cmovg rax, rsi 35321 mov qword ptr [r8 + 8*rdx], 
rax 35322 add rdx, 1 35323 cmp r10, rdx 35324 jne .LBB4_1020 35325 jmp .LBB4_1655 35326 .LBB4_993: 35327 movmskps ecx, xmm0 35328 and ecx, 1 35329 neg ecx 35330 or ecx, 1 35331 xorps xmm0, xmm0 35332 cvtsi2ss xmm0, ecx 35333 movss xmm1, dword ptr [rip + .LCPI4_9] # xmm1 = mem[0],zero,zero,zero 35334 movaps xmm2, xmm0 35335 subss xmm2, xmm1 35336 cvttss2si rcx, xmm2 35337 movabs rdx, -9223372036854775808 35338 xor rdx, rcx 35339 cvttss2si rcx, xmm0 35340 ucomiss xmm0, xmm1 35341 cmovae rcx, rdx 35342 mov qword ptr [r8 + 8*rax], rcx 35343 jmp .LBB4_1655 35344 .LBB4_1021: 35345 xor esi, esi 35346 .LBB4_1022: 35347 test r9b, 1 35348 je .LBB4_1024 35349 # %bb.1023: 35350 movq xmm2, qword ptr [rcx + 4*rsi] # xmm2 = mem[0],zero 35351 movq xmm3, qword ptr [rcx + 4*rsi + 8] # xmm3 = mem[0],zero 35352 xorpd xmm4, xmm4 35353 movdqa xmm0, xmm2 35354 pcmpgtd xmm0, xmm4 35355 pmovsxdq xmm0, xmm0 35356 movdqa xmm1, xmm3 35357 pcmpgtd xmm1, xmm4 35358 pmovsxdq xmm1, xmm1 35359 pcmpeqd xmm2, xmm4 35360 pcmpeqd xmm5, xmm5 35361 pxor xmm2, xmm5 35362 pmovsxdq xmm2, xmm2 35363 pcmpeqd xmm3, xmm4 35364 pxor xmm3, xmm5 35365 pmovsxdq xmm3, xmm3 35366 movapd xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 35367 blendvpd xmm2, xmm4, xmm0 35368 movdqa xmm0, xmm1 35369 blendvpd xmm3, xmm4, xmm0 35370 movupd xmmword ptr [r8 + 8*rsi], xmm2 35371 movupd xmmword ptr [r8 + 8*rsi + 16], xmm3 35372 .LBB4_1024: 35373 cmp rdx, r10 35374 je .LBB4_1655 35375 .LBB4_1025: 35376 mov esi, 1 35377 .LBB4_1026: # =>This Inner Loop Header: Depth=1 35378 mov edi, dword ptr [rcx + 4*rdx] 35379 xor eax, eax 35380 test edi, edi 35381 setne al 35382 neg rax 35383 test edi, edi 35384 cmovg rax, rsi 35385 mov qword ptr [r8 + 8*rdx], rax 35386 add rdx, 1 35387 cmp r10, rdx 35388 jne .LBB4_1026 35389 jmp .LBB4_1655 35390 .LBB4_1027: 35391 xor esi, esi 35392 .LBB4_1028: 35393 test r9b, 1 35394 je .LBB4_1030 35395 # %bb.1029: 35396 movdqu xmm0, xmmword ptr [rcx + 8*rsi] 35397 movdqu xmm1, xmmword ptr [rcx + 8*rsi + 
16] 35398 pxor xmm2, xmm2 35399 pcmpeqq xmm0, xmm2 35400 pcmpeqd xmm3, xmm3 35401 pxor xmm0, xmm3 35402 packssdw xmm0, xmm0 35403 packssdw xmm0, xmm0 35404 movdqa xmm4, xmmword ptr [rip + .LCPI4_17] # xmm4 = <1,1,u,u,u,u,u,u> 35405 pand xmm0, xmm4 35406 pcmpeqq xmm1, xmm2 35407 pxor xmm1, xmm3 35408 packssdw xmm1, xmm1 35409 packssdw xmm1, xmm1 35410 pand xmm1, xmm4 35411 movd dword ptr [r8 + 2*rsi], xmm0 35412 movd dword ptr [r8 + 2*rsi + 4], xmm1 35413 .LBB4_1030: 35414 cmp rdx, rax 35415 je .LBB4_1655 35416 .LBB4_1031: # =>This Inner Loop Header: Depth=1 35417 xor esi, esi 35418 cmp qword ptr [rcx + 8*rdx], 0 35419 setne sil 35420 mov word ptr [r8 + 2*rdx], si 35421 add rdx, 1 35422 cmp rax, rdx 35423 jne .LBB4_1031 35424 jmp .LBB4_1655 35425 .LBB4_1032: 35426 xor esi, esi 35427 .LBB4_1033: 35428 test r9b, 1 35429 je .LBB4_1035 35430 # %bb.1034: 35431 movdqu xmm0, xmmword ptr [rcx + 8*rsi] 35432 movdqu xmm1, xmmword ptr [rcx + 8*rsi + 16] 35433 pxor xmm2, xmm2 35434 pcmpeqq xmm0, xmm2 35435 pcmpeqd xmm3, xmm3 35436 pxor xmm0, xmm3 35437 packssdw xmm0, xmm0 35438 packssdw xmm0, xmm0 35439 movdqa xmm4, xmmword ptr [rip + .LCPI4_17] # xmm4 = <1,1,u,u,u,u,u,u> 35440 pand xmm0, xmm4 35441 pcmpeqq xmm1, xmm2 35442 pxor xmm1, xmm3 35443 packssdw xmm1, xmm1 35444 packssdw xmm1, xmm1 35445 pand xmm1, xmm4 35446 movd dword ptr [r8 + 2*rsi], xmm0 35447 movd dword ptr [r8 + 2*rsi + 4], xmm1 35448 .LBB4_1035: 35449 cmp rdx, rax 35450 je .LBB4_1655 35451 .LBB4_1036: # =>This Inner Loop Header: Depth=1 35452 xor esi, esi 35453 cmp qword ptr [rcx + 8*rdx], 0 35454 setne sil 35455 mov word ptr [r8 + 2*rdx], si 35456 add rdx, 1 35457 cmp rax, rdx 35458 jne .LBB4_1036 35459 jmp .LBB4_1655 35460 .LBB4_1037: 35461 xor esi, esi 35462 .LBB4_1038: 35463 test r9b, 1 35464 je .LBB4_1040 35465 # %bb.1039: 35466 movdqu xmm2, xmmword ptr [rcx + 8*rsi] 35467 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 16] 35468 pxor xmm4, xmm4 35469 movdqa xmm0, xmm2 35470 pcmpgtq xmm0, xmm4 35471 packssdw xmm0, 
xmm0 35472 packssdw xmm0, xmm0 35473 movdqa xmm1, xmm3 35474 pcmpgtq xmm1, xmm4 35475 packssdw xmm1, xmm1 35476 packssdw xmm1, xmm1 35477 pcmpeqq xmm2, xmm4 35478 pcmpeqd xmm5, xmm5 35479 pxor xmm2, xmm5 35480 packssdw xmm2, xmm2 35481 packssdw xmm2, xmm2 35482 pcmpeqq xmm3, xmm4 35483 pxor xmm3, xmm5 35484 packssdw xmm3, xmm3 35485 packssdw xmm3, xmm3 35486 movdqa xmm4, xmmword ptr [rip + .LCPI4_17] # xmm4 = <1,1,u,u,u,u,u,u> 35487 pblendvb xmm2, xmm4, xmm0 35488 movdqa xmm0, xmm1 35489 pblendvb xmm3, xmm4, xmm0 35490 movd dword ptr [r8 + 2*rsi], xmm2 35491 movd dword ptr [r8 + 2*rsi + 4], xmm3 35492 .LBB4_1040: 35493 cmp rdx, r10 35494 je .LBB4_1655 35495 .LBB4_1041: 35496 mov esi, 1 35497 .LBB4_1042: # =>This Inner Loop Header: Depth=1 35498 mov rdi, qword ptr [rcx + 8*rdx] 35499 xor eax, eax 35500 test rdi, rdi 35501 setne al 35502 neg eax 35503 test rdi, rdi 35504 cmovg eax, esi 35505 mov word ptr [r8 + 2*rdx], ax 35506 add rdx, 1 35507 cmp r10, rdx 35508 jne .LBB4_1042 35509 jmp .LBB4_1655 35510 .LBB4_1043: 35511 xor esi, esi 35512 .LBB4_1044: 35513 test r9b, 1 35514 je .LBB4_1046 35515 # %bb.1045: 35516 movdqu xmm2, xmmword ptr [rcx + 4*rsi] 35517 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 16] 35518 pxor xmm4, xmm4 35519 movdqa xmm0, xmm2 35520 pcmpgtd xmm0, xmm4 35521 packssdw xmm0, xmm0 35522 movdqa xmm1, xmm3 35523 pcmpgtd xmm1, xmm4 35524 packssdw xmm1, xmm1 35525 pcmpeqd xmm2, xmm4 35526 pcmpeqd xmm5, xmm5 35527 pxor xmm2, xmm5 35528 packssdw xmm2, xmm2 35529 pcmpeqd xmm3, xmm4 35530 pxor xmm3, xmm5 35531 packssdw xmm3, xmm3 35532 movdqa xmm4, xmmword ptr [rip + .LCPI4_11] # xmm4 = <1,1,1,1,u,u,u,u> 35533 pblendvb xmm2, xmm4, xmm0 35534 movdqa xmm0, xmm1 35535 pblendvb xmm3, xmm4, xmm0 35536 punpcklqdq xmm2, xmm3 # xmm2 = xmm2[0],xmm3[0] 35537 movdqu xmmword ptr [r8 + 2*rsi], xmm2 35538 .LBB4_1046: 35539 cmp rdx, r10 35540 je .LBB4_1655 35541 .LBB4_1047: 35542 mov esi, 1 35543 .LBB4_1048: # =>This Inner Loop Header: Depth=1 35544 mov edi, dword ptr [rcx + 
4*rdx] 35545 xor eax, eax 35546 test edi, edi 35547 setne al 35548 neg eax 35549 test edi, edi 35550 cmovg eax, esi 35551 mov word ptr [r8 + 2*rdx], ax 35552 add rdx, 1 35553 cmp r10, rdx 35554 jne .LBB4_1048 35555 jmp .LBB4_1655 35556 .LBB4_1049: 35557 xor esi, esi 35558 .LBB4_1050: 35559 test r9b, 1 35560 je .LBB4_1052 35561 # %bb.1051: 35562 movdqu xmm2, xmmword ptr [rcx + 4*rsi] 35563 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 16] 35564 pxor xmm4, xmm4 35565 movdqa xmm0, xmm2 35566 pcmpgtd xmm0, xmm4 35567 packssdw xmm0, xmm0 35568 movdqa xmm1, xmm3 35569 pcmpgtd xmm1, xmm4 35570 packssdw xmm1, xmm1 35571 pcmpeqd xmm2, xmm4 35572 pcmpeqd xmm5, xmm5 35573 pxor xmm2, xmm5 35574 packssdw xmm2, xmm2 35575 pcmpeqd xmm3, xmm4 35576 pxor xmm3, xmm5 35577 packssdw xmm3, xmm3 35578 movdqa xmm4, xmmword ptr [rip + .LCPI4_11] # xmm4 = <1,1,1,1,u,u,u,u> 35579 pblendvb xmm2, xmm4, xmm0 35580 movdqa xmm0, xmm1 35581 pblendvb xmm3, xmm4, xmm0 35582 punpcklqdq xmm2, xmm3 # xmm2 = xmm2[0],xmm3[0] 35583 movdqu xmmword ptr [r8 + 2*rsi], xmm2 35584 .LBB4_1052: 35585 cmp rdx, r10 35586 je .LBB4_1655 35587 .LBB4_1053: 35588 mov esi, 1 35589 .LBB4_1054: # =>This Inner Loop Header: Depth=1 35590 mov edi, dword ptr [rcx + 4*rdx] 35591 xor eax, eax 35592 test edi, edi 35593 setne al 35594 neg eax 35595 test edi, edi 35596 cmovg eax, esi 35597 mov word ptr [r8 + 2*rdx], ax 35598 add rdx, 1 35599 cmp r10, rdx 35600 jne .LBB4_1054 35601 jmp .LBB4_1655 35602 .LBB4_1055: 35603 xor esi, esi 35604 .LBB4_1056: 35605 test r9b, 1 35606 je .LBB4_1058 35607 # %bb.1057: 35608 movd xmm0, dword ptr [rcx + 2*rsi] # xmm0 = mem[0],zero,zero,zero 35609 movd xmm1, dword ptr [rcx + 2*rsi + 4] # xmm1 = mem[0],zero,zero,zero 35610 pxor xmm2, xmm2 35611 pcmpeqw xmm0, xmm2 35612 pcmpeqd xmm3, xmm3 35613 pxor xmm0, xmm3 35614 pmovzxwq xmm0, xmm0 # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 35615 movdqa xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 35616 pand xmm0, xmm4 35617 pcmpeqw xmm1, xmm2 35618 
pxor xmm1, xmm3 35619 pmovzxwq xmm1, xmm1 # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 35620 pand xmm1, xmm4 35621 movdqu xmmword ptr [r8 + 8*rsi], xmm0 35622 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm1 35623 .LBB4_1058: 35624 cmp rdx, rax 35625 je .LBB4_1655 35626 .LBB4_1059: # =>This Inner Loop Header: Depth=1 35627 xor esi, esi 35628 cmp word ptr [rcx + 2*rdx], 0 35629 setne sil 35630 mov qword ptr [r8 + 8*rdx], rsi 35631 add rdx, 1 35632 cmp rax, rdx 35633 jne .LBB4_1059 35634 jmp .LBB4_1655 35635 .LBB4_1060: 35636 xor esi, esi 35637 .LBB4_1061: 35638 test r9b, 1 35639 je .LBB4_1063 35640 # %bb.1062: 35641 movq xmm2, qword ptr [rcx + 4*rsi] # xmm2 = mem[0],zero 35642 movq xmm3, qword ptr [rcx + 4*rsi + 8] # xmm3 = mem[0],zero 35643 xorpd xmm4, xmm4 35644 movdqa xmm0, xmm2 35645 pcmpgtd xmm0, xmm4 35646 pmovsxdq xmm0, xmm0 35647 movdqa xmm1, xmm3 35648 pcmpgtd xmm1, xmm4 35649 pmovsxdq xmm1, xmm1 35650 pcmpeqd xmm2, xmm4 35651 pcmpeqd xmm5, xmm5 35652 pxor xmm2, xmm5 35653 pmovsxdq xmm2, xmm2 35654 pcmpeqd xmm3, xmm4 35655 pxor xmm3, xmm5 35656 pmovsxdq xmm3, xmm3 35657 movapd xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 35658 blendvpd xmm2, xmm4, xmm0 35659 movdqa xmm0, xmm1 35660 blendvpd xmm3, xmm4, xmm0 35661 movupd xmmword ptr [r8 + 8*rsi], xmm2 35662 movupd xmmword ptr [r8 + 8*rsi + 16], xmm3 35663 .LBB4_1063: 35664 cmp rdx, r10 35665 je .LBB4_1655 35666 .LBB4_1064: 35667 mov esi, 1 35668 .LBB4_1065: # =>This Inner Loop Header: Depth=1 35669 mov edi, dword ptr [rcx + 4*rdx] 35670 xor eax, eax 35671 test edi, edi 35672 setne al 35673 neg rax 35674 test edi, edi 35675 cmovg rax, rsi 35676 mov qword ptr [r8 + 8*rdx], rax 35677 add rdx, 1 35678 cmp r10, rdx 35679 jne .LBB4_1065 35680 jmp .LBB4_1655 35681 .LBB4_1066: 35682 xor esi, esi 35683 .LBB4_1067: 35684 test r9b, 1 35685 je .LBB4_1069 35686 # %bb.1068: 35687 movdqu xmm2, xmmword ptr [rcx + 4*rsi] 35688 movdqu xmm3, xmmword ptr [rcx + 4*rsi + 16] 35689 xorps xmm4, xmm4 35690 movdqa xmm0, 
xmm2 35691 pcmpgtd xmm0, xmm4 35692 movdqa xmm1, xmm3 35693 pcmpgtd xmm1, xmm4 35694 pcmpeqd xmm2, xmm4 35695 pcmpeqd xmm5, xmm5 35696 pxor xmm2, xmm5 35697 cvtdq2ps xmm2, xmm2 35698 pcmpeqd xmm3, xmm4 35699 pxor xmm3, xmm5 35700 cvtdq2ps xmm3, xmm3 35701 movaps xmm4, xmmword ptr [rip + .LCPI4_19] # xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 35702 blendvps xmm2, xmm4, xmm0 35703 movdqa xmm0, xmm1 35704 blendvps xmm3, xmm4, xmm0 35705 movups xmmword ptr [r8 + 4*rsi], xmm2 35706 movups xmmword ptr [r8 + 4*rsi + 16], xmm3 35707 .LBB4_1069: 35708 cmp rdx, rax 35709 je .LBB4_1655 35710 .LBB4_1070: 35711 movd xmm0, dword ptr [rip + .LCPI4_14] # xmm0 = mem[0],zero,zero,zero 35712 movd xmm1, dword ptr [rip + .LCPI4_5] # xmm1 = mem[0],zero,zero,zero 35713 jmp .LBB4_1072 35714 .LBB4_1071: # in Loop: Header=BB4_1072 Depth=1 35715 movd dword ptr [r8 + 4*rdx], xmm3 35716 add rdx, 1 35717 cmp rax, rdx 35718 je .LBB4_1655 35719 .LBB4_1072: # =>This Inner Loop Header: Depth=1 35720 cmp dword ptr [rcx + 4*rdx], 0 35721 movdqa xmm2, xmm0 35722 jne .LBB4_1074 35723 # %bb.1073: # in Loop: Header=BB4_1072 Depth=1 35724 pxor xmm2, xmm2 35725 .LBB4_1074: # in Loop: Header=BB4_1072 Depth=1 35726 movdqa xmm3, xmm1 35727 jg .LBB4_1071 35728 # %bb.1075: # in Loop: Header=BB4_1072 Depth=1 35729 movdqa xmm3, xmm2 35730 jmp .LBB4_1071 35731 .LBB4_1076: 35732 xor edi, edi 35733 .LBB4_1077: 35734 test r9b, 1 35735 je .LBB4_1079 35736 # %bb.1078: 35737 movupd xmm0, xmmword ptr [rcx + 8*rdi] 35738 movupd xmm1, xmmword ptr [rcx + 8*rdi + 16] 35739 xorpd xmm2, xmm2 35740 movapd xmm3, xmm0 35741 cmpeqpd xmm3, xmm2 35742 shufps xmm3, xmm3, 232 # xmm3 = xmm3[0,2,2,3] 35743 cmpeqpd xmm2, xmm1 35744 shufps xmm2, xmm2, 232 # xmm2 = xmm2[0,2,2,3] 35745 movapd xmm4, xmmword ptr [rip + .LCPI4_0] # xmm4 = [-0.0E+0,-0.0E+0] 35746 andpd xmm0, xmm4 35747 movapd xmm5, xmmword ptr [rip + .LCPI4_1] # xmm5 = [1.0E+0,1.0E+0] 35748 orpd xmm0, xmm5 35749 andpd xmm1, xmm4 35750 orpd xmm1, xmm5 35751 cvttpd2dq xmm0, xmm0 35752 
cvttpd2dq xmm1, xmm1 35753 andnps xmm3, xmm0 35754 andnps xmm2, xmm1 35755 movlhps xmm3, xmm2 # xmm3 = xmm3[0],xmm2[0] 35756 movups xmmword ptr [r8 + 4*rdi], xmm3 35757 .LBB4_1079: 35758 cmp rsi, rax 35759 je .LBB4_1655 35760 .LBB4_1080: 35761 xorpd xmm0, xmm0 35762 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 35763 movsd xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero 35764 .LBB4_1081: # =>This Inner Loop Header: Depth=1 35765 movsd xmm3, qword ptr [rcx + 8*rsi] # xmm3 = mem[0],zero 35766 ucomisd xmm0, xmm3 35767 andpd xmm3, xmm1 35768 orpd xmm3, xmm2 35769 cvttsd2si edx, xmm3 35770 cmove edx, r10d 35771 mov dword ptr [r8 + 4*rsi], edx 35772 add rsi, 1 35773 cmp rax, rsi 35774 jne .LBB4_1081 35775 jmp .LBB4_1655 35776 .LBB4_1082: 35777 xor esi, esi 35778 .LBB4_1083: 35779 test r9b, 1 35780 je .LBB4_1085 35781 # %bb.1084: 35782 movdqu xmm0, xmmword ptr [rcx + 8*rsi] 35783 movdqu xmm1, xmmword ptr [rcx + 8*rsi + 16] 35784 pxor xmm2, xmm2 35785 pcmpeqq xmm0, xmm2 35786 pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] 35787 movdqa xmm3, xmmword ptr [rip + .LCPI4_16] # xmm3 = <1,1,u,u> 35788 pandn xmm0, xmm3 35789 pcmpeqq xmm1, xmm2 35790 pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] 35791 pandn xmm1, xmm3 35792 punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] 35793 movdqu xmmword ptr [r8 + 4*rsi], xmm0 35794 .LBB4_1085: 35795 cmp rdx, rax 35796 je .LBB4_1655 35797 .LBB4_1086: # =>This Inner Loop Header: Depth=1 35798 xor esi, esi 35799 cmp qword ptr [rcx + 8*rdx], 0 35800 setne sil 35801 mov dword ptr [r8 + 4*rdx], esi 35802 add rdx, 1 35803 cmp rax, rdx 35804 jne .LBB4_1086 35805 jmp .LBB4_1655 35806 .LBB4_1087: 35807 xor esi, esi 35808 .LBB4_1088: 35809 test r9b, 1 35810 je .LBB4_1090 35811 # %bb.1089: 35812 movq xmm0, qword ptr [rcx + 2*rsi] # xmm0 = mem[0],zero 35813 movq xmm1, qword ptr [rcx + 2*rsi + 8] # xmm1 = mem[0],zero 35814 pxor xmm2, xmm2 35815 pcmpeqw xmm0, xmm2 35816 pcmpeqd xmm3, xmm3 35817 pxor xmm0, xmm3 35818 pmovzxwd xmm0, 
xmm0 # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 35819 movdqa xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 35820 pand xmm0, xmm4 35821 pcmpeqw xmm1, xmm2 35822 pxor xmm1, xmm3 35823 pmovzxwd xmm1, xmm1 # xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 35824 pand xmm1, xmm4 35825 movdqu xmmword ptr [r8 + 4*rsi], xmm0 35826 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm1 35827 .LBB4_1090: 35828 cmp rdx, rax 35829 je .LBB4_1655 35830 .LBB4_1091: # =>This Inner Loop Header: Depth=1 35831 xor esi, esi 35832 cmp word ptr [rcx + 2*rdx], 0 35833 setne sil 35834 mov dword ptr [r8 + 4*rdx], esi 35835 add rdx, 1 35836 cmp rax, rdx 35837 jne .LBB4_1091 35838 jmp .LBB4_1655 35839 .LBB4_1092: 35840 xor esi, esi 35841 .LBB4_1093: 35842 test r9b, 1 35843 je .LBB4_1095 35844 # %bb.1094: 35845 movq xmm2, qword ptr [rcx + 2*rsi] # xmm2 = mem[0],zero 35846 movq xmm3, qword ptr [rcx + 2*rsi + 8] # xmm3 = mem[0],zero 35847 xorps xmm4, xmm4 35848 movdqa xmm0, xmm2 35849 pcmpgtw xmm0, xmm4 35850 pmovsxwd xmm0, xmm0 35851 movdqa xmm1, xmm3 35852 pcmpgtw xmm1, xmm4 35853 pmovsxwd xmm1, xmm1 35854 pcmpeqw xmm2, xmm4 35855 pcmpeqd xmm5, xmm5 35856 pxor xmm2, xmm5 35857 pmovsxwd xmm2, xmm2 35858 pcmpeqw xmm3, xmm4 35859 pxor xmm3, xmm5 35860 pmovsxwd xmm3, xmm3 35861 movaps xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 35862 blendvps xmm2, xmm4, xmm0 35863 movdqa xmm0, xmm1 35864 blendvps xmm3, xmm4, xmm0 35865 movups xmmword ptr [r8 + 4*rsi], xmm2 35866 movups xmmword ptr [r8 + 4*rsi + 16], xmm3 35867 .LBB4_1095: 35868 cmp rdx, r10 35869 je .LBB4_1655 35870 .LBB4_1096: 35871 mov esi, 1 35872 .LBB4_1097: # =>This Inner Loop Header: Depth=1 35873 movzx edi, word ptr [rcx + 2*rdx] 35874 xor eax, eax 35875 test di, di 35876 setne al 35877 neg eax 35878 test di, di 35879 cmovg eax, esi 35880 mov dword ptr [r8 + 4*rdx], eax 35881 add rdx, 1 35882 cmp r10, rdx 35883 jne .LBB4_1097 35884 jmp .LBB4_1655 35885 .LBB4_1098: 35886 xor esi, esi 35887 .LBB4_1099: 
35888 test r9b, 1 35889 je .LBB4_1101 35890 # %bb.1100: 35891 movdqu xmm2, xmmword ptr [rcx + 8*rsi] 35892 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 16] 35893 xorps xmm4, xmm4 35894 movdqa xmm0, xmm2 35895 pcmpgtq xmm0, xmm4 35896 pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] 35897 movdqa xmm1, xmm3 35898 pcmpgtq xmm1, xmm4 35899 pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] 35900 pcmpeqq xmm2, xmm4 35901 pshufd xmm2, xmm2, 232 # xmm2 = xmm2[0,2,2,3] 35902 pcmpeqd xmm5, xmm5 35903 pxor xmm2, xmm5 35904 pcmpeqq xmm3, xmm4 35905 pshufd xmm3, xmm3, 232 # xmm3 = xmm3[0,2,2,3] 35906 pxor xmm3, xmm5 35907 movaps xmm4, xmmword ptr [rip + .LCPI4_16] # xmm4 = <1,1,u,u> 35908 blendvps xmm2, xmm4, xmm0 35909 movdqa xmm0, xmm1 35910 blendvps xmm3, xmm4, xmm0 35911 movlhps xmm2, xmm3 # xmm2 = xmm2[0],xmm3[0] 35912 movups xmmword ptr [r8 + 4*rsi], xmm2 35913 .LBB4_1101: 35914 cmp rdx, r10 35915 je .LBB4_1655 35916 .LBB4_1102: 35917 mov esi, 1 35918 .LBB4_1103: # =>This Inner Loop Header: Depth=1 35919 mov rdi, qword ptr [rcx + 8*rdx] 35920 xor eax, eax 35921 test rdi, rdi 35922 setne al 35923 neg eax 35924 test rdi, rdi 35925 cmovg eax, esi 35926 mov dword ptr [r8 + 4*rdx], eax 35927 add rdx, 1 35928 cmp r10, rdx 35929 jne .LBB4_1103 35930 jmp .LBB4_1655 35931 .LBB4_1106: 35932 xor edi, edi 35933 .LBB4_1107: 35934 test r9b, 1 35935 je .LBB4_1109 35936 # %bb.1108: 35937 movupd xmm3, xmmword ptr [rcx + 8*rdi] 35938 movupd xmm2, xmmword ptr [rcx + 8*rdi + 16] 35939 xorpd xmm1, xmm1 35940 movapd xmm0, xmm3 35941 cmpeqpd xmm0, xmm1 35942 shufps xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] 35943 cmpeqpd xmm1, xmm2 35944 movapd xmm4, xmmword ptr [rip + .LCPI4_0] # xmm4 = [-0.0E+0,-0.0E+0] 35945 andpd xmm3, xmm4 35946 movapd xmm5, xmmword ptr [rip + .LCPI4_1] # xmm5 = [1.0E+0,1.0E+0] 35947 orpd xmm3, xmm5 35948 andpd xmm2, xmm4 35949 orpd xmm2, xmm5 35950 pshufd xmm4, xmm3, 238 # xmm4 = xmm3[2,3,2,3] 35951 cvttsd2si rax, xmm4 35952 cvttsd2si rdx, xmm3 35953 movd xmm3, edx 35954 pinsrd xmm3, 
eax, 1 35955 pshufd xmm4, xmm2, 238 # xmm4 = xmm2[2,3,2,3] 35956 cvttsd2si rax, xmm4 35957 cvttsd2si rdx, xmm2 35958 shufps xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] 35959 movd xmm2, edx 35960 pinsrd xmm2, eax, 1 35961 andnps xmm0, xmm3 35962 andnps xmm1, xmm2 35963 movlhps xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] 35964 movups xmmword ptr [r8 + 4*rdi], xmm0 35965 .LBB4_1109: 35966 cmp rsi, r11 35967 je .LBB4_1655 35968 .LBB4_1110: 35969 xorpd xmm0, xmm0 35970 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 35971 movsd xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero 35972 .LBB4_1111: # =>This Inner Loop Header: Depth=1 35973 movsd xmm3, qword ptr [rcx + 8*rsi] # xmm3 = mem[0],zero 35974 ucomisd xmm0, xmm3 35975 andpd xmm3, xmm1 35976 orpd xmm3, xmm2 35977 cvttsd2si rax, xmm3 35978 cmove eax, r10d 35979 mov dword ptr [r8 + 4*rsi], eax 35980 add rsi, 1 35981 cmp r11, rsi 35982 jne .LBB4_1111 35983 jmp .LBB4_1655 35984 .LBB4_1112: 35985 xor esi, esi 35986 .LBB4_1113: 35987 test r9b, 1 35988 je .LBB4_1115 35989 # %bb.1114: 35990 movq xmm0, qword ptr [rcx + 2*rsi] # xmm0 = mem[0],zero 35991 movq xmm1, qword ptr [rcx + 2*rsi + 8] # xmm1 = mem[0],zero 35992 pxor xmm2, xmm2 35993 pcmpeqw xmm0, xmm2 35994 pcmpeqd xmm3, xmm3 35995 pxor xmm0, xmm3 35996 pmovzxwd xmm0, xmm0 # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 35997 movdqa xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 35998 pand xmm0, xmm4 35999 pcmpeqw xmm1, xmm2 36000 pxor xmm1, xmm3 36001 pmovzxwd xmm1, xmm1 # xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 36002 pand xmm1, xmm4 36003 movdqu xmmword ptr [r8 + 4*rsi], xmm0 36004 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm1 36005 .LBB4_1115: 36006 cmp rdx, rax 36007 je .LBB4_1655 36008 .LBB4_1116: # =>This Inner Loop Header: Depth=1 36009 xor esi, esi 36010 cmp word ptr [rcx + 2*rdx], 0 36011 setne sil 36012 mov dword ptr [r8 + 4*rdx], esi 36013 add rdx, 1 36014 cmp rax, rdx 36015 jne .LBB4_1116 36016 jmp .LBB4_1655 
36017 .LBB4_1117: 36018 xor esi, esi 36019 .LBB4_1118: 36020 test r9b, 1 36021 je .LBB4_1120 36022 # %bb.1119: 36023 movq xmm2, qword ptr [rcx + 2*rsi] # xmm2 = mem[0],zero 36024 movq xmm3, qword ptr [rcx + 2*rsi + 8] # xmm3 = mem[0],zero 36025 xorps xmm4, xmm4 36026 movdqa xmm0, xmm2 36027 pcmpgtw xmm0, xmm4 36028 pmovsxwd xmm0, xmm0 36029 movdqa xmm1, xmm3 36030 pcmpgtw xmm1, xmm4 36031 pmovsxwd xmm1, xmm1 36032 pcmpeqw xmm2, xmm4 36033 pcmpeqd xmm5, xmm5 36034 pxor xmm2, xmm5 36035 pmovsxwd xmm2, xmm2 36036 pcmpeqw xmm3, xmm4 36037 pxor xmm3, xmm5 36038 pmovsxwd xmm3, xmm3 36039 movaps xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 36040 blendvps xmm2, xmm4, xmm0 36041 movdqa xmm0, xmm1 36042 blendvps xmm3, xmm4, xmm0 36043 movups xmmword ptr [r8 + 4*rsi], xmm2 36044 movups xmmword ptr [r8 + 4*rsi + 16], xmm3 36045 .LBB4_1120: 36046 cmp rdx, r10 36047 je .LBB4_1655 36048 .LBB4_1121: 36049 mov esi, 1 36050 .LBB4_1122: # =>This Inner Loop Header: Depth=1 36051 movzx edi, word ptr [rcx + 2*rdx] 36052 xor eax, eax 36053 test di, di 36054 setne al 36055 neg eax 36056 test di, di 36057 cmovg eax, esi 36058 mov dword ptr [r8 + 4*rdx], eax 36059 add rdx, 1 36060 cmp r10, rdx 36061 jne .LBB4_1122 36062 jmp .LBB4_1655 36063 .LBB4_1123: 36064 xor esi, esi 36065 .LBB4_1124: 36066 test r9b, 1 36067 je .LBB4_1126 36068 # %bb.1125: 36069 movdqu xmm2, xmmword ptr [rcx + 8*rsi] 36070 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 16] 36071 xorps xmm4, xmm4 36072 movdqa xmm0, xmm2 36073 pcmpgtq xmm0, xmm4 36074 pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] 36075 movdqa xmm1, xmm3 36076 pcmpgtq xmm1, xmm4 36077 pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] 36078 pcmpeqq xmm2, xmm4 36079 pshufd xmm2, xmm2, 232 # xmm2 = xmm2[0,2,2,3] 36080 pcmpeqd xmm5, xmm5 36081 pxor xmm2, xmm5 36082 pcmpeqq xmm3, xmm4 36083 pshufd xmm3, xmm3, 232 # xmm3 = xmm3[0,2,2,3] 36084 pxor xmm3, xmm5 36085 movaps xmm4, xmmword ptr [rip + .LCPI4_16] # xmm4 = <1,1,u,u> 36086 blendvps xmm2, xmm4, xmm0 36087 
movdqa xmm0, xmm1 36088 blendvps xmm3, xmm4, xmm0 36089 movlhps xmm2, xmm3 # xmm2 = xmm2[0],xmm3[0] 36090 movups xmmword ptr [r8 + 4*rsi], xmm2 36091 .LBB4_1126: 36092 cmp rdx, r10 36093 je .LBB4_1655 36094 .LBB4_1127: 36095 mov esi, 1 36096 .LBB4_1128: # =>This Inner Loop Header: Depth=1 36097 mov rdi, qword ptr [rcx + 8*rdx] 36098 xor eax, eax 36099 test rdi, rdi 36100 setne al 36101 neg eax 36102 test rdi, rdi 36103 cmovg eax, esi 36104 mov dword ptr [r8 + 4*rdx], eax 36105 add rdx, 1 36106 cmp r10, rdx 36107 jne .LBB4_1128 36108 jmp .LBB4_1655 36109 .LBB4_1129: 36110 xor esi, esi 36111 .LBB4_1130: 36112 test r9b, 1 36113 je .LBB4_1132 36114 # %bb.1131: 36115 movups xmm0, xmmword ptr [rcx + 4*rsi] 36116 xorps xmm1, xmm1 36117 cmpneqps xmm1, xmm0 36118 psrad xmm0, 31 36119 por xmm0, xmmword ptr [rip + .LCPI4_8] 36120 cvtdq2ps xmm2, xmm0 36121 movaps xmm3, xmmword ptr [rip + .LCPI4_10] # xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] 36122 movaps xmm0, xmm2 36123 cmpltps xmm0, xmm3 36124 cvttps2dq xmm4, xmm2 36125 subps xmm2, xmm3 36126 cvttps2dq xmm2, xmm2 36127 xorps xmm2, xmmword ptr [rip + .LCPI4_4] 36128 blendvps xmm2, xmm4, xmm0 36129 andps xmm1, xmm2 36130 movups xmmword ptr [r8 + 4*rsi], xmm1 36131 .LBB4_1132: 36132 cmp rdx, rax 36133 je .LBB4_1655 36134 .LBB4_1133: 36135 xorps xmm0, xmm0 36136 jmp .LBB4_1135 36137 .LBB4_1134: # in Loop: Header=BB4_1135 Depth=1 36138 mov dword ptr [r8 + 4*rdx], esi 36139 add rdx, 1 36140 cmp rax, rdx 36141 je .LBB4_1655 36142 .LBB4_1135: # =>This Inner Loop Header: Depth=1 36143 movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero 36144 xor esi, esi 36145 ucomiss xmm0, xmm1 36146 je .LBB4_1134 36147 # %bb.1136: # in Loop: Header=BB4_1135 Depth=1 36148 movmskps esi, xmm1 36149 and esi, 1 36150 neg esi 36151 or esi, 1 36152 xorps xmm1, xmm1 36153 cvtsi2ss xmm1, esi 36154 cvttss2si rsi, xmm1 36155 jmp .LBB4_1134 36156 .LBB4_1137: 36157 xor esi, esi 36158 .LBB4_1138: 36159 test r9b, 1 36160 je 
.LBB4_1140 36161 # %bb.1139: 36162 movdqu xmm0, xmmword ptr [rcx + 4*rsi] 36163 movdqu xmm1, xmmword ptr [rcx + 4*rsi + 16] 36164 pxor xmm2, xmm2 36165 pcmpeqd xmm0, xmm2 36166 pcmpeqd xmm3, xmm3 36167 pxor xmm0, xmm3 36168 packssdw xmm0, xmm0 36169 movdqa xmm4, xmmword ptr [rip + .LCPI4_11] # xmm4 = <1,1,1,1,u,u,u,u> 36170 pand xmm0, xmm4 36171 pcmpeqd xmm1, xmm2 36172 pxor xmm1, xmm3 36173 packssdw xmm1, xmm1 36174 pand xmm1, xmm4 36175 punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] 36176 movdqu xmmword ptr [r8 + 2*rsi], xmm0 36177 .LBB4_1140: 36178 cmp rdx, rax 36179 je .LBB4_1655 36180 .LBB4_1141: # =>This Inner Loop Header: Depth=1 36181 xor esi, esi 36182 cmp dword ptr [rcx + 4*rdx], 0 36183 setne sil 36184 mov word ptr [r8 + 2*rdx], si 36185 add rdx, 1 36186 cmp rax, rdx 36187 jne .LBB4_1141 36188 jmp .LBB4_1655 36189 .LBB4_1142: 36190 xor esi, esi 36191 .LBB4_1143: 36192 test r9b, 1 36193 je .LBB4_1145 36194 # %bb.1144: 36195 movdqu xmm0, xmmword ptr [rcx + 4*rsi] 36196 movdqu xmm1, xmmword ptr [rcx + 4*rsi + 16] 36197 pxor xmm2, xmm2 36198 pcmpeqd xmm0, xmm2 36199 pcmpeqd xmm3, xmm3 36200 pxor xmm0, xmm3 36201 packssdw xmm0, xmm0 36202 movdqa xmm4, xmmword ptr [rip + .LCPI4_11] # xmm4 = <1,1,1,1,u,u,u,u> 36203 pand xmm0, xmm4 36204 pcmpeqd xmm1, xmm2 36205 pxor xmm1, xmm3 36206 packssdw xmm1, xmm1 36207 pand xmm1, xmm4 36208 punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] 36209 movdqu xmmword ptr [r8 + 2*rsi], xmm0 36210 .LBB4_1145: 36211 cmp rdx, rax 36212 je .LBB4_1655 36213 .LBB4_1146: # =>This Inner Loop Header: Depth=1 36214 xor esi, esi 36215 cmp dword ptr [rcx + 4*rdx], 0 36216 setne sil 36217 mov word ptr [r8 + 2*rdx], si 36218 add rdx, 1 36219 cmp rax, rdx 36220 jne .LBB4_1146 36221 jmp .LBB4_1655 36222 .LBB4_1147: 36223 xor edi, edi 36224 .LBB4_1148: 36225 test r9b, 1 36226 je .LBB4_1150 36227 # %bb.1149: 36228 movupd xmm2, xmmword ptr [rcx + 8*rdi] 36229 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 36230 xorpd xmm4, xmm4 36231 movapd xmm0, xmm2 
36232 cmpeqpd xmm0, xmm4 36233 packssdw xmm0, xmm0 36234 packssdw xmm0, xmm0 36235 movapd xmm1, xmm3 36236 cmpeqpd xmm1, xmm4 36237 packssdw xmm1, xmm1 36238 packssdw xmm1, xmm1 36239 movapd xmm5, xmmword ptr [rip + .LCPI4_0] # xmm5 = [-0.0E+0,-0.0E+0] 36240 andpd xmm2, xmm5 36241 movapd xmm6, xmmword ptr [rip + .LCPI4_1] # xmm6 = [1.0E+0,1.0E+0] 36242 orpd xmm2, xmm6 36243 andpd xmm3, xmm5 36244 orpd xmm3, xmm6 36245 cvttpd2dq xmm2, xmm2 36246 cvttpd2dq xmm3, xmm3 36247 pshuflw xmm2, xmm2, 232 # xmm2 = xmm2[0,2,2,3,4,5,6,7] 36248 pshuflw xmm3, xmm3, 232 # xmm3 = xmm3[0,2,2,3,4,5,6,7] 36249 pblendvb xmm2, xmm4, xmm0 36250 movdqa xmm0, xmm1 36251 pblendvb xmm3, xmm4, xmm0 36252 movd dword ptr [r8 + 2*rdi], xmm2 36253 movd dword ptr [r8 + 2*rdi + 4], xmm3 36254 .LBB4_1150: 36255 cmp rsi, rax 36256 je .LBB4_1655 36257 .LBB4_1151: 36258 pxor xmm0, xmm0 36259 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 36260 movsd xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero 36261 .LBB4_1152: # =>This Inner Loop Header: Depth=1 36262 movsd xmm3, qword ptr [rcx + 8*rsi] # xmm3 = mem[0],zero 36263 ucomisd xmm0, xmm3 36264 andpd xmm3, xmm1 36265 orpd xmm3, xmm2 36266 cvttsd2si edx, xmm3 36267 cmove edx, r10d 36268 mov word ptr [r8 + 2*rsi], dx 36269 add rsi, 1 36270 cmp rax, rsi 36271 jne .LBB4_1152 36272 jmp .LBB4_1655 36273 .LBB4_1153: 36274 xor edi, edi 36275 .LBB4_1154: 36276 test r9b, 1 36277 je .LBB4_1156 36278 # %bb.1155: 36279 movupd xmm2, xmmword ptr [rcx + 8*rdi] 36280 movupd xmm3, xmmword ptr [rcx + 8*rdi + 16] 36281 xorpd xmm4, xmm4 36282 movapd xmm0, xmm2 36283 cmpeqpd xmm0, xmm4 36284 packssdw xmm0, xmm0 36285 packssdw xmm0, xmm0 36286 movapd xmm1, xmm3 36287 cmpeqpd xmm1, xmm4 36288 packssdw xmm1, xmm1 36289 packssdw xmm1, xmm1 36290 movapd xmm5, xmmword ptr [rip + .LCPI4_0] # xmm5 = [-0.0E+0,-0.0E+0] 36291 andpd xmm2, xmm5 36292 movapd xmm6, xmmword ptr [rip + .LCPI4_1] # xmm6 = [1.0E+0,1.0E+0] 36293 orpd xmm2, xmm6 36294 andpd xmm3, xmm5 
36295 orpd xmm3, xmm6 36296 cvttpd2dq xmm2, xmm2 36297 cvttpd2dq xmm3, xmm3 36298 pshuflw xmm2, xmm2, 232 # xmm2 = xmm2[0,2,2,3,4,5,6,7] 36299 pshuflw xmm3, xmm3, 232 # xmm3 = xmm3[0,2,2,3,4,5,6,7] 36300 pblendvb xmm2, xmm4, xmm0 36301 movdqa xmm0, xmm1 36302 pblendvb xmm3, xmm4, xmm0 36303 movd dword ptr [r8 + 2*rdi], xmm2 36304 movd dword ptr [r8 + 2*rdi + 4], xmm3 36305 .LBB4_1156: 36306 cmp rsi, rax 36307 je .LBB4_1655 36308 .LBB4_1157: 36309 pxor xmm0, xmm0 36310 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 36311 movsd xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero 36312 .LBB4_1158: # =>This Inner Loop Header: Depth=1 36313 movsd xmm3, qword ptr [rcx + 8*rsi] # xmm3 = mem[0],zero 36314 ucomisd xmm0, xmm3 36315 andpd xmm3, xmm1 36316 orpd xmm3, xmm2 36317 cvttsd2si edx, xmm3 36318 cmove edx, r10d 36319 mov word ptr [r8 + 2*rsi], dx 36320 add rsi, 1 36321 cmp rax, rsi 36322 jne .LBB4_1158 36323 jmp .LBB4_1655 36324 .LBB4_1159: 36325 xor esi, esi 36326 .LBB4_1160: 36327 test r9b, 1 36328 je .LBB4_1162 36329 # %bb.1161: 36330 movdqu xmm2, xmmword ptr [rcx + 8*rsi] 36331 movdqu xmm3, xmmword ptr [rcx + 8*rsi + 16] 36332 pxor xmm4, xmm4 36333 movdqa xmm0, xmm2 36334 pcmpgtq xmm0, xmm4 36335 packssdw xmm0, xmm0 36336 packssdw xmm0, xmm0 36337 movdqa xmm1, xmm3 36338 pcmpgtq xmm1, xmm4 36339 packssdw xmm1, xmm1 36340 packssdw xmm1, xmm1 36341 pcmpeqq xmm2, xmm4 36342 pcmpeqd xmm5, xmm5 36343 pxor xmm2, xmm5 36344 packssdw xmm2, xmm2 36345 packssdw xmm2, xmm2 36346 pcmpeqq xmm3, xmm4 36347 pxor xmm3, xmm5 36348 packssdw xmm3, xmm3 36349 packssdw xmm3, xmm3 36350 movdqa xmm4, xmmword ptr [rip + .LCPI4_17] # xmm4 = <1,1,u,u,u,u,u,u> 36351 pblendvb xmm2, xmm4, xmm0 36352 movdqa xmm0, xmm1 36353 pblendvb xmm3, xmm4, xmm0 36354 movd dword ptr [r8 + 2*rsi], xmm2 36355 movd dword ptr [r8 + 2*rsi + 4], xmm3 36356 .LBB4_1162: 36357 cmp rdx, r10 36358 je .LBB4_1655 36359 .LBB4_1163: 36360 mov esi, 1 36361 .LBB4_1164: # =>This Inner Loop Header: 
Depth=1 36362 mov rdi, qword ptr [rcx + 8*rdx] 36363 xor eax, eax 36364 test rdi, rdi 36365 setne al 36366 neg eax 36367 test rdi, rdi 36368 cmovg eax, esi 36369 mov word ptr [r8 + 2*rdx], ax 36370 add rdx, 1 36371 cmp r10, rdx 36372 jne .LBB4_1164 36373 jmp .LBB4_1655 36374 .LBB4_1165: 36375 xor edi, edi 36376 .LBB4_1166: 36377 test r9b, 1 36378 je .LBB4_1168 36379 # %bb.1167: 36380 movups xmm0, xmmword ptr [rcx + 4*rdi] 36381 movups xmm1, xmmword ptr [rcx + 4*rdi + 16] 36382 xorps xmm4, xmm4 36383 movaps xmm2, xmm0 36384 cmpeqps xmm2, xmm4 36385 packssdw xmm2, xmm2 36386 movaps xmm3, xmm1 36387 cmpeqps xmm3, xmm4 36388 packssdw xmm3, xmm3 36389 pcmpeqd xmm5, xmm5 36390 pcmpgtd xmm0, xmm5 36391 packssdw xmm0, xmm0 36392 pcmpgtd xmm1, xmm5 36393 packssdw xmm1, xmm1 36394 movdqa xmm6, xmmword ptr [rip + .LCPI4_11] # xmm6 = <1,1,1,1,u,u,u,u> 36395 pcmpeqd xmm7, xmm7 36396 pblendvb xmm7, xmm6, xmm0 36397 movdqa xmm0, xmm1 36398 pblendvb xmm5, xmm6, xmm0 36399 movdqa xmm0, xmm2 36400 pblendvb xmm7, xmm4, xmm0 36401 movdqa xmm0, xmm3 36402 pblendvb xmm5, xmm4, xmm0 36403 punpcklqdq xmm7, xmm5 # xmm7 = xmm7[0],xmm5[0] 36404 movdqu xmmword ptr [r8 + 2*rdi], xmm7 36405 .LBB4_1168: 36406 cmp rsi, rax 36407 je .LBB4_1655 36408 .LBB4_1169: 36409 pxor xmm0, xmm0 36410 .LBB4_1170: # =>This Inner Loop Header: Depth=1 36411 movd xmm1, dword ptr [rcx + 4*rsi] # xmm1 = mem[0],zero,zero,zero 36412 movd edx, xmm1 36413 xor edi, edi 36414 test edx, edx 36415 setns dil 36416 ucomiss xmm0, xmm1 36417 lea edx, [rdi + rdi - 1] 36418 cmove edx, r10d 36419 mov word ptr [r8 + 2*rsi], dx 36420 add rsi, 1 36421 cmp rax, rsi 36422 jne .LBB4_1170 36423 jmp .LBB4_1655 36424 .LBB4_1171: 36425 xor edi, edi 36426 .LBB4_1172: 36427 test r9b, 1 36428 je .LBB4_1174 36429 # %bb.1173: 36430 movups xmm0, xmmword ptr [rcx + 4*rdi] 36431 movups xmm1, xmmword ptr [rcx + 4*rdi + 16] 36432 xorps xmm4, xmm4 36433 movaps xmm2, xmm0 36434 cmpeqps xmm2, xmm4 36435 packssdw xmm2, xmm2 36436 movaps xmm3, xmm1 36437 
cmpeqps xmm3, xmm4 36438 packssdw xmm3, xmm3 36439 pcmpeqd xmm5, xmm5 36440 pcmpgtd xmm0, xmm5 36441 packssdw xmm0, xmm0 36442 pcmpgtd xmm1, xmm5 36443 packssdw xmm1, xmm1 36444 movdqa xmm6, xmmword ptr [rip + .LCPI4_11] # xmm6 = <1,1,1,1,u,u,u,u> 36445 pcmpeqd xmm7, xmm7 36446 pblendvb xmm7, xmm6, xmm0 36447 movdqa xmm0, xmm1 36448 pblendvb xmm5, xmm6, xmm0 36449 movdqa xmm0, xmm2 36450 pblendvb xmm7, xmm4, xmm0 36451 movdqa xmm0, xmm3 36452 pblendvb xmm5, xmm4, xmm0 36453 punpcklqdq xmm7, xmm5 # xmm7 = xmm7[0],xmm5[0] 36454 movdqu xmmword ptr [r8 + 2*rdi], xmm7 36455 .LBB4_1174: 36456 cmp rsi, rax 36457 je .LBB4_1655 36458 .LBB4_1175: 36459 pxor xmm0, xmm0 36460 .LBB4_1176: # =>This Inner Loop Header: Depth=1 36461 movd xmm1, dword ptr [rcx + 4*rsi] # xmm1 = mem[0],zero,zero,zero 36462 movd edx, xmm1 36463 xor edi, edi 36464 test edx, edx 36465 setns dil 36466 ucomiss xmm0, xmm1 36467 lea edx, [rdi + rdi - 1] 36468 cmove edx, r10d 36469 mov word ptr [r8 + 2*rsi], dx 36470 add rsi, 1 36471 cmp rax, rsi 36472 jne .LBB4_1176 36473 jmp .LBB4_1655 36474 .LBB4_1177: 36475 xor esi, esi 36476 .LBB4_1178: 36477 test r9b, 1 36478 je .LBB4_1180 36479 # %bb.1179: 36480 movq xmm0, qword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero 36481 movq xmm1, qword ptr [rcx + 4*rsi + 8] # xmm1 = mem[0],zero 36482 pxor xmm2, xmm2 36483 pcmpeqd xmm0, xmm2 36484 pcmpeqd xmm3, xmm3 36485 pxor xmm0, xmm3 36486 pmovzxdq xmm0, xmm0 # xmm0 = xmm0[0],zero,xmm0[1],zero 36487 movdqa xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 36488 pand xmm0, xmm4 36489 pcmpeqd xmm1, xmm2 36490 pxor xmm1, xmm3 36491 pmovzxdq xmm1, xmm1 # xmm1 = xmm1[0],zero,xmm1[1],zero 36492 pand xmm1, xmm4 36493 movdqu xmmword ptr [r8 + 8*rsi], xmm0 36494 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm1 36495 .LBB4_1180: 36496 cmp rdx, rax 36497 je .LBB4_1655 36498 .LBB4_1181: # =>This Inner Loop Header: Depth=1 36499 xor esi, esi 36500 cmp dword ptr [rcx + 4*rdx], 0 36501 setne sil 36502 mov qword ptr [r8 + 8*rdx], rsi 36503 add rdx, 
1 36504 cmp rax, rdx 36505 jne .LBB4_1181 36506 jmp .LBB4_1655 36507 .LBB4_1182: 36508 xor esi, esi 36509 .LBB4_1183: 36510 test r9b, 1 36511 je .LBB4_1185 36512 # %bb.1184: 36513 movdqu xmm0, xmmword ptr [rcx + 4*rsi] 36514 movdqu xmm1, xmmword ptr [rcx + 4*rsi + 16] 36515 pxor xmm2, xmm2 36516 pcmpeqd xmm0, xmm2 36517 movdqa xmm3, xmmword ptr [rip + .LCPI4_19] # xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 36518 pandn xmm0, xmm3 36519 pcmpeqd xmm1, xmm2 36520 pandn xmm1, xmm3 36521 movdqu xmmword ptr [r8 + 4*rsi], xmm0 36522 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm1 36523 .LBB4_1185: 36524 cmp rdx, rax 36525 je .LBB4_1655 36526 .LBB4_1186: 36527 movd xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero 36528 jmp .LBB4_1188 36529 .LBB4_1187: # in Loop: Header=BB4_1188 Depth=1 36530 movd dword ptr [r8 + 4*rdx], xmm1 36531 add rdx, 1 36532 cmp rax, rdx 36533 je .LBB4_1655 36534 .LBB4_1188: # =>This Inner Loop Header: Depth=1 36535 cmp dword ptr [rcx + 4*rdx], 0 36536 movdqa xmm1, xmm0 36537 jne .LBB4_1187 36538 # %bb.1189: # in Loop: Header=BB4_1188 Depth=1 36539 pxor xmm1, xmm1 36540 jmp .LBB4_1187 36541 .LBB4_1190: 36542 xor esi, esi 36543 .LBB4_1191: 36544 test r9b, 1 36545 je .LBB4_1193 36546 # %bb.1192: 36547 movupd xmm0, xmmword ptr [rcx + 8*rsi] 36548 movupd xmm1, xmmword ptr [rcx + 8*rsi + 16] 36549 xorpd xmm2, xmm2 36550 movapd xmm3, xmmword ptr [rip + .LCPI4_0] # xmm3 = [-0.0E+0,-0.0E+0] 36551 movapd xmm4, xmm0 36552 andpd xmm4, xmm3 36553 movapd xmm5, xmmword ptr [rip + .LCPI4_1] # xmm5 = [1.0E+0,1.0E+0] 36554 orpd xmm4, xmm5 36555 andpd xmm3, xmm1 36556 orpd xmm3, xmm5 36557 cvttsd2si rdi, xmm4 36558 movq xmm5, rdi 36559 pshufd xmm4, xmm4, 238 # xmm4 = xmm4[2,3,2,3] 36560 cvttsd2si rdi, xmm4 36561 movq xmm4, rdi 36562 punpcklqdq xmm5, xmm4 # xmm5 = xmm5[0],xmm4[0] 36563 cvttsd2si rdi, xmm3 36564 movq xmm4, rdi 36565 pshufd xmm3, xmm3, 238 # xmm3 = xmm3[2,3,2,3] 36566 cvttsd2si rdi, xmm3 36567 movq xmm3, rdi 36568 punpcklqdq xmm4, xmm3 # xmm4 = 
xmm4[0],xmm3[0] 36569 cmpneqpd xmm0, xmm2 36570 andpd xmm0, xmm5 36571 cmpneqpd xmm1, xmm2 36572 andpd xmm1, xmm4 36573 movupd xmmword ptr [r8 + 8*rsi], xmm0 36574 movupd xmmword ptr [r8 + 8*rsi + 16], xmm1 36575 .LBB4_1193: 36576 cmp rdx, rax 36577 je .LBB4_1655 36578 .LBB4_1194: 36579 xor esi, esi 36580 xorpd xmm0, xmm0 36581 movapd xmm1, xmmword ptr [rip + .LCPI4_0] # xmm1 = [-0.0E+0,-0.0E+0] 36582 movsd xmm2, qword ptr [rip + .LCPI4_2] # xmm2 = mem[0],zero 36583 .LBB4_1195: # =>This Inner Loop Header: Depth=1 36584 movsd xmm3, qword ptr [rcx + 8*rdx] # xmm3 = mem[0],zero 36585 ucomisd xmm0, xmm3 36586 andpd xmm3, xmm1 36587 orpd xmm3, xmm2 36588 cvttsd2si rdi, xmm3 36589 cmove rdi, rsi 36590 mov qword ptr [r8 + 8*rdx], rdi 36591 add rdx, 1 36592 cmp rax, rdx 36593 jne .LBB4_1195 36594 jmp .LBB4_1655 36595 .LBB4_1196: 36596 xor esi, esi 36597 .LBB4_1197: 36598 test r9b, 1 36599 je .LBB4_1199 36600 # %bb.1198: 36601 movupd xmm2, xmmword ptr [rcx + 8*rsi] 36602 movupd xmm8, xmmword ptr [rcx + 8*rsi + 16] 36603 xorps xmm0, xmm0 36604 cvtsd2ss xmm3, xmm2 36605 cmpeqpd xmm2, xmm0 36606 shufps xmm2, xmm2, 232 # xmm2 = xmm2[0,2,2,3] 36607 cvtpd2ps xmm4, xmmword ptr [rip + .LCPI4_1] 36608 cmpeqpd xmm0, xmm8 36609 movsd xmm5, qword ptr [rcx + 8*rsi + 8] # xmm5 = mem[0],zero 36610 cvtsd2ss xmm5, xmm5 36611 shufps xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] 36612 movaps xmm6, xmmword ptr [rip + .LCPI4_3] # xmm6 = [NaN,NaN,NaN,NaN] 36613 movaps xmm7, xmm6 36614 andnps xmm7, xmm5 36615 movshdup xmm5, xmm4 # xmm5 = xmm4[1,1,3,3] 36616 andps xmm5, xmm6 36617 orps xmm7, xmm5 36618 movaps xmm1, xmm6 36619 andnps xmm1, xmm3 36620 andps xmm4, xmm6 36621 orps xmm1, xmm4 36622 unpcklps xmm1, xmm7 # xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] 36623 andnps xmm2, xmm1 36624 movsd xmm1, qword ptr [rcx + 8*rsi + 24] # xmm1 = mem[0],zero 36625 cvtsd2ss xmm1, xmm1 36626 movaps xmm3, xmm6 36627 andnps xmm3, xmm1 36628 orps xmm3, xmm5 36629 xorps xmm1, xmm1 36630 cvtsd2ss xmm1, xmm8 36631 andnps xmm6, 
xmm1 36632 orps xmm6, xmm4 36633 unpcklps xmm6, xmm3 # xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] 36634 andnps xmm0, xmm6 36635 movlhps xmm2, xmm0 # xmm2 = xmm2[0],xmm0[0] 36636 movups xmmword ptr [r8 + 4*rsi], xmm2 36637 .LBB4_1199: 36638 cmp rdx, rax 36639 je .LBB4_1655 36640 .LBB4_1200: 36641 xorps xmm0, xmm0 36642 movaps xmm1, xmmword ptr [rip + .LCPI4_4] # xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 36643 movss xmm2, dword ptr [rip + .LCPI4_5] # xmm2 = mem[0],zero,zero,zero 36644 jmp .LBB4_1202 36645 .LBB4_1201: # in Loop: Header=BB4_1202 Depth=1 36646 movss dword ptr [r8 + 4*rdx], xmm3 36647 add rdx, 1 36648 cmp rax, rdx 36649 je .LBB4_1655 36650 .LBB4_1202: # =>This Inner Loop Header: Depth=1 36651 movsd xmm4, qword ptr [rcx + 8*rdx] # xmm4 = mem[0],zero 36652 ucomisd xmm0, xmm4 36653 xorps xmm3, xmm3 36654 je .LBB4_1201 36655 # %bb.1203: # in Loop: Header=BB4_1202 Depth=1 36656 xorps xmm3, xmm3 36657 cvtsd2ss xmm3, xmm4 36658 andps xmm3, xmm1 36659 orps xmm3, xmm2 36660 jmp .LBB4_1201 36661 .LBB4_1204: 36662 xor esi, esi 36663 .LBB4_1205: 36664 test r9b, 1 36665 je .LBB4_1207 36666 # %bb.1206: 36667 movq xmm0, qword ptr [rcx + 2*rsi] # xmm0 = mem[0],zero 36668 movq xmm1, qword ptr [rcx + 2*rsi + 8] # xmm1 = mem[0],zero 36669 pxor xmm2, xmm2 36670 pcmpeqw xmm0, xmm2 36671 pcmpeqd xmm3, xmm3 36672 pxor xmm0, xmm3 36673 pmovzxwd xmm0, xmm0 # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 36674 movdqa xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 36675 pand xmm0, xmm4 36676 cvtdq2ps xmm0, xmm0 36677 pcmpeqw xmm1, xmm2 36678 pxor xmm1, xmm3 36679 pmovzxwd xmm1, xmm1 # xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 36680 pand xmm1, xmm4 36681 cvtdq2ps xmm1, xmm1 36682 movups xmmword ptr [r8 + 4*rsi], xmm0 36683 movups xmmword ptr [r8 + 4*rsi + 16], xmm1 36684 .LBB4_1207: 36685 cmp rdx, rax 36686 je .LBB4_1655 36687 .LBB4_1208: 36688 movd xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero 36689 jmp .LBB4_1210 36690 .LBB4_1209: 
# in Loop: Header=BB4_1210 Depth=1 36691 movd dword ptr [r8 + 4*rdx], xmm1 36692 add rdx, 1 36693 cmp rax, rdx 36694 je .LBB4_1655 36695 .LBB4_1210: # =>This Inner Loop Header: Depth=1 36696 cmp word ptr [rcx + 2*rdx], 0 36697 movdqa xmm1, xmm0 36698 jne .LBB4_1209 36699 # %bb.1211: # in Loop: Header=BB4_1210 Depth=1 36700 pxor xmm1, xmm1 36701 jmp .LBB4_1209 36702 .LBB4_1212: 36703 xor esi, esi 36704 .LBB4_1213: 36705 test r9b, 1 36706 je .LBB4_1215 36707 # %bb.1214: 36708 movd xmm2, dword ptr [rcx + 2*rsi] # xmm2 = mem[0],zero,zero,zero 36709 movd xmm3, dword ptr [rcx + 2*rsi + 4] # xmm3 = mem[0],zero,zero,zero 36710 xorpd xmm4, xmm4 36711 movdqa xmm0, xmm2 36712 pcmpgtw xmm0, xmm4 36713 pmovsxwq xmm0, xmm0 36714 movdqa xmm1, xmm3 36715 pcmpgtw xmm1, xmm4 36716 pmovsxwq xmm1, xmm1 36717 pcmpeqw xmm2, xmm4 36718 pcmpeqd xmm5, xmm5 36719 pxor xmm2, xmm5 36720 pmovsxwq xmm2, xmm2 36721 pcmpeqw xmm3, xmm4 36722 pxor xmm3, xmm5 36723 pmovsxwq xmm3, xmm3 36724 movapd xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 36725 blendvpd xmm2, xmm4, xmm0 36726 movdqa xmm0, xmm1 36727 blendvpd xmm3, xmm4, xmm0 36728 movupd xmmword ptr [r8 + 8*rsi], xmm2 36729 movupd xmmword ptr [r8 + 8*rsi + 16], xmm3 36730 .LBB4_1215: 36731 cmp rdx, r10 36732 je .LBB4_1655 36733 .LBB4_1216: 36734 mov esi, 1 36735 .LBB4_1217: # =>This Inner Loop Header: Depth=1 36736 movzx edi, word ptr [rcx + 2*rdx] 36737 xor eax, eax 36738 test di, di 36739 setne al 36740 neg rax 36741 test di, di 36742 cmovg rax, rsi 36743 mov qword ptr [r8 + 8*rdx], rax 36744 add rdx, 1 36745 cmp r10, rdx 36746 jne .LBB4_1217 36747 jmp .LBB4_1655 36748 .LBB4_1218: 36749 xor esi, esi 36750 .LBB4_1219: 36751 test r9b, 1 36752 je .LBB4_1221 36753 # %bb.1220: 36754 movq xmm2, qword ptr [rcx + 2*rsi] # xmm2 = mem[0],zero 36755 movq xmm3, qword ptr [rcx + 2*rsi + 8] # xmm3 = mem[0],zero 36756 xorps xmm4, xmm4 36757 movdqa xmm0, xmm2 36758 pcmpgtw xmm0, xmm4 36759 pmovsxwd xmm0, xmm0 36760 movdqa xmm1, xmm3 36761 pcmpgtw xmm1, 
xmm4 36762 pmovsxwd xmm1, xmm1 36763 pcmpeqw xmm2, xmm4 36764 pcmpeqd xmm5, xmm5 36765 pxor xmm2, xmm5 36766 pmovsxwd xmm2, xmm2 36767 cvtdq2ps xmm2, xmm2 36768 pcmpeqw xmm3, xmm4 36769 pxor xmm3, xmm5 36770 pmovsxwd xmm3, xmm3 36771 cvtdq2ps xmm3, xmm3 36772 movaps xmm4, xmmword ptr [rip + .LCPI4_19] # xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 36773 blendvps xmm2, xmm4, xmm0 36774 movdqa xmm0, xmm1 36775 blendvps xmm3, xmm4, xmm0 36776 movups xmmword ptr [r8 + 4*rsi], xmm2 36777 movups xmmword ptr [r8 + 4*rsi + 16], xmm3 36778 .LBB4_1221: 36779 cmp rdx, rax 36780 je .LBB4_1655 36781 .LBB4_1222: 36782 movd xmm0, dword ptr [rip + .LCPI4_14] # xmm0 = mem[0],zero,zero,zero 36783 movd xmm1, dword ptr [rip + .LCPI4_5] # xmm1 = mem[0],zero,zero,zero 36784 jmp .LBB4_1224 36785 .LBB4_1223: # in Loop: Header=BB4_1224 Depth=1 36786 movd dword ptr [r8 + 4*rdx], xmm3 36787 add rdx, 1 36788 cmp rax, rdx 36789 je .LBB4_1655 36790 .LBB4_1224: # =>This Inner Loop Header: Depth=1 36791 cmp word ptr [rcx + 2*rdx], 0 36792 movdqa xmm2, xmm0 36793 jne .LBB4_1226 36794 # %bb.1225: # in Loop: Header=BB4_1224 Depth=1 36795 pxor xmm2, xmm2 36796 .LBB4_1226: # in Loop: Header=BB4_1224 Depth=1 36797 movdqa xmm3, xmm1 36798 jg .LBB4_1223 36799 # %bb.1227: # in Loop: Header=BB4_1224 Depth=1 36800 movdqa xmm3, xmm2 36801 jmp .LBB4_1223 36802 .LBB4_1104: 36803 movmskps ecx, xmm0 36804 and ecx, 1 36805 neg ecx 36806 or ecx, 1 36807 xorps xmm0, xmm0 36808 cvtsi2ss xmm0, ecx 36809 cvttss2si rcx, xmm0 36810 .LBB4_1105: 36811 mov qword ptr [r8 + 8*rax], rcx 36812 .LBB4_1655: 36813 lea rsp, [rbp - 16] 36814 pop rbx 36815 pop r14 36816 pop rbp 36817 ret 36818 .LBB4_1228: 36819 xor esi, esi 36820 .LBB4_1229: 36821 test r9b, 1 36822 je .LBB4_1231 36823 # %bb.1230: 36824 movdqu xmm0, xmmword ptr [rcx + 4*rsi] 36825 movdqu xmm1, xmmword ptr [rcx + 4*rsi + 16] 36826 pxor xmm2, xmm2 36827 pcmpeqd xmm0, xmm2 36828 movdqa xmm3, xmmword ptr [rip + .LCPI4_8] # xmm3 = [1,1,1,1] 36829 pandn xmm0, xmm3 36830 pcmpeqd 
xmm1, xmm2 36831 pandn xmm1, xmm3 36832 movdqu xmmword ptr [r8 + 4*rsi], xmm0 36833 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm1 36834 .LBB4_1231: 36835 cmp rdx, r10 36836 je .LBB4_1655 36837 jmp .LBB4_1232 36838 .LBB4_1236: 36839 xor esi, esi 36840 .LBB4_1237: 36841 test r9b, 1 36842 je .LBB4_1239 36843 # %bb.1238: 36844 movd xmm2, dword ptr [rcx + rsi] # xmm2 = mem[0],zero,zero,zero 36845 movd xmm3, dword ptr [rcx + rsi + 4] # xmm3 = mem[0],zero,zero,zero 36846 xorps xmm4, xmm4 36847 movdqa xmm0, xmm2 36848 pcmpgtb xmm0, xmm4 36849 pmovsxbd xmm0, xmm0 36850 movdqa xmm1, xmm3 36851 pcmpgtb xmm1, xmm4 36852 pmovsxbd xmm1, xmm1 36853 pcmpeqb xmm2, xmm4 36854 pcmpeqd xmm5, xmm5 36855 pxor xmm2, xmm5 36856 pmovsxbd xmm2, xmm2 36857 pcmpeqb xmm3, xmm4 36858 pxor xmm3, xmm5 36859 pmovsxbd xmm3, xmm3 36860 movaps xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 36861 blendvps xmm2, xmm4, xmm0 36862 movdqa xmm0, xmm1 36863 blendvps xmm3, xmm4, xmm0 36864 movups xmmword ptr [r8 + 4*rsi], xmm2 36865 movups xmmword ptr [r8 + 4*rsi + 16], xmm3 36866 .LBB4_1239: 36867 cmp rdx, r10 36868 je .LBB4_1655 36869 jmp .LBB4_1240 36870 .LBB4_1245: 36871 xor esi, esi 36872 .LBB4_1246: 36873 test r9b, 1 36874 je .LBB4_1248 36875 # %bb.1247: 36876 movd xmm0, dword ptr [rcx + rsi] # xmm0 = mem[0],zero,zero,zero 36877 movd xmm1, dword ptr [rcx + rsi + 4] # xmm1 = mem[0],zero,zero,zero 36878 pxor xmm2, xmm2 36879 pcmpeqb xmm0, xmm2 36880 pcmpeqd xmm3, xmm3 36881 pxor xmm0, xmm3 36882 pmovzxbd xmm0, xmm0 # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 36883 movdqa xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 36884 pand xmm0, xmm4 36885 pcmpeqb xmm1, xmm2 36886 pxor xmm1, xmm3 36887 pmovzxbd xmm1, xmm1 # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 36888 pand xmm1, xmm4 36889 movdqu xmmword ptr [r8 + 4*rsi], xmm0 36890 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm1 36891 
.LBB4_1248: 36892 cmp rdx, r10 36893 je .LBB4_1655 36894 jmp .LBB4_1249 36895 .LBB4_1253: 36896 xor esi, esi 36897 .LBB4_1254: 36898 test r9b, 1 36899 je .LBB4_1256 36900 # %bb.1255: 36901 movdqu xmm1, xmmword ptr [rcx + 4*rsi] 36902 movdqu xmm2, xmmword ptr [rcx + 4*rsi + 16] 36903 pxor xmm3, xmm3 36904 movdqa xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 36905 movdqa xmm0, xmm4 36906 pcmpgtd xmm0, xmm1 36907 movdqa xmm5, xmm1 36908 pcmpeqd xmm5, xmm3 36909 pcmpeqd xmm1, xmm1 36910 pxor xmm5, xmm1 36911 pcmpeqd xmm3, xmm2 36912 pxor xmm3, xmm1 36913 movdqa xmm1, xmm4 36914 pcmpgtd xmm1, xmm2 36915 movdqa xmm2, xmm4 36916 blendvps xmm2, xmm5, xmm0 36917 movdqa xmm0, xmm1 36918 blendvps xmm4, xmm3, xmm0 36919 movups xmmword ptr [r8 + 4*rsi], xmm2 36920 movups xmmword ptr [r8 + 4*rsi + 16], xmm4 36921 .LBB4_1256: 36922 cmp rdx, r11 36923 je .LBB4_1655 36924 jmp .LBB4_1257 36925 .LBB4_1262: 36926 xor esi, esi 36927 .LBB4_1263: 36928 test r9b, 1 36929 je .LBB4_1265 36930 # %bb.1264: 36931 movupd xmm0, xmmword ptr [rcx + 8*rsi] 36932 movupd xmm1, xmmword ptr [rcx + 8*rsi + 16] 36933 xorpd xmm2, xmm2 36934 movapd xmm3, xmmword ptr [rip + .LCPI4_0] # xmm3 = [-0.0E+0,-0.0E+0] 36935 movapd xmm4, xmm0 36936 andpd xmm4, xmm3 36937 movapd xmm5, xmmword ptr [rip + .LCPI4_1] # xmm5 = [1.0E+0,1.0E+0] 36938 orpd xmm4, xmm5 36939 andpd xmm3, xmm1 36940 orpd xmm3, xmm5 36941 cmpneqpd xmm0, xmm2 36942 andpd xmm0, xmm4 36943 cmpneqpd xmm1, xmm2 36944 andpd xmm1, xmm3 36945 movupd xmmword ptr [r8 + 8*rsi], xmm0 36946 movupd xmmword ptr [r8 + 8*rsi + 16], xmm1 36947 .LBB4_1265: 36948 cmp rdx, rax 36949 je .LBB4_1655 36950 jmp .LBB4_1266 36951 .LBB4_1271: 36952 xor esi, esi 36953 .LBB4_1272: 36954 test r9b, 1 36955 je .LBB4_1274 36956 # %bb.1273: 36957 movdqu xmm0, xmmword ptr [rcx + 4*rsi] 36958 movdqu xmm1, xmmword ptr [rcx + 4*rsi + 16] 36959 pxor xmm2, xmm2 36960 pcmpeqd xmm0, xmm2 36961 pcmpeqd xmm3, xmm3 36962 pxor xmm0, xmm3 36963 packssdw xmm0, xmm0 36964 packsswb xmm0, 
xmm0 36965 movdqa xmm4, xmmword ptr [rip + .LCPI4_12] # xmm4 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u> 36966 pand xmm0, xmm4 36967 pcmpeqd xmm1, xmm2 36968 pxor xmm1, xmm3 36969 packssdw xmm1, xmm1 36970 packsswb xmm1, xmm1 36971 pand xmm1, xmm4 36972 movd dword ptr [r8 + rsi], xmm0 36973 movd dword ptr [r8 + rsi + 4], xmm1 36974 .LBB4_1274: 36975 cmp rdx, rax 36976 je .LBB4_1655 36977 jmp .LBB4_1275 36978 .LBB4_1279: 36979 xor esi, esi 36980 .LBB4_1280: 36981 test r9b, 1 36982 je .LBB4_1282 36983 # %bb.1281: 36984 movupd xmm3, xmmword ptr [rcx + 8*rsi] 36985 movupd xmm4, xmmword ptr [rcx + 8*rsi + 16] 36986 xorpd xmm2, xmm2 36987 movapd xmm0, xmm3 36988 cmpeqpd xmm0, xmm2 36989 packssdw xmm0, xmm0 36990 packssdw xmm0, xmm0 36991 packsswb xmm0, xmm0 36992 movapd xmm1, xmm4 36993 cmpeqpd xmm1, xmm2 36994 packssdw xmm1, xmm1 36995 packssdw xmm1, xmm1 36996 packsswb xmm1, xmm1 36997 movapd xmm5, xmmword ptr [rip + .LCPI4_0] # xmm5 = [-0.0E+0,-0.0E+0] 36998 andpd xmm3, xmm5 36999 movapd xmm6, xmmword ptr [rip + .LCPI4_1] # xmm6 = [1.0E+0,1.0E+0] 37000 orpd xmm3, xmm6 37001 andpd xmm4, xmm5 37002 orpd xmm4, xmm6 37003 cvttpd2dq xmm3, xmm3 37004 movdqa xmm5, xmmword ptr [rip + .LCPI4_7] # xmm5 = <0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 37005 pshufb xmm3, xmm5 37006 cvttpd2dq xmm4, xmm4 37007 pshufb xmm4, xmm5 37008 pblendvb xmm3, xmm2, xmm0 37009 movdqa xmm0, xmm1 37010 pblendvb xmm4, xmm2, xmm0 37011 pextrw word ptr [r8 + rsi], xmm3, 0 37012 pextrw word ptr [r8 + rsi + 2], xmm4, 0 37013 .LBB4_1282: 37014 cmp rdx, rax 37015 je .LBB4_1655 37016 jmp .LBB4_1283 37017 .LBB4_1288: 37018 xor eax, eax 37019 .LBB4_1289: 37020 test r9b, 1 37021 je .LBB4_1291 37022 # %bb.1290: 37023 movdqu xmm1, xmmword ptr [rcx + rax] 37024 movdqu xmm2, xmmword ptr [rcx + rax + 16] 37025 pxor xmm3, xmm3 37026 movdqa xmm4, xmmword ptr [rip + .LCPI4_22] # xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 37027 movdqa xmm0, xmm4 37028 pcmpgtb xmm0, xmm1 37029 movdqa xmm5, xmm1 37030 pcmpeqb xmm5, xmm3 37031 pcmpeqd xmm1, 
xmm1 37032 pxor xmm5, xmm1 37033 pcmpeqb xmm3, xmm2 37034 pxor xmm3, xmm1 37035 movdqa xmm1, xmm4 37036 pcmpgtb xmm1, xmm2 37037 movdqa xmm2, xmm4 37038 pblendvb xmm2, xmm5, xmm0 37039 movdqa xmm0, xmm1 37040 pblendvb xmm4, xmm3, xmm0 37041 movdqu xmmword ptr [r8 + rax], xmm2 37042 movdqu xmmword ptr [r8 + rax + 16], xmm4 37043 .LBB4_1291: 37044 cmp rsi, r10 37045 je .LBB4_1655 37046 jmp .LBB4_1292 37047 .LBB4_1297: 37048 xor esi, esi 37049 .LBB4_1298: 37050 test r9b, 1 37051 je .LBB4_1300 37052 # %bb.1299: 37053 movdqu xmm0, xmmword ptr [rcx + 8*rsi] 37054 movdqu xmm1, xmmword ptr [rcx + 8*rsi + 16] 37055 pxor xmm2, xmm2 37056 pcmpeqq xmm0, xmm2 37057 pcmpeqd xmm3, xmm3 37058 pxor xmm0, xmm3 37059 packssdw xmm0, xmm0 37060 packssdw xmm0, xmm0 37061 packsswb xmm0, xmm0 37062 movdqa xmm4, xmmword ptr [rip + .LCPI4_18] # xmm4 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 37063 pand xmm0, xmm4 37064 pcmpeqq xmm1, xmm2 37065 pxor xmm1, xmm3 37066 packssdw xmm1, xmm1 37067 packssdw xmm1, xmm1 37068 packsswb xmm1, xmm1 37069 pextrw word ptr [r8 + rsi], xmm0, 0 37070 pand xmm1, xmm4 37071 pextrw word ptr [r8 + rsi + 2], xmm1, 0 37072 .LBB4_1300: 37073 cmp rdx, rax 37074 je .LBB4_1655 37075 jmp .LBB4_1301 37076 .LBB4_1305: 37077 xor esi, esi 37078 .LBB4_1306: 37079 test r9b, 1 37080 je .LBB4_1308 37081 # %bb.1307: 37082 movdqu xmm0, xmmword ptr [rcx + 2*rsi] 37083 movdqu xmm1, xmmword ptr [rcx + 2*rsi + 16] 37084 pxor xmm2, xmm2 37085 pcmpeqw xmm0, xmm2 37086 pcmpeqd xmm3, xmm3 37087 pxor xmm0, xmm3 37088 packsswb xmm0, xmm0 37089 movdqa xmm4, xmmword ptr [rip + .LCPI4_21] # xmm4 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u> 37090 pand xmm0, xmm4 37091 pcmpeqw xmm1, xmm2 37092 pxor xmm1, xmm3 37093 packsswb xmm1, xmm1 37094 pand xmm1, xmm4 37095 punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] 37096 movdqu xmmword ptr [r8 + rsi], xmm0 37097 .LBB4_1308: 37098 cmp rdx, rax 37099 je .LBB4_1655 37100 jmp .LBB4_1309 37101 .LBB4_1313: 37102 xor eax, eax 37103 .LBB4_1314: 37104 test r9b, 1 37105 je 
.LBB4_1316 37106 # %bb.1315: 37107 movdqu xmm2, xmmword ptr [rcx + 2*rax] 37108 movdqu xmm3, xmmword ptr [rcx + 2*rax + 16] 37109 pxor xmm4, xmm4 37110 movdqa xmm0, xmm2 37111 pcmpgtw xmm0, xmm4 37112 packsswb xmm0, xmm0 37113 movdqa xmm1, xmm3 37114 pcmpgtw xmm1, xmm4 37115 packsswb xmm1, xmm1 37116 pcmpeqw xmm2, xmm4 37117 pcmpeqd xmm5, xmm5 37118 pxor xmm2, xmm5 37119 packsswb xmm2, xmm2 37120 pcmpeqw xmm3, xmm4 37121 pxor xmm3, xmm5 37122 packsswb xmm3, xmm3 37123 movdqa xmm4, xmmword ptr [rip + .LCPI4_21] # xmm4 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u> 37124 pblendvb xmm2, xmm4, xmm0 37125 movdqa xmm0, xmm1 37126 pblendvb xmm3, xmm4, xmm0 37127 punpcklqdq xmm2, xmm3 # xmm2 = xmm2[0],xmm3[0] 37128 movdqu xmmword ptr [r8 + rax], xmm2 37129 .LBB4_1316: 37130 cmp rsi, r10 37131 je .LBB4_1655 37132 jmp .LBB4_1317 37133 .LBB4_1322: 37134 xor eax, eax 37135 .LBB4_1323: 37136 test r9b, 1 37137 je .LBB4_1325 37138 # %bb.1324: 37139 movdqu xmm2, xmmword ptr [rcx + 8*rax] 37140 movdqu xmm3, xmmword ptr [rcx + 8*rax + 16] 37141 pxor xmm4, xmm4 37142 movdqa xmm0, xmm2 37143 pcmpgtq xmm0, xmm4 37144 packssdw xmm0, xmm0 37145 packssdw xmm0, xmm0 37146 packsswb xmm0, xmm0 37147 movdqa xmm1, xmm3 37148 pcmpgtq xmm1, xmm4 37149 packssdw xmm1, xmm1 37150 packssdw xmm1, xmm1 37151 packsswb xmm1, xmm1 37152 pcmpeqq xmm2, xmm4 37153 pcmpeqd xmm5, xmm5 37154 pxor xmm2, xmm5 37155 packssdw xmm2, xmm2 37156 packssdw xmm2, xmm2 37157 packsswb xmm2, xmm2 37158 pcmpeqq xmm3, xmm4 37159 pxor xmm3, xmm5 37160 packssdw xmm3, xmm3 37161 packssdw xmm3, xmm3 37162 packsswb xmm3, xmm3 37163 movdqa xmm4, xmmword ptr [rip + .LCPI4_18] # xmm4 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 37164 pblendvb xmm2, xmm4, xmm0 37165 movdqa xmm0, xmm1 37166 pblendvb xmm3, xmm4, xmm0 37167 pextrw word ptr [r8 + rax], xmm2, 0 37168 pextrw word ptr [r8 + rax + 2], xmm3, 0 37169 .LBB4_1325: 37170 cmp rsi, r10 37171 je .LBB4_1655 37172 jmp .LBB4_1326 37173 .LBB4_1331: 37174 xor esi, esi 37175 .LBB4_1332: 37176 test r9b, 1 
37177 je .LBB4_1334 37178 # %bb.1333: 37179 movups xmm0, xmmword ptr [rcx + 4*rsi] 37180 movups xmm1, xmmword ptr [rcx + 4*rsi + 16] 37181 xorps xmm4, xmm4 37182 movaps xmm2, xmm0 37183 cmpeqps xmm2, xmm4 37184 packssdw xmm2, xmm2 37185 packsswb xmm2, xmm2 37186 movaps xmm3, xmm1 37187 cmpeqps xmm3, xmm4 37188 packssdw xmm3, xmm3 37189 packsswb xmm3, xmm3 37190 pcmpeqd xmm5, xmm5 37191 pcmpgtd xmm0, xmm5 37192 packssdw xmm0, xmm0 37193 packsswb xmm0, xmm0 37194 pcmpgtd xmm1, xmm5 37195 packssdw xmm1, xmm1 37196 packsswb xmm1, xmm1 37197 movdqa xmm6, xmmword ptr [rip + .LCPI4_12] # xmm6 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u> 37198 pcmpeqd xmm7, xmm7 37199 pblendvb xmm7, xmm6, xmm0 37200 movdqa xmm0, xmm1 37201 pblendvb xmm5, xmm6, xmm0 37202 movdqa xmm0, xmm2 37203 pblendvb xmm7, xmm4, xmm0 37204 movdqa xmm0, xmm3 37205 pblendvb xmm5, xmm4, xmm0 37206 movd dword ptr [r8 + rsi], xmm7 37207 movd dword ptr [r8 + rsi + 4], xmm5 37208 .LBB4_1334: 37209 cmp rdx, r10 37210 je .LBB4_1655 37211 jmp .LBB4_1335 37212 .LBB4_1340: 37213 xor esi, esi 37214 .LBB4_1341: 37215 test r9b, 1 37216 je .LBB4_1343 37217 # %bb.1342: 37218 movdqu xmm0, xmmword ptr [rcx + rsi] 37219 movdqu xmm1, xmmword ptr [rcx + rsi + 16] 37220 pxor xmm2, xmm2 37221 pcmpeqb xmm0, xmm2 37222 movdqa xmm3, xmmword ptr [rip + .LCPI4_22] # xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 37223 pandn xmm0, xmm3 37224 pcmpeqb xmm1, xmm2 37225 pandn xmm1, xmm3 37226 movdqu xmmword ptr [r8 + rsi], xmm0 37227 movdqu xmmword ptr [r8 + rsi + 16], xmm1 37228 .LBB4_1343: 37229 cmp rdx, rax 37230 je .LBB4_1655 37231 jmp .LBB4_1344 37232 .LBB4_1348: 37233 xor eax, eax 37234 .LBB4_1349: 37235 test r9b, 1 37236 je .LBB4_1351 37237 # %bb.1350: 37238 movdqu xmm2, xmmword ptr [rcx + 4*rax] 37239 movdqu xmm3, xmmword ptr [rcx + 4*rax + 16] 37240 pxor xmm4, xmm4 37241 movdqa xmm0, xmm2 37242 pcmpgtd xmm0, xmm4 37243 packssdw xmm0, xmm0 37244 packsswb xmm0, xmm0 37245 movdqa xmm1, xmm3 37246 pcmpgtd xmm1, xmm4 37247 packssdw xmm1, xmm1 
37248 packsswb xmm1, xmm1 37249 pcmpeqd xmm2, xmm4 37250 pcmpeqd xmm5, xmm5 37251 pxor xmm2, xmm5 37252 packssdw xmm2, xmm2 37253 packsswb xmm2, xmm2 37254 pcmpeqd xmm3, xmm4 37255 pxor xmm3, xmm5 37256 packssdw xmm3, xmm3 37257 packsswb xmm3, xmm3 37258 movdqa xmm4, xmmword ptr [rip + .LCPI4_12] # xmm4 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u> 37259 pblendvb xmm2, xmm4, xmm0 37260 movdqa xmm0, xmm1 37261 pblendvb xmm3, xmm4, xmm0 37262 movd dword ptr [r8 + rax], xmm2 37263 movd dword ptr [r8 + rax + 4], xmm3 37264 .LBB4_1351: 37265 cmp rsi, r10 37266 je .LBB4_1655 37267 jmp .LBB4_1352 37268 .LBB4_1357: 37269 xor esi, esi 37270 .LBB4_1358: 37271 test r9b, 1 37272 je .LBB4_1360 37273 # %bb.1359: 37274 movzx eax, word ptr [rcx + rsi] 37275 movd xmm2, eax 37276 movzx eax, word ptr [rcx + rsi + 2] 37277 movd xmm3, eax 37278 xorpd xmm4, xmm4 37279 movdqa xmm0, xmm2 37280 pcmpgtb xmm0, xmm4 37281 pmovsxbq xmm0, xmm0 37282 movdqa xmm1, xmm3 37283 pcmpgtb xmm1, xmm4 37284 pmovsxbq xmm1, xmm1 37285 pcmpeqb xmm2, xmm4 37286 pcmpeqd xmm5, xmm5 37287 pxor xmm2, xmm5 37288 pmovsxbq xmm2, xmm2 37289 pcmpeqb xmm3, xmm4 37290 pxor xmm3, xmm5 37291 pmovsxbq xmm3, xmm3 37292 movapd xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 37293 blendvpd xmm2, xmm4, xmm0 37294 movdqa xmm0, xmm1 37295 blendvpd xmm3, xmm4, xmm0 37296 movupd xmmword ptr [r8 + 8*rsi], xmm2 37297 movupd xmmword ptr [r8 + 8*rsi + 16], xmm3 37298 .LBB4_1360: 37299 cmp rdx, r10 37300 je .LBB4_1655 37301 jmp .LBB4_1361 37302 .LBB4_1366: 37303 xor esi, esi 37304 .LBB4_1367: 37305 test r9b, 1 37306 je .LBB4_1369 37307 # %bb.1368: 37308 movdqu xmm0, xmmword ptr [rcx + 8*rsi] 37309 movdqu xmm1, xmmword ptr [rcx + 8*rsi + 16] 37310 pxor xmm2, xmm2 37311 pcmpeqq xmm0, xmm2 37312 movdqa xmm3, xmmword ptr [rip + .LCPI4_15] # xmm3 = [1,1] 37313 pandn xmm0, xmm3 37314 pcmpeqq xmm1, xmm2 37315 pandn xmm1, xmm3 37316 movdqu xmmword ptr [r8 + 8*rsi], xmm0 37317 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm1 37318 .LBB4_1369: 37319 cmp 
rdx, r10 37320 je .LBB4_1655 37321 jmp .LBB4_1370 37322 .LBB4_1374: 37323 xor esi, esi 37324 .LBB4_1375: 37325 test r9b, 1 37326 je .LBB4_1377 37327 # %bb.1376: 37328 movdqu xmm1, xmmword ptr [rcx + 8*rsi] 37329 movdqu xmm2, xmmword ptr [rcx + 8*rsi + 16] 37330 pxor xmm3, xmm3 37331 movdqa xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 37332 movdqa xmm0, xmm4 37333 pcmpgtq xmm0, xmm1 37334 movdqa xmm5, xmm1 37335 pcmpeqq xmm5, xmm3 37336 pcmpeqd xmm1, xmm1 37337 pxor xmm5, xmm1 37338 pcmpeqq xmm3, xmm2 37339 pxor xmm3, xmm1 37340 movdqa xmm1, xmm4 37341 pcmpgtq xmm1, xmm2 37342 movdqa xmm2, xmm4 37343 blendvpd xmm2, xmm5, xmm0 37344 movdqa xmm0, xmm1 37345 blendvpd xmm4, xmm3, xmm0 37346 movupd xmmword ptr [r8 + 8*rsi], xmm2 37347 movupd xmmword ptr [r8 + 8*rsi + 16], xmm4 37348 .LBB4_1377: 37349 cmp rdx, r11 37350 je .LBB4_1655 37351 jmp .LBB4_1378 37352 .LBB4_1383: 37353 xor esi, esi 37354 .LBB4_1384: 37355 test r9b, 1 37356 je .LBB4_1386 37357 # %bb.1385: 37358 movzx eax, word ptr [rcx + rsi] 37359 movd xmm0, eax 37360 movzx eax, word ptr [rcx + rsi + 2] 37361 movd xmm1, eax 37362 pxor xmm2, xmm2 37363 pcmpeqb xmm0, xmm2 37364 pcmpeqd xmm3, xmm3 37365 pxor xmm0, xmm3 37366 pmovzxbq xmm0, xmm0 # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 37367 movdqa xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 37368 pand xmm0, xmm4 37369 pcmpeqb xmm1, xmm2 37370 pxor xmm1, xmm3 37371 pmovzxbq xmm1, xmm1 # xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 37372 pand xmm1, xmm4 37373 movdqu xmmword ptr [r8 + 8*rsi], xmm0 37374 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm1 37375 .LBB4_1386: 37376 cmp rdx, r10 37377 je .LBB4_1655 37378 jmp .LBB4_1387 37379 .LBB4_1391: 37380 xor esi, esi 37381 .LBB4_1392: 37382 test r9b, 1 37383 je .LBB4_1394 37384 # %bb.1393: 37385 movq xmm2, qword ptr [rcx + rsi] # xmm2 = mem[0],zero 37386 movq xmm3, qword ptr [rcx + rsi + 8] # xmm3 = mem[0],zero 
37387 pxor xmm4, xmm4 37388 movdqa xmm0, xmm2 37389 pcmpgtb xmm0, xmm4 37390 pmovsxbw xmm0, xmm0 37391 movdqa xmm1, xmm3 37392 pcmpgtb xmm1, xmm4 37393 pmovsxbw xmm1, xmm1 37394 pcmpeqb xmm2, xmm4 37395 pcmpeqd xmm5, xmm5 37396 pxor xmm2, xmm5 37397 pmovsxbw xmm2, xmm2 37398 pcmpeqb xmm3, xmm4 37399 pxor xmm3, xmm5 37400 pmovsxbw xmm3, xmm3 37401 movdqa xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1] 37402 pblendvb xmm2, xmm4, xmm0 37403 movdqa xmm0, xmm1 37404 pblendvb xmm3, xmm4, xmm0 37405 movdqu xmmword ptr [r8 + 2*rsi], xmm2 37406 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm3 37407 .LBB4_1394: 37408 cmp rdx, r10 37409 je .LBB4_1655 37410 jmp .LBB4_1395 37411 .LBB4_1400: 37412 xor esi, esi 37413 .LBB4_1401: 37414 test r9b, 1 37415 je .LBB4_1403 37416 # %bb.1402: 37417 movq xmm2, qword ptr [rcx + rsi] # xmm2 = mem[0],zero 37418 movq xmm3, qword ptr [rcx + rsi + 8] # xmm3 = mem[0],zero 37419 pxor xmm4, xmm4 37420 movdqa xmm0, xmm2 37421 pcmpgtb xmm0, xmm4 37422 pmovsxbw xmm0, xmm0 37423 movdqa xmm1, xmm3 37424 pcmpgtb xmm1, xmm4 37425 pmovsxbw xmm1, xmm1 37426 pcmpeqb xmm2, xmm4 37427 pcmpeqd xmm5, xmm5 37428 pxor xmm2, xmm5 37429 pmovsxbw xmm2, xmm2 37430 pcmpeqb xmm3, xmm4 37431 pxor xmm3, xmm5 37432 pmovsxbw xmm3, xmm3 37433 movdqa xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1] 37434 pblendvb xmm2, xmm4, xmm0 37435 movdqa xmm0, xmm1 37436 pblendvb xmm3, xmm4, xmm0 37437 movdqu xmmword ptr [r8 + 2*rsi], xmm2 37438 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm3 37439 .LBB4_1403: 37440 cmp rdx, r10 37441 je .LBB4_1655 37442 jmp .LBB4_1404 37443 .LBB4_1409: 37444 xor esi, esi 37445 .LBB4_1410: 37446 test r9b, 1 37447 je .LBB4_1412 37448 # %bb.1411: 37449 movdqu xmm0, xmmword ptr [rcx + 2*rsi] 37450 movdqu xmm1, xmmword ptr [rcx + 2*rsi + 16] 37451 pxor xmm2, xmm2 37452 pcmpeqw xmm0, xmm2 37453 movdqa xmm3, xmmword ptr [rip + .LCPI4_20] # xmm3 = [1,1,1,1,1,1,1,1] 37454 pandn xmm0, xmm3 37455 pcmpeqw xmm1, xmm2 37456 pandn xmm1, xmm3 37457 
movdqu xmmword ptr [r8 + 2*rsi], xmm0 37458 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm1 37459 .LBB4_1412: 37460 cmp rdx, r10 37461 je .LBB4_1655 37462 jmp .LBB4_1413 37463 .LBB4_1417: 37464 xor esi, esi 37465 .LBB4_1418: 37466 test r9b, 1 37467 je .LBB4_1420 37468 # %bb.1419: 37469 movdqu xmm0, xmmword ptr [rcx + 2*rsi] 37470 movdqu xmm1, xmmword ptr [rcx + 2*rsi + 16] 37471 pxor xmm2, xmm2 37472 pcmpeqw xmm0, xmm2 37473 movdqa xmm3, xmmword ptr [rip + .LCPI4_20] # xmm3 = [1,1,1,1,1,1,1,1] 37474 pandn xmm0, xmm3 37475 pcmpeqw xmm1, xmm2 37476 pandn xmm1, xmm3 37477 movdqu xmmword ptr [r8 + 2*rsi], xmm0 37478 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm1 37479 .LBB4_1420: 37480 cmp rdx, r10 37481 je .LBB4_1655 37482 jmp .LBB4_1421 37483 .LBB4_1425: 37484 xor esi, esi 37485 .LBB4_1426: 37486 test r9b, 1 37487 je .LBB4_1428 37488 # %bb.1427: 37489 movdqu xmm1, xmmword ptr [rcx + 2*rsi] 37490 movdqu xmm2, xmmword ptr [rcx + 2*rsi + 16] 37491 pxor xmm3, xmm3 37492 movdqa xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1] 37493 movdqa xmm0, xmm4 37494 pcmpgtw xmm0, xmm1 37495 movdqa xmm5, xmm1 37496 pcmpeqw xmm5, xmm3 37497 pcmpeqd xmm1, xmm1 37498 pxor xmm5, xmm1 37499 pcmpeqw xmm3, xmm2 37500 pxor xmm3, xmm1 37501 movdqa xmm1, xmm4 37502 pcmpgtw xmm1, xmm2 37503 movdqa xmm2, xmm4 37504 pblendvb xmm2, xmm5, xmm0 37505 movdqa xmm0, xmm1 37506 pblendvb xmm4, xmm3, xmm0 37507 movdqu xmmword ptr [r8 + 2*rsi], xmm2 37508 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm4 37509 .LBB4_1428: 37510 cmp rdx, r11 37511 je .LBB4_1655 37512 jmp .LBB4_1429 37513 .LBB4_1434: 37514 xor esi, esi 37515 .LBB4_1435: 37516 test r9b, 1 37517 je .LBB4_1437 37518 # %bb.1436: 37519 movdqu xmm1, xmmword ptr [rcx + 2*rsi] 37520 movdqu xmm2, xmmword ptr [rcx + 2*rsi + 16] 37521 pxor xmm3, xmm3 37522 movdqa xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1] 37523 movdqa xmm0, xmm4 37524 pcmpgtw xmm0, xmm1 37525 movdqa xmm5, xmm1 37526 pcmpeqw xmm5, xmm3 37527 pcmpeqd xmm1, xmm1 37528 
pxor xmm5, xmm1 37529 pcmpeqw xmm3, xmm2 37530 pxor xmm3, xmm1 37531 movdqa xmm1, xmm4 37532 pcmpgtw xmm1, xmm2 37533 movdqa xmm2, xmm4 37534 pblendvb xmm2, xmm5, xmm0 37535 movdqa xmm0, xmm1 37536 pblendvb xmm4, xmm3, xmm0 37537 movdqu xmmword ptr [r8 + 2*rsi], xmm2 37538 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm4 37539 .LBB4_1437: 37540 cmp rdx, r11 37541 je .LBB4_1655 37542 jmp .LBB4_1438 37543 .LBB4_1443: 37544 xor esi, esi 37545 .LBB4_1444: 37546 test r9b, 1 37547 je .LBB4_1446 37548 # %bb.1445: 37549 movq xmm0, qword ptr [rcx + rsi] # xmm0 = mem[0],zero 37550 movq xmm1, qword ptr [rcx + rsi + 8] # xmm1 = mem[0],zero 37551 pxor xmm2, xmm2 37552 pcmpeqb xmm0, xmm2 37553 pcmpeqd xmm3, xmm3 37554 pxor xmm0, xmm3 37555 pmovzxbw xmm0, xmm0 # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 37556 movdqa xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1] 37557 pand xmm0, xmm4 37558 pcmpeqb xmm1, xmm2 37559 pxor xmm1, xmm3 37560 pmovzxbw xmm1, xmm1 # xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 37561 pand xmm1, xmm4 37562 movdqu xmmword ptr [r8 + 2*rsi], xmm0 37563 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm1 37564 .LBB4_1446: 37565 cmp rdx, r10 37566 je .LBB4_1655 37567 jmp .LBB4_1447 37568 .LBB4_1451: 37569 xor esi, esi 37570 .LBB4_1452: 37571 test r9b, 1 37572 je .LBB4_1454 37573 # %bb.1453: 37574 movq xmm0, qword ptr [rcx + rsi] # xmm0 = mem[0],zero 37575 movq xmm1, qword ptr [rcx + rsi + 8] # xmm1 = mem[0],zero 37576 pxor xmm2, xmm2 37577 pcmpeqb xmm0, xmm2 37578 pcmpeqd xmm3, xmm3 37579 pxor xmm0, xmm3 37580 pmovzxbw xmm0, xmm0 # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 37581 movdqa xmm4, xmmword ptr [rip + .LCPI4_20] # xmm4 = [1,1,1,1,1,1,1,1] 37582 pand xmm0, xmm4 37583 pcmpeqb xmm1, xmm2 37584 pxor xmm1, xmm3 37585 pmovzxbw xmm1, xmm1 # xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 37586 pand xmm1, xmm4 37587 movdqu xmmword ptr [r8 + 2*rsi], xmm0 37588 movdqu xmmword ptr [r8 + 2*rsi + 16], xmm1 37589 .LBB4_1454: 37590 cmp rdx, r10 37591 je .LBB4_1655 37592 jmp .LBB4_1455 37593 .LBB4_1459: 37594 xor esi, esi 37595 .LBB4_1460: 37596 test r9b, 1 37597 je .LBB4_1462 37598 # %bb.1461: 37599 movzx eax, word ptr [rcx + rsi] 37600 movd xmm2, eax 37601 movzx eax, word ptr [rcx + rsi + 2] 37602 movd xmm3, eax 37603 xorpd xmm4, xmm4 37604 movdqa xmm0, xmm2 37605 pcmpgtb xmm0, xmm4 37606 pmovsxbq xmm0, xmm0 37607 movdqa xmm1, xmm3 37608 pcmpgtb xmm1, xmm4 37609 pmovsxbq xmm1, xmm1 37610 pcmpeqb xmm2, xmm4 37611 pcmpeqd xmm5, xmm5 37612 pxor xmm2, xmm5 37613 pmovsxbq xmm2, xmm2 37614 pcmpeqb xmm3, xmm4 37615 pxor xmm3, xmm5 37616 pmovsxbq xmm3, xmm3 37617 movapd xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 37618 blendvpd xmm2, xmm4, xmm0 37619 movdqa xmm0, xmm1 37620 blendvpd xmm3, xmm4, xmm0 37621 movupd xmmword ptr [r8 + 8*rsi], xmm2 37622 movupd xmmword ptr [r8 + 8*rsi + 16], xmm3 37623 .LBB4_1462: 37624 cmp rdx, r10 37625 je .LBB4_1655 37626 jmp .LBB4_1463 37627 .LBB4_1468: 37628 xor esi, esi 37629 .LBB4_1469: 37630 test r9b, 1 37631 je .LBB4_1471 37632 # %bb.1470: 37633 movd xmm2, dword ptr [rcx + rsi] # xmm2 = mem[0],zero,zero,zero 37634 movd xmm3, dword ptr [rcx + rsi + 4] # xmm3 = mem[0],zero,zero,zero 37635 xorps xmm4, xmm4 37636 movdqa xmm0, xmm2 37637 pcmpgtb xmm0, xmm4 37638 pmovsxbd xmm0, xmm0 37639 movdqa xmm1, xmm3 37640 pcmpgtb xmm1, xmm4 37641 pmovsxbd xmm1, xmm1 37642 pcmpeqb xmm2, xmm4 37643 pcmpeqd xmm5, xmm5 37644 pxor xmm2, xmm5 37645 pmovsxbd xmm2, xmm2 37646 cvtdq2ps xmm2, xmm2 37647 pcmpeqb xmm3, xmm4 37648 pxor xmm3, xmm5 37649 pmovsxbd xmm3, xmm3 37650 cvtdq2ps xmm3, xmm3 37651 movaps xmm4, xmmword ptr [rip + .LCPI4_19] # xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 37652 blendvps xmm2, xmm4, xmm0 37653 movdqa xmm0, xmm1 37654 
blendvps xmm3, xmm4, xmm0 37655 movups xmmword ptr [r8 + 4*rsi], xmm2 37656 movups xmmword ptr [r8 + 4*rsi + 16], xmm3 37657 .LBB4_1471: 37658 cmp rdx, rax 37659 je .LBB4_1655 37660 jmp .LBB4_1472 37661 .LBB4_1490: 37662 xor esi, esi 37663 .LBB4_1491: 37664 test r9b, 1 37665 je .LBB4_1493 37666 # %bb.1492: 37667 movdqu xmm0, xmmword ptr [rcx + 8*rsi] 37668 movdqu xmm1, xmmword ptr [rcx + 8*rsi + 16] 37669 pxor xmm2, xmm2 37670 pcmpeqq xmm0, xmm2 37671 movdqa xmm3, xmmword ptr [rip + .LCPI4_15] # xmm3 = [1,1] 37672 pandn xmm0, xmm3 37673 pcmpeqq xmm1, xmm2 37674 pandn xmm1, xmm3 37675 movdqu xmmword ptr [r8 + 8*rsi], xmm0 37676 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm1 37677 .LBB4_1493: 37678 cmp rdx, r10 37679 je .LBB4_1655 37680 jmp .LBB4_1494 37681 .LBB4_1498: 37682 xor esi, esi 37683 .LBB4_1499: 37684 test r9b, 1 37685 je .LBB4_1501 37686 # %bb.1500: 37687 movdqu xmm1, xmmword ptr [rcx + 8*rsi] 37688 movdqu xmm2, xmmword ptr [rcx + 8*rsi + 16] 37689 pxor xmm3, xmm3 37690 movdqa xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 37691 movdqa xmm0, xmm4 37692 pcmpgtq xmm0, xmm1 37693 movdqa xmm5, xmm1 37694 pcmpeqq xmm5, xmm3 37695 pcmpeqd xmm1, xmm1 37696 pxor xmm5, xmm1 37697 pcmpeqq xmm3, xmm2 37698 pxor xmm3, xmm1 37699 movdqa xmm1, xmm4 37700 pcmpgtq xmm1, xmm2 37701 movdqa xmm2, xmm4 37702 blendvpd xmm2, xmm5, xmm0 37703 movdqa xmm0, xmm1 37704 blendvpd xmm4, xmm3, xmm0 37705 movupd xmmword ptr [r8 + 8*rsi], xmm2 37706 movupd xmmword ptr [r8 + 8*rsi + 16], xmm4 37707 .LBB4_1501: 37708 cmp rdx, r11 37709 je .LBB4_1655 37710 jmp .LBB4_1502 37711 .LBB4_1507: 37712 xor esi, esi 37713 .LBB4_1508: 37714 test r9b, 1 37715 je .LBB4_1510 37716 # %bb.1509: 37717 movzx eax, word ptr [rcx + rsi] 37718 movd xmm0, eax 37719 movzx eax, word ptr [rcx + rsi + 2] 37720 movd xmm1, eax 37721 pxor xmm2, xmm2 37722 pcmpeqb xmm0, xmm2 37723 pcmpeqd xmm3, xmm3 37724 pxor xmm0, xmm3 37725 pmovzxbq xmm0, xmm0 # xmm0 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 37726 movdqa xmm4, xmmword ptr [rip + .LCPI4_15] # xmm4 = [1,1] 37727 pand xmm0, xmm4 37728 pcmpeqb xmm1, xmm2 37729 pxor xmm1, xmm3 37730 pmovzxbq xmm1, xmm1 # xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 37731 pand xmm1, xmm4 37732 movdqu xmmword ptr [r8 + 8*rsi], xmm0 37733 movdqu xmmword ptr [r8 + 8*rsi + 16], xmm1 37734 .LBB4_1510: 37735 cmp rdx, r10 37736 je .LBB4_1655 37737 jmp .LBB4_1511 37738 .LBB4_1515: 37739 xor esi, esi 37740 .LBB4_1516: 37741 test r9b, 1 37742 je .LBB4_1518 37743 # %bb.1517: 37744 movd xmm0, dword ptr [rcx + rsi] # xmm0 = mem[0],zero,zero,zero 37745 movd xmm1, dword ptr [rcx + rsi + 4] # xmm1 = mem[0],zero,zero,zero 37746 pxor xmm2, xmm2 37747 pcmpeqb xmm0, xmm2 37748 pcmpeqd xmm3, xmm3 37749 pxor xmm0, xmm3 37750 pmovzxbd xmm0, xmm0 # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 37751 movdqa xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 37752 pand xmm0, xmm4 37753 cvtdq2ps xmm0, xmm0 37754 pcmpeqb xmm1, xmm2 37755 pxor xmm1, xmm3 37756 pmovzxbd xmm1, xmm1 # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 37757 pand xmm1, xmm4 37758 cvtdq2ps xmm1, xmm1 37759 movups xmmword ptr [r8 + 4*rsi], xmm0 37760 movups xmmword ptr [r8 + 4*rsi + 16], xmm1 37761 .LBB4_1518: 37762 cmp rdx, rax 37763 je .LBB4_1655 37764 jmp .LBB4_1519 37765 .LBB4_1535: 37766 xor esi, esi 37767 .LBB4_1536: 37768 test r9b, 1 37769 je .LBB4_1538 37770 # %bb.1537: 37771 movdqu xmm0, xmmword ptr [rcx + 4*rsi] 37772 movdqu xmm1, xmmword ptr [rcx + 4*rsi + 16] 37773 pxor xmm2, xmm2 37774 pcmpeqd xmm0, xmm2 37775 pcmpeqd xmm3, xmm3 37776 pxor xmm0, xmm3 37777 packssdw xmm0, xmm0 37778 packsswb xmm0, xmm0 37779 movdqa xmm4, xmmword ptr [rip + .LCPI4_12] # xmm4 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u> 37780 pand xmm0, xmm4 37781 pcmpeqd 
xmm1, xmm2 37782 pxor xmm1, xmm3 37783 packssdw xmm1, xmm1 37784 packsswb xmm1, xmm1 37785 pand xmm1, xmm4 37786 movd dword ptr [r8 + rsi], xmm0 37787 movd dword ptr [r8 + rsi + 4], xmm1 37788 .LBB4_1538: 37789 cmp rdx, rax 37790 je .LBB4_1655 37791 jmp .LBB4_1539 37792 .LBB4_1543: 37793 xor esi, esi 37794 .LBB4_1544: 37795 test r9b, 1 37796 je .LBB4_1546 37797 # %bb.1545: 37798 movupd xmm3, xmmword ptr [rcx + 8*rsi] 37799 movupd xmm4, xmmword ptr [rcx + 8*rsi + 16] 37800 xorpd xmm2, xmm2 37801 movapd xmm0, xmm3 37802 cmpeqpd xmm0, xmm2 37803 packssdw xmm0, xmm0 37804 packssdw xmm0, xmm0 37805 packsswb xmm0, xmm0 37806 movapd xmm1, xmm4 37807 cmpeqpd xmm1, xmm2 37808 packssdw xmm1, xmm1 37809 packssdw xmm1, xmm1 37810 packsswb xmm1, xmm1 37811 movapd xmm5, xmmword ptr [rip + .LCPI4_0] # xmm5 = [-0.0E+0,-0.0E+0] 37812 andpd xmm3, xmm5 37813 movapd xmm6, xmmword ptr [rip + .LCPI4_1] # xmm6 = [1.0E+0,1.0E+0] 37814 orpd xmm3, xmm6 37815 andpd xmm4, xmm5 37816 orpd xmm4, xmm6 37817 cvttpd2dq xmm3, xmm3 37818 movdqa xmm5, xmmword ptr [rip + .LCPI4_7] # xmm5 = <0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 37819 pshufb xmm3, xmm5 37820 cvttpd2dq xmm4, xmm4 37821 pshufb xmm4, xmm5 37822 pblendvb xmm3, xmm2, xmm0 37823 movdqa xmm0, xmm1 37824 pblendvb xmm4, xmm2, xmm0 37825 pextrw word ptr [r8 + rsi], xmm3, 0 37826 pextrw word ptr [r8 + rsi + 2], xmm4, 0 37827 .LBB4_1546: 37828 cmp rdx, rax 37829 je .LBB4_1655 37830 jmp .LBB4_1547 37831 .LBB4_1552: 37832 xor eax, eax 37833 .LBB4_1553: 37834 test r9b, 1 37835 je .LBB4_1555 37836 # %bb.1554: 37837 movdqu xmm1, xmmword ptr [rcx + rax] 37838 movdqu xmm2, xmmword ptr [rcx + rax + 16] 37839 pxor xmm3, xmm3 37840 movdqa xmm4, xmmword ptr [rip + .LCPI4_22] # xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 37841 movdqa xmm0, xmm4 37842 pcmpgtb xmm0, xmm1 37843 movdqa xmm5, xmm1 37844 pcmpeqb xmm5, xmm3 37845 pcmpeqd xmm1, xmm1 37846 pxor xmm5, xmm1 37847 pcmpeqb xmm3, xmm2 37848 pxor xmm3, xmm1 37849 movdqa xmm1, xmm4 37850 pcmpgtb xmm1, xmm2 37851 
movdqa xmm2, xmm4 37852 pblendvb xmm2, xmm5, xmm0 37853 movdqa xmm0, xmm1 37854 pblendvb xmm4, xmm3, xmm0 37855 movdqu xmmword ptr [r8 + rax], xmm2 37856 movdqu xmmword ptr [r8 + rax + 16], xmm4 37857 .LBB4_1555: 37858 cmp rsi, r10 37859 je .LBB4_1655 37860 jmp .LBB4_1556 37861 .LBB4_1561: 37862 xor esi, esi 37863 .LBB4_1562: 37864 test r9b, 1 37865 je .LBB4_1564 37866 # %bb.1563: 37867 movdqu xmm0, xmmword ptr [rcx + 8*rsi] 37868 movdqu xmm1, xmmword ptr [rcx + 8*rsi + 16] 37869 pxor xmm2, xmm2 37870 pcmpeqq xmm0, xmm2 37871 pcmpeqd xmm3, xmm3 37872 pxor xmm0, xmm3 37873 packssdw xmm0, xmm0 37874 packssdw xmm0, xmm0 37875 packsswb xmm0, xmm0 37876 movdqa xmm4, xmmword ptr [rip + .LCPI4_18] # xmm4 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 37877 pand xmm0, xmm4 37878 pcmpeqq xmm1, xmm2 37879 pxor xmm1, xmm3 37880 packssdw xmm1, xmm1 37881 packssdw xmm1, xmm1 37882 packsswb xmm1, xmm1 37883 pextrw word ptr [r8 + rsi], xmm0, 0 37884 pand xmm1, xmm4 37885 pextrw word ptr [r8 + rsi + 2], xmm1, 0 37886 .LBB4_1564: 37887 cmp rdx, rax 37888 je .LBB4_1655 37889 jmp .LBB4_1565 37890 .LBB4_1569: 37891 xor esi, esi 37892 .LBB4_1570: 37893 test r9b, 1 37894 je .LBB4_1572 37895 # %bb.1571: 37896 movdqu xmm0, xmmword ptr [rcx + 2*rsi] 37897 movdqu xmm1, xmmword ptr [rcx + 2*rsi + 16] 37898 pxor xmm2, xmm2 37899 pcmpeqw xmm0, xmm2 37900 pcmpeqd xmm3, xmm3 37901 pxor xmm0, xmm3 37902 packsswb xmm0, xmm0 37903 movdqa xmm4, xmmword ptr [rip + .LCPI4_21] # xmm4 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u> 37904 pand xmm0, xmm4 37905 pcmpeqw xmm1, xmm2 37906 pxor xmm1, xmm3 37907 packsswb xmm1, xmm1 37908 pand xmm1, xmm4 37909 punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] 37910 movdqu xmmword ptr [r8 + rsi], xmm0 37911 .LBB4_1572: 37912 cmp rdx, rax 37913 je .LBB4_1655 37914 jmp .LBB4_1573 37915 .LBB4_1577: 37916 xor eax, eax 37917 .LBB4_1578: 37918 test r9b, 1 37919 je .LBB4_1580 37920 # %bb.1579: 37921 movdqu xmm2, xmmword ptr [rcx + 2*rax] 37922 movdqu xmm3, xmmword ptr [rcx + 2*rax + 16] 37923 
pxor xmm4, xmm4 37924 movdqa xmm0, xmm2 37925 pcmpgtw xmm0, xmm4 37926 packsswb xmm0, xmm0 37927 movdqa xmm1, xmm3 37928 pcmpgtw xmm1, xmm4 37929 packsswb xmm1, xmm1 37930 pcmpeqw xmm2, xmm4 37931 pcmpeqd xmm5, xmm5 37932 pxor xmm2, xmm5 37933 packsswb xmm2, xmm2 37934 pcmpeqw xmm3, xmm4 37935 pxor xmm3, xmm5 37936 packsswb xmm3, xmm3 37937 movdqa xmm4, xmmword ptr [rip + .LCPI4_21] # xmm4 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u> 37938 pblendvb xmm2, xmm4, xmm0 37939 movdqa xmm0, xmm1 37940 pblendvb xmm3, xmm4, xmm0 37941 punpcklqdq xmm2, xmm3 # xmm2 = xmm2[0],xmm3[0] 37942 movdqu xmmword ptr [r8 + rax], xmm2 37943 .LBB4_1580: 37944 cmp rsi, r10 37945 je .LBB4_1655 37946 jmp .LBB4_1581 37947 .LBB4_1586: 37948 xor eax, eax 37949 .LBB4_1587: 37950 test r9b, 1 37951 je .LBB4_1589 37952 # %bb.1588: 37953 movdqu xmm2, xmmword ptr [rcx + 8*rax] 37954 movdqu xmm3, xmmword ptr [rcx + 8*rax + 16] 37955 pxor xmm4, xmm4 37956 movdqa xmm0, xmm2 37957 pcmpgtq xmm0, xmm4 37958 packssdw xmm0, xmm0 37959 packssdw xmm0, xmm0 37960 packsswb xmm0, xmm0 37961 movdqa xmm1, xmm3 37962 pcmpgtq xmm1, xmm4 37963 packssdw xmm1, xmm1 37964 packssdw xmm1, xmm1 37965 packsswb xmm1, xmm1 37966 pcmpeqq xmm2, xmm4 37967 pcmpeqd xmm5, xmm5 37968 pxor xmm2, xmm5 37969 packssdw xmm2, xmm2 37970 packssdw xmm2, xmm2 37971 packsswb xmm2, xmm2 37972 pcmpeqq xmm3, xmm4 37973 pxor xmm3, xmm5 37974 packssdw xmm3, xmm3 37975 packssdw xmm3, xmm3 37976 packsswb xmm3, xmm3 37977 movdqa xmm4, xmmword ptr [rip + .LCPI4_18] # xmm4 = <1,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 37978 pblendvb xmm2, xmm4, xmm0 37979 movdqa xmm0, xmm1 37980 pblendvb xmm3, xmm4, xmm0 37981 pextrw word ptr [r8 + rax], xmm2, 0 37982 pextrw word ptr [r8 + rax + 2], xmm3, 0 37983 .LBB4_1589: 37984 cmp rsi, r10 37985 je .LBB4_1655 37986 jmp .LBB4_1590 37987 .LBB4_1595: 37988 xor esi, esi 37989 .LBB4_1596: 37990 test r9b, 1 37991 je .LBB4_1598 37992 # %bb.1597: 37993 movups xmm0, xmmword ptr [rcx + 4*rsi] 37994 movups xmm1, xmmword ptr [rcx + 4*rsi + 16] 
37995 xorps xmm4, xmm4 37996 movaps xmm2, xmm0 37997 cmpeqps xmm2, xmm4 37998 packssdw xmm2, xmm2 37999 packsswb xmm2, xmm2 38000 movaps xmm3, xmm1 38001 cmpeqps xmm3, xmm4 38002 packssdw xmm3, xmm3 38003 packsswb xmm3, xmm3 38004 pcmpeqd xmm5, xmm5 38005 pcmpgtd xmm0, xmm5 38006 packssdw xmm0, xmm0 38007 packsswb xmm0, xmm0 38008 pcmpgtd xmm1, xmm5 38009 packssdw xmm1, xmm1 38010 packsswb xmm1, xmm1 38011 movdqa xmm6, xmmword ptr [rip + .LCPI4_12] # xmm6 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u> 38012 pcmpeqd xmm7, xmm7 38013 pblendvb xmm7, xmm6, xmm0 38014 movdqa xmm0, xmm1 38015 pblendvb xmm5, xmm6, xmm0 38016 movdqa xmm0, xmm2 38017 pblendvb xmm7, xmm4, xmm0 38018 movdqa xmm0, xmm3 38019 pblendvb xmm5, xmm4, xmm0 38020 movd dword ptr [r8 + rsi], xmm7 38021 movd dword ptr [r8 + rsi + 4], xmm5 38022 .LBB4_1598: 38023 cmp rdx, r10 38024 je .LBB4_1655 38025 jmp .LBB4_1599 38026 .LBB4_1604: 38027 xor esi, esi 38028 .LBB4_1605: 38029 test r9b, 1 38030 je .LBB4_1607 38031 # %bb.1606: 38032 movdqu xmm0, xmmword ptr [rcx + rsi] 38033 movdqu xmm1, xmmword ptr [rcx + rsi + 16] 38034 pxor xmm2, xmm2 38035 pcmpeqb xmm0, xmm2 38036 movdqa xmm3, xmmword ptr [rip + .LCPI4_22] # xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 38037 pandn xmm0, xmm3 38038 pcmpeqb xmm1, xmm2 38039 pandn xmm1, xmm3 38040 movdqu xmmword ptr [r8 + rsi], xmm0 38041 movdqu xmmword ptr [r8 + rsi + 16], xmm1 38042 .LBB4_1607: 38043 cmp rdx, rax 38044 je .LBB4_1655 38045 jmp .LBB4_1608 38046 .LBB4_1612: 38047 xor eax, eax 38048 .LBB4_1613: 38049 test r9b, 1 38050 je .LBB4_1615 38051 # %bb.1614: 38052 movdqu xmm2, xmmword ptr [rcx + 4*rax] 38053 movdqu xmm3, xmmword ptr [rcx + 4*rax + 16] 38054 pxor xmm4, xmm4 38055 movdqa xmm0, xmm2 38056 pcmpgtd xmm0, xmm4 38057 packssdw xmm0, xmm0 38058 packsswb xmm0, xmm0 38059 movdqa xmm1, xmm3 38060 pcmpgtd xmm1, xmm4 38061 packssdw xmm1, xmm1 38062 packsswb xmm1, xmm1 38063 pcmpeqd xmm2, xmm4 38064 pcmpeqd xmm5, xmm5 38065 pxor xmm2, xmm5 38066 packssdw xmm2, xmm2 38067 
packsswb xmm2, xmm2 38068 pcmpeqd xmm3, xmm4 38069 pxor xmm3, xmm5 38070 packssdw xmm3, xmm3 38071 packsswb xmm3, xmm3 38072 movdqa xmm4, xmmword ptr [rip + .LCPI4_12] # xmm4 = <1,1,1,1,u,u,u,u,u,u,u,u,u,u,u,u> 38073 pblendvb xmm2, xmm4, xmm0 38074 movdqa xmm0, xmm1 38075 pblendvb xmm3, xmm4, xmm0 38076 movd dword ptr [r8 + rax], xmm2 38077 movd dword ptr [r8 + rax + 4], xmm3 38078 .LBB4_1615: 38079 cmp rsi, r10 38080 je .LBB4_1655 38081 jmp .LBB4_1616 38082 .LBB4_1621: 38083 xor esi, esi 38084 .LBB4_1622: 38085 test r9b, 1 38086 je .LBB4_1624 38087 # %bb.1623: 38088 movdqu xmm0, xmmword ptr [rcx + 4*rsi] 38089 movdqu xmm1, xmmword ptr [rcx + 4*rsi + 16] 38090 pxor xmm2, xmm2 38091 pcmpeqd xmm0, xmm2 38092 movdqa xmm3, xmmword ptr [rip + .LCPI4_8] # xmm3 = [1,1,1,1] 38093 pandn xmm0, xmm3 38094 pcmpeqd xmm1, xmm2 38095 pandn xmm1, xmm3 38096 movdqu xmmword ptr [r8 + 4*rsi], xmm0 38097 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm1 38098 .LBB4_1624: 38099 cmp rdx, r10 38100 je .LBB4_1655 38101 jmp .LBB4_1625 38102 .LBB4_1629: 38103 xor esi, esi 38104 .LBB4_1630: 38105 test r9b, 1 38106 je .LBB4_1632 38107 # %bb.1631: 38108 movd xmm2, dword ptr [rcx + rsi] # xmm2 = mem[0],zero,zero,zero 38109 movd xmm3, dword ptr [rcx + rsi + 4] # xmm3 = mem[0],zero,zero,zero 38110 xorps xmm4, xmm4 38111 movdqa xmm0, xmm2 38112 pcmpgtb xmm0, xmm4 38113 pmovsxbd xmm0, xmm0 38114 movdqa xmm1, xmm3 38115 pcmpgtb xmm1, xmm4 38116 pmovsxbd xmm1, xmm1 38117 pcmpeqb xmm2, xmm4 38118 pcmpeqd xmm5, xmm5 38119 pxor xmm2, xmm5 38120 pmovsxbd xmm2, xmm2 38121 pcmpeqb xmm3, xmm4 38122 pxor xmm3, xmm5 38123 pmovsxbd xmm3, xmm3 38124 movaps xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 38125 blendvps xmm2, xmm4, xmm0 38126 movdqa xmm0, xmm1 38127 blendvps xmm3, xmm4, xmm0 38128 movups xmmword ptr [r8 + 4*rsi], xmm2 38129 movups xmmword ptr [r8 + 4*rsi + 16], xmm3 38130 .LBB4_1632: 38131 cmp rdx, r10 38132 je .LBB4_1655 38133 jmp .LBB4_1633 38134 .LBB4_1638: 38135 xor esi, esi 38136 .LBB4_1639: 
38137 test r9b, 1 38138 je .LBB4_1641 38139 # %bb.1640: 38140 movd xmm0, dword ptr [rcx + rsi] # xmm0 = mem[0],zero,zero,zero 38141 movd xmm1, dword ptr [rcx + rsi + 4] # xmm1 = mem[0],zero,zero,zero 38142 pxor xmm2, xmm2 38143 pcmpeqb xmm0, xmm2 38144 pcmpeqd xmm3, xmm3 38145 pxor xmm0, xmm3 38146 pmovzxbd xmm0, xmm0 # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 38147 movdqa xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 38148 pand xmm0, xmm4 38149 pcmpeqb xmm1, xmm2 38150 pxor xmm1, xmm3 38151 pmovzxbd xmm1, xmm1 # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 38152 pand xmm1, xmm4 38153 movdqu xmmword ptr [r8 + 4*rsi], xmm0 38154 movdqu xmmword ptr [r8 + 4*rsi + 16], xmm1 38155 .LBB4_1641: 38156 cmp rdx, r10 38157 je .LBB4_1655 38158 jmp .LBB4_1642 38159 .LBB4_1646: 38160 xor esi, esi 38161 .LBB4_1647: 38162 test r9b, 1 38163 je .LBB4_1649 38164 # %bb.1648: 38165 movdqu xmm1, xmmword ptr [rcx + 4*rsi] 38166 movdqu xmm2, xmmword ptr [rcx + 4*rsi + 16] 38167 pxor xmm3, xmm3 38168 movdqa xmm4, xmmword ptr [rip + .LCPI4_8] # xmm4 = [1,1,1,1] 38169 movdqa xmm0, xmm4 38170 pcmpgtd xmm0, xmm1 38171 movdqa xmm5, xmm1 38172 pcmpeqd xmm5, xmm3 38173 pcmpeqd xmm1, xmm1 38174 pxor xmm5, xmm1 38175 pcmpeqd xmm3, xmm2 38176 pxor xmm3, xmm1 38177 movdqa xmm1, xmm4 38178 pcmpgtd xmm1, xmm2 38179 movdqa xmm2, xmm4 38180 blendvps xmm2, xmm5, xmm0 38181 movdqa xmm0, xmm1 38182 blendvps xmm4, xmm3, xmm0 38183 movups xmmword ptr [r8 + 4*rsi], xmm2 38184 movups xmmword ptr [r8 + 4*rsi + 16], xmm4 38185 .LBB4_1649: 38186 cmp rdx, r11 38187 je .LBB4_1655 38188 jmp .LBB4_1650 38189 .Lfunc_end4: 38190 .size arithmetic_unary_diff_type_sse4, .Lfunc_end4-arithmetic_unary_diff_type_sse4 38191 # -- End function 38192 .ident "Ubuntu clang version 11.1.0-6" 38193 .section ".note.GNU-stack","",@progbits 38194 .addrsig