# Source: github.com/apache/arrow/go/v14@v14.0.1/internal/utils/_lib/min_max_sse4_amd64.s
#
# Compiler output for "min_max.c" (see the .ident directive at the end of the
# file), GAS syntax with `.intel_syntax noprefix`.
#
# Eight routines, one per integer element type, that scan an array once and
# store both its minimum and its maximum.  They all share one shape:
#
#   1. element count <= 0   -> store the identity values and return;
#   2. small count          -> scalar loop only;
#   3. otherwise            -> vector loop over 64-byte steps (four 16-byte
#                              loads, two min and two max accumulators),
#                              an optional 32-byte step, a horizontal
#                              reduction, then the scalar loop for the rest.
#
# Register interface of every routine (this is the System V AMD64 order of
# the first four integer arguments):
#   rdi = address of the first element (loads are unaligned, movdqu/movupd)
#   esi = element count, tested as a signed 32-bit value
#   rdx = address that receives the minimum
#   rcx = address that receives the maximum
# Nothing is returned in registers.  Each routine builds an rbp frame and
# masks rsp with -8 but never touches stack memory afterwards.
# NOTE(review): the prologue looks like it exists for the tool that converts
# this file to Go assembly -- confirm before removing it.

        .text
        .intel_syntax noprefix
        .file   "min_max.c"

# ---------------------------------------------------------------------------
# int8_max_min_sse4
#   Signed 8-bit elements.  Empty input stores min = 127, max = -128.
#   Clobbers: rax, rsi, r8-r11, xmm0-xmm7, flags.
# ---------------------------------------------------------------------------
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4
.LCPI0_0:                               # 16 bytes of 0x80 (-128 as int8)
        .zero   16,128
.LCPI0_1:                               # 16 bytes of 0x7f (127)
        .zero   16,127
        .text
        .globl  int8_max_min_sse4
        .p2align        4, 0x90
        .type   int8_max_min_sse4,@function
int8_max_min_sse4:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        test    esi, esi
        jle     .LBB0_1                 # count <= 0: identities only
        mov     r9d, esi                # r9 = count (zero-extended)
        cmp     esi, 31
        ja      .LBB0_4                 # 32 or more elements: vector path
        mov     r8b, -128               # r8b = running max
        mov     sil, 127                # sil = running min
        xor     r11d, r11d              # r11 = scalar index
        jmp     .LBB0_11
.LBB0_1:
        mov     sil, 127
        mov     r8b, -128
        jmp     .LBB0_12
.LBB0_4:
        mov     r11d, r9d
        and     r11d, -32               # r11 = count rounded down to 32
        lea     rax, [r11 - 32]
        mov     r8, rax
        shr     r8, 5
        add     r8, 1                   # r8 = number of 32-element steps
        test    rax, rax
        je      .LBB0_5                 # exactly one step: skip the paired loop
        mov     r10, r8
        and     r10, -2
        neg     r10                     # r10 counts up to 0, two steps per pass
        movdqa  xmm1, xmmword ptr [rip + .LCPI0_0]     # max accumulator
        movdqa  xmm0, xmmword ptr [rip + .LCPI0_1]     # min accumulator
        xor     eax, eax                # rax = byte offset
        movdqa  xmm2, xmm0              # second min accumulator
        movdqa  xmm3, xmm1              # second max accumulator
        .p2align        4, 0x90
.LBB0_7:                                # 64 bytes per iteration
        movdqu  xmm4, xmmword ptr [rdi + rax]
        movdqu  xmm5, xmmword ptr [rdi + rax + 16]
        movdqu  xmm6, xmmword ptr [rdi + rax + 32]
        movdqu  xmm7, xmmword ptr [rdi + rax + 48]
        pminsb  xmm0, xmm4
        pminsb  xmm2, xmm5
        pmaxsb  xmm1, xmm4
        pmaxsb  xmm3, xmm5
        pminsb  xmm0, xmm6
        pminsb  xmm2, xmm7
        pmaxsb  xmm1, xmm6
        pmaxsb  xmm3, xmm7
        add     rax, 64
        add     r10, 2
        jne     .LBB0_7
        test    r8b, 1
        je      .LBB0_10                # even step count: nothing left over
.LBB0_9:                                # odd step count: one more 32-byte step
        movdqu  xmm4, xmmword ptr [rdi + rax]
        movdqu  xmm5, xmmword ptr [rdi + rax + 16]
        pmaxsb  xmm3, xmm5
        pmaxsb  xmm1, xmm4
        pminsb  xmm2, xmm5
        pminsb  xmm0, xmm4
.LBB0_10:                               # horizontal reduction
        pminsb  xmm0, xmm2
        pmaxsb  xmm1, xmm3
        # XOR with 0x7f turns "largest signed byte" into "smallest unsigned
        # byte", so the unsigned horizontal-min instruction can find it.
        pxor    xmm1, xmmword ptr [rip + .LCPI0_1]
        movdqa  xmm2, xmm1
        psrlw   xmm2, 8
        pminub  xmm2, xmm1              # low byte of each word = min of its two bytes
        phminposuw      xmm1, xmm2
        movd    r8d, xmm1
        xor     r8b, 127                # undo the mapping -> r8b = max
        # XOR with 0x80 maps signed order onto unsigned order for the min.
        pxor    xmm0, xmmword ptr [rip + .LCPI0_0]
        movdqa  xmm1, xmm0
        psrlw   xmm1, 8
        pminub  xmm1, xmm0
        phminposuw      xmm0, xmm1
        movd    esi, xmm0
        xor     sil, -128               # undo the mapping -> sil = min
        cmp     r11, r9
        je      .LBB0_12                # no scalar remainder
        .p2align        4, 0x90
.LBB0_11:                               # scalar loop over [r11, r9)
        movzx   eax, byte ptr [rdi + r11]
        cmp     sil, al
        movzx   esi, sil
        cmovg   esi, eax                # signed: min > value -> take value
        cmp     r8b, al
        movzx   r8d, r8b
        cmovl   r8d, eax                # signed: max < value -> take value
        add     r11, 1
        cmp     r9, r11
        jne     .LBB0_11
.LBB0_12:
        mov     byte ptr [rcx], r8b     # store max
        mov     byte ptr [rdx], sil     # store min
        mov     rsp, rbp
        pop     rbp
        ret
.LBB0_5:                                # accumulator setup for the single-step case
        movdqa  xmm1, xmmword ptr [rip + .LCPI0_0]
        movdqa  xmm0, xmmword ptr [rip + .LCPI0_1]
        xor     eax, eax
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1
        test    r8b, 1
        jne     .LBB0_9
        jmp     .LBB0_10
.Lfunc_end0:
        .size   int8_max_min_sse4, .Lfunc_end0-int8_max_min_sse4

# ---------------------------------------------------------------------------
# uint8_max_min_sse4
#   Unsigned 8-bit elements.  Empty input stores min = 0xff, max = 0.
#   Clobbers: rax, rsi, r8-r11, xmm0-xmm7, flags.
# ---------------------------------------------------------------------------
        .globl  uint8_max_min_sse4
        .p2align        4, 0x90
        .type   uint8_max_min_sse4,@function
uint8_max_min_sse4:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        test    esi, esi
        jle     .LBB1_1
        mov     r9d, esi                # r9 = count
        cmp     esi, 31
        ja      .LBB1_4
        mov     sil, -1                 # sil = running min (0xff)
        xor     r11d, r11d              # r11 = scalar index
        xor     eax, eax                # al = running max (0)
        jmp     .LBB1_11
.LBB1_1:
        mov     sil, -1
        xor     eax, eax
        jmp     .LBB1_12
.LBB1_4:
        mov     r11d, r9d
        and     r11d, -32               # r11 = count rounded down to 32
        lea     rax, [r11 - 32]
        mov     r8, rax
        shr     r8, 5
        add     r8, 1                   # r8 = number of 32-element steps
        test    rax, rax
        je      .LBB1_5
        mov     r10, r8
        and     r10, -2
        neg     r10
        pxor    xmm1, xmm1              # max accumulator = 0
        pcmpeqd xmm0, xmm0              # min accumulator = all ones
        xor     eax, eax                # rax = byte offset
        pcmpeqd xmm2, xmm2              # second min accumulator
        pxor    xmm3, xmm3              # second max accumulator
        .p2align        4, 0x90
.LBB1_7:                                # 64 bytes per iteration
        movdqu  xmm4, xmmword ptr [rdi + rax]
        movdqu  xmm5, xmmword ptr [rdi + rax + 16]
        movdqu  xmm6, xmmword ptr [rdi + rax + 32]
        movdqu  xmm7, xmmword ptr [rdi + rax + 48]
        pminub  xmm0, xmm4
        pminub  xmm2, xmm5
        pmaxub  xmm1, xmm4
        pmaxub  xmm3, xmm5
        pminub  xmm0, xmm6
        pminub  xmm2, xmm7
        pmaxub  xmm1, xmm6
        pmaxub  xmm3, xmm7
        add     rax, 64
        add     r10, 2
        jne     .LBB1_7
        test    r8b, 1
        je      .LBB1_10
.LBB1_9:                                # one more 32-byte step
        movdqu  xmm4, xmmword ptr [rdi + rax]
        movdqu  xmm5, xmmword ptr [rdi + rax + 16]
        pmaxub  xmm3, xmm5
        pmaxub  xmm1, xmm4
        pminub  xmm2, xmm5
        pminub  xmm0, xmm4
.LBB1_10:                               # horizontal reduction
        pminub  xmm0, xmm2
        pmaxub  xmm1, xmm3
        pcmpeqd xmm2, xmm2
        pxor    xmm2, xmm1              # complement: max becomes an unsigned min
        movdqa  xmm1, xmm2
        psrlw   xmm1, 8
        pminub  xmm1, xmm2
        phminposuw      xmm1, xmm1
        movd    eax, xmm1
        not     al                      # undo the complement -> al = max
        movdqa  xmm1, xmm0
        psrlw   xmm1, 8
        pminub  xmm1, xmm0
        phminposuw      xmm0, xmm1
        movd    esi, xmm0               # sil = min
        cmp     r11, r9
        je      .LBB1_12
        .p2align        4, 0x90
.LBB1_11:                               # scalar loop over [r11, r9)
        movzx   r8d, byte ptr [rdi + r11]
        cmp     sil, r8b
        movzx   esi, sil
        cmovae  esi, r8d                # unsigned: min >= value -> take value
        cmp     al, r8b
        movzx   eax, al
        cmovbe  eax, r8d                # unsigned: max <= value -> take value
        add     r11, 1
        cmp     r9, r11
        jne     .LBB1_11
.LBB1_12:
        mov     byte ptr [rcx], al      # store max
        mov     byte ptr [rdx], sil     # store min
        mov     rsp, rbp
        pop     rbp
        ret
.LBB1_5:                                # accumulator setup for the single-step case
        pxor    xmm1, xmm1
        pcmpeqd xmm0, xmm0
        xor     eax, eax
        pcmpeqd xmm2, xmm2
        pxor    xmm3, xmm3
        test    r8b, 1
        jne     .LBB1_9
        jmp     .LBB1_10
.Lfunc_end1:
        .size   uint8_max_min_sse4, .Lfunc_end1-uint8_max_min_sse4

# ---------------------------------------------------------------------------
# int16_max_min_sse4
#   Signed 16-bit elements.  Empty input stores min = 32767, max = -32768.
#   Clobbers: rax, rsi, r8-r11, xmm0-xmm7, flags.
# ---------------------------------------------------------------------------
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4
.LCPI2_0:
        .short  32768                   # 0x8000
        .short  32768                   # 0x8000
        .short  32768                   # 0x8000
        .short  32768                   # 0x8000
        .short  32768                   # 0x8000
        .short  32768                   # 0x8000
        .short  32768                   # 0x8000
        .short  32768                   # 0x8000
.LCPI2_1:
        .short  32767                   # 0x7fff
        .short  32767                   # 0x7fff
        .short  32767                   # 0x7fff
        .short  32767                   # 0x7fff
        .short  32767                   # 0x7fff
        .short  32767                   # 0x7fff
        .short  32767                   # 0x7fff
        .short  32767                   # 0x7fff
        .text
        .globl  int16_max_min_sse4
        .p2align        4, 0x90
        .type   int16_max_min_sse4,@function
int16_max_min_sse4:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        test    esi, esi
        jle     .LBB2_1
        mov     r9d, esi                # r9 = count
        cmp     esi, 15
        ja      .LBB2_4                 # 16 or more elements: vector path
        mov     r8w, -32768             # r8w = running max
        mov     si, 32767               # si = running min
        xor     r11d, r11d              # r11 = scalar index
        jmp     .LBB2_11
.LBB2_1:
        mov     si, 32767
        mov     r8w, -32768
        jmp     .LBB2_12
.LBB2_4:
        mov     r11d, r9d
        and     r11d, -16               # r11 = count rounded down to 16
        lea     rax, [r11 - 16]
        mov     r8, rax
        shr     r8, 4
        add     r8, 1                   # r8 = number of 16-element steps
        test    rax, rax
        je      .LBB2_5
        mov     r10, r8
        and     r10, -2
        neg     r10
        movdqa  xmm1, xmmword ptr [rip + .LCPI2_0]     # max accumulator
        movdqa  xmm0, xmmword ptr [rip + .LCPI2_1]     # min accumulator
        xor     eax, eax                # rax = element offset
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1
        .p2align        4, 0x90
.LBB2_7:                                # 32 elements per iteration
        movdqu  xmm4, xmmword ptr [rdi + 2*rax]
        movdqu  xmm5, xmmword ptr [rdi + 2*rax + 16]
        movdqu  xmm6, xmmword ptr [rdi + 2*rax + 32]
        movdqu  xmm7, xmmword ptr [rdi + 2*rax + 48]
        pminsw  xmm0, xmm4
        pminsw  xmm2, xmm5
        pmaxsw  xmm1, xmm4
        pmaxsw  xmm3, xmm5
        pminsw  xmm0, xmm6
        pminsw  xmm2, xmm7
        pmaxsw  xmm1, xmm6
        pmaxsw  xmm3, xmm7
        add     rax, 32
        add     r10, 2
        jne     .LBB2_7
        test    r8b, 1
        je      .LBB2_10
.LBB2_9:                                # one more 16-element step
        movdqu  xmm4, xmmword ptr [rdi + 2*rax]
        movdqu  xmm5, xmmword ptr [rdi + 2*rax + 16]
        pmaxsw  xmm3, xmm5
        pmaxsw  xmm1, xmm4
        pminsw  xmm2, xmm5
        pminsw  xmm0, xmm4
.LBB2_10:                               # horizontal reduction
        pminsw  xmm0, xmm2
        pmaxsw  xmm1, xmm3
        pxor    xmm1, xmmword ptr [rip + .LCPI2_1]     # signed max -> unsigned min
        phminposuw      xmm1, xmm1
        movd    r8d, xmm1
        xor     r8d, 32767              # undo the mapping; only r8w is stored
        pxor    xmm0, xmmword ptr [rip + .LCPI2_0]     # signed order -> unsigned order
        phminposuw      xmm0, xmm0
        movd    esi, xmm0
        xor     esi, 32768              # undo the mapping; only si is stored
        cmp     r11, r9
        je      .LBB2_12
        .p2align        4, 0x90
.LBB2_11:                               # scalar loop over [r11, r9)
        movzx   eax, word ptr [rdi + 2*r11]
        cmp     si, ax
        cmovg   esi, eax
        cmp     r8w, ax
        cmovl   r8d, eax
        add     r11, 1
        cmp     r9, r11
        jne     .LBB2_11
.LBB2_12:
        mov     word ptr [rcx], r8w     # store max
        mov     word ptr [rdx], si      # store min
        mov     rsp, rbp
        pop     rbp
        ret
.LBB2_5:                                # accumulator setup for the single-step case
        movdqa  xmm1, xmmword ptr [rip + .LCPI2_0]
        movdqa  xmm0, xmmword ptr [rip + .LCPI2_1]
        xor     eax, eax
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1
        test    r8b, 1
        jne     .LBB2_9
        jmp     .LBB2_10
.Lfunc_end2:
        .size   int16_max_min_sse4, .Lfunc_end2-int16_max_min_sse4

# ---------------------------------------------------------------------------
# uint16_max_min_sse4
#   Unsigned 16-bit elements.  Empty input stores min = 0xffff, max = 0.
#   Clobbers: rax, rsi, r8-r11, xmm0-xmm7, flags.
# ---------------------------------------------------------------------------
        .globl  uint16_max_min_sse4
        .p2align        4, 0x90
        .type   uint16_max_min_sse4,@function
uint16_max_min_sse4:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        test    esi, esi
        jle     .LBB3_1
        mov     r9d, esi                # r9 = count
        cmp     esi, 15
        ja      .LBB3_4
        mov     r8w, -1                 # r8w = running min (0xffff)
        xor     r11d, r11d              # r11 = scalar index
        xor     esi, esi                # si = running max (0)
        jmp     .LBB3_11
.LBB3_1:
        mov     r8w, -1
        xor     esi, esi
        jmp     .LBB3_12
.LBB3_4:
        mov     r11d, r9d
        and     r11d, -16               # r11 = count rounded down to 16
        lea     rax, [r11 - 16]
        mov     r8, rax
        shr     r8, 4
        add     r8, 1                   # r8 = number of 16-element steps
        test    rax, rax
        je      .LBB3_5
        mov     r10, r8
        and     r10, -2
        neg     r10
        pxor    xmm1, xmm1              # max accumulator = 0
        pcmpeqd xmm0, xmm0              # min accumulator = all ones
        xor     eax, eax                # rax = element offset
        pcmpeqd xmm2, xmm2
        pxor    xmm3, xmm3
        .p2align        4, 0x90
.LBB3_7:                                # 32 elements per iteration
        movdqu  xmm4, xmmword ptr [rdi + 2*rax]
        movdqu  xmm5, xmmword ptr [rdi + 2*rax + 16]
        movdqu  xmm6, xmmword ptr [rdi + 2*rax + 32]
        movdqu  xmm7, xmmword ptr [rdi + 2*rax + 48]
        pminuw  xmm0, xmm4
        pminuw  xmm2, xmm5
        pmaxuw  xmm1, xmm4
        pmaxuw  xmm3, xmm5
        pminuw  xmm0, xmm6
        pminuw  xmm2, xmm7
        pmaxuw  xmm1, xmm6
        pmaxuw  xmm3, xmm7
        add     rax, 32
        add     r10, 2
        jne     .LBB3_7
        test    r8b, 1
        je      .LBB3_10
.LBB3_9:                                # one more 16-element step
        movdqu  xmm4, xmmword ptr [rdi + 2*rax]
        movdqu  xmm5, xmmword ptr [rdi + 2*rax + 16]
        pmaxuw  xmm3, xmm5
        pmaxuw  xmm1, xmm4
        pminuw  xmm2, xmm5
        pminuw  xmm0, xmm4
.LBB3_10:                               # horizontal reduction
        pminuw  xmm0, xmm2
        pmaxuw  xmm1, xmm3
        pcmpeqd xmm2, xmm2
        pxor    xmm2, xmm1              # complement: max becomes an unsigned min
        phminposuw      xmm1, xmm2
        movd    esi, xmm1
        not     esi                     # undo the complement; only si is stored
        phminposuw      xmm0, xmm0
        movd    r8d, xmm0               # r8w = min
        cmp     r11, r9
        je      .LBB3_12
        .p2align        4, 0x90
.LBB3_11:                               # scalar loop over [r11, r9)
        movzx   eax, word ptr [rdi + 2*r11]
        cmp     r8w, ax
        cmovae  r8d, eax
        cmp     si, ax
        cmovbe  esi, eax
        add     r11, 1
        cmp     r9, r11
        jne     .LBB3_11
.LBB3_12:
        mov     word ptr [rcx], si      # store max
        mov     word ptr [rdx], r8w     # store min
        mov     rsp, rbp
        pop     rbp
        ret
.LBB3_5:                                # accumulator setup for the single-step case
        pxor    xmm1, xmm1
        pcmpeqd xmm0, xmm0
        xor     eax, eax
        pcmpeqd xmm2, xmm2
        pxor    xmm3, xmm3
        test    r8b, 1
        jne     .LBB3_9
        jmp     .LBB3_10
.Lfunc_end3:
        .size   uint16_max_min_sse4, .Lfunc_end3-uint16_max_min_sse4

# ---------------------------------------------------------------------------
# int32_max_min_sse4
#   Signed 32-bit elements.  Empty input stores min = 2147483647,
#   max = -2147483648.
#   Clobbers: rax, rsi, r8-r11, xmm0-xmm7, flags.
# ---------------------------------------------------------------------------
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4
.LCPI4_0:
        .long   2147483648              # 0x80000000
        .long   2147483648              # 0x80000000
        .long   2147483648              # 0x80000000
        .long   2147483648              # 0x80000000
.LCPI4_1:
        .long   2147483647              # 0x7fffffff
        .long   2147483647              # 0x7fffffff
        .long   2147483647              # 0x7fffffff
        .long   2147483647              # 0x7fffffff
        .text
        .globl  int32_max_min_sse4
        .p2align        4, 0x90
        .type   int32_max_min_sse4,@function
int32_max_min_sse4:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        test    esi, esi
        jle     .LBB4_1
        mov     r9d, esi                # r9 = count
        cmp     esi, 7
        ja      .LBB4_6                 # 8 or more elements: vector path
        mov     eax, -2147483648        # eax = running max
        mov     r8d, 2147483647         # r8d = running min
        xor     r11d, r11d              # r11 = scalar index
        jmp     .LBB4_4
.LBB4_1:
        mov     r8d, 2147483647
        mov     eax, -2147483648
        jmp     .LBB4_13
.LBB4_6:
        mov     r11d, r9d
        and     r11d, -8                # r11 = count rounded down to 8
        lea     rax, [r11 - 8]
        mov     r8, rax
        shr     r8, 3
        add     r8, 1                   # r8 = number of 8-element steps
        test    rax, rax
        je      .LBB4_7
        mov     r10, r8
        and     r10, -2
        neg     r10
        movdqa  xmm1, xmmword ptr [rip + .LCPI4_0]     # max accumulator
        movdqa  xmm0, xmmword ptr [rip + .LCPI4_1]     # min accumulator
        xor     eax, eax                # rax = element offset
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1
        .p2align        4, 0x90
.LBB4_9:                                # 16 elements per iteration
        movdqu  xmm4, xmmword ptr [rdi + 4*rax]
        movdqu  xmm5, xmmword ptr [rdi + 4*rax + 16]
        movdqu  xmm6, xmmword ptr [rdi + 4*rax + 32]
        movdqu  xmm7, xmmword ptr [rdi + 4*rax + 48]
        pminsd  xmm0, xmm4
        pminsd  xmm2, xmm5
        pmaxsd  xmm1, xmm4
        pmaxsd  xmm3, xmm5
        pminsd  xmm0, xmm6
        pminsd  xmm2, xmm7
        pmaxsd  xmm1, xmm6
        pmaxsd  xmm3, xmm7
        add     rax, 16
        add     r10, 2
        jne     .LBB4_9
        test    r8b, 1
        je      .LBB4_12
.LBB4_11:                               # one more 8-element step
        movdqu  xmm4, xmmword ptr [rdi + 4*rax]
        movdqu  xmm5, xmmword ptr [rdi + 4*rax + 16]
        pmaxsd  xmm3, xmm5
        pmaxsd  xmm1, xmm4
        pminsd  xmm2, xmm5
        pminsd  xmm0, xmm4
.LBB4_12:                               # horizontal reduction by shuffle + min/max
        pminsd  xmm0, xmm2
        pmaxsd  xmm1, xmm3
        pshufd  xmm2, xmm1, 78          # xmm2 = xmm1[2,3,0,1]
        pmaxsd  xmm2, xmm1
        pshufd  xmm1, xmm2, 229         # xmm1 = xmm2[1,1,2,3]
        pmaxsd  xmm1, xmm2
        movd    eax, xmm1               # eax = max
        pshufd  xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
        pminsd  xmm1, xmm0
        pshufd  xmm0, xmm1, 229         # xmm0 = xmm1[1,1,2,3]
        pminsd  xmm0, xmm1
        movd    r8d, xmm0               # r8d = min
        cmp     r11, r9
        je      .LBB4_13
.LBB4_4:
        mov     esi, eax                # esi carries the max between iterations
        .p2align        4, 0x90
.LBB4_5:                                # scalar loop over [r11, r9)
        mov     eax, dword ptr [rdi + 4*r11]
        cmp     r8d, eax
        cmovg   r8d, eax
        cmp     esi, eax
        cmovge  eax, esi
        add     r11, 1
        mov     esi, eax
        cmp     r9, r11
        jne     .LBB4_5
.LBB4_13:
        mov     dword ptr [rcx], eax    # store max
        mov     dword ptr [rdx], r8d    # store min
        mov     rsp, rbp
        pop     rbp
        ret
.LBB4_7:                                # accumulator setup for the single-step case
        movdqa  xmm1, xmmword ptr [rip + .LCPI4_0]
        movdqa  xmm0, xmmword ptr [rip + .LCPI4_1]
        xor     eax, eax
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1
        test    r8b, 1
        jne     .LBB4_11
        jmp     .LBB4_12
.Lfunc_end4:
        .size   int32_max_min_sse4, .Lfunc_end4-int32_max_min_sse4

# ---------------------------------------------------------------------------
# uint32_max_min_sse4
#   Unsigned 32-bit elements.  Empty input stores min = 0xffffffff, max = 0.
#   Clobbers: rax, rsi, r8-r11, xmm0-xmm7, flags.
# ---------------------------------------------------------------------------
        .globl  uint32_max_min_sse4
        .p2align        4, 0x90
        .type   uint32_max_min_sse4,@function
uint32_max_min_sse4:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        test    esi, esi
        jle     .LBB5_1
        mov     r9d, esi                # r9 = count
        cmp     esi, 7
        ja      .LBB5_6
        xor     r11d, r11d              # r11 = scalar index
        mov     r8d, -1                 # r8d = running min
        xor     esi, esi                # esi = running max
        jmp     .LBB5_4
.LBB5_1:
        mov     r8d, -1
        xor     esi, esi
        jmp     .LBB5_13
.LBB5_6:
        mov     r11d, r9d
        and     r11d, -8                # r11 = count rounded down to 8
        lea     rax, [r11 - 8]
        mov     r8, rax
        shr     r8, 3
        add     r8, 1                   # r8 = number of 8-element steps
        test    rax, rax
        je      .LBB5_7
        mov     r10, r8
        and     r10, -2
        neg     r10
        pxor    xmm1, xmm1              # max accumulator = 0
        pcmpeqd xmm0, xmm0              # min accumulator = all ones
        xor     eax, eax                # rax = element offset
        pcmpeqd xmm2, xmm2
        pxor    xmm3, xmm3
        .p2align        4, 0x90
.LBB5_9:                                # 16 elements per iteration
        movdqu  xmm4, xmmword ptr [rdi + 4*rax]
        movdqu  xmm5, xmmword ptr [rdi + 4*rax + 16]
        movdqu  xmm6, xmmword ptr [rdi + 4*rax + 32]
        movdqu  xmm7, xmmword ptr [rdi + 4*rax + 48]
        pminud  xmm0, xmm4
        pminud  xmm2, xmm5
        pmaxud  xmm1, xmm4
        pmaxud  xmm3, xmm5
        pminud  xmm0, xmm6
        pminud  xmm2, xmm7
        pmaxud  xmm1, xmm6
        pmaxud  xmm3, xmm7
        add     rax, 16
        add     r10, 2
        jne     .LBB5_9
        test    r8b, 1
        je      .LBB5_12
.LBB5_11:                               # one more 8-element step
        movdqu  xmm4, xmmword ptr [rdi + 4*rax]
        movdqu  xmm5, xmmword ptr [rdi + 4*rax + 16]
        pmaxud  xmm3, xmm5
        pmaxud  xmm1, xmm4
        pminud  xmm2, xmm5
        pminud  xmm0, xmm4
.LBB5_12:                               # horizontal reduction by shuffle + min/max
        pminud  xmm0, xmm2
        pmaxud  xmm1, xmm3
        pshufd  xmm2, xmm1, 78          # xmm2 = xmm1[2,3,0,1]
        pmaxud  xmm2, xmm1
        pshufd  xmm1, xmm2, 229         # xmm1 = xmm2[1,1,2,3]
        pmaxud  xmm1, xmm2
        movd    esi, xmm1               # esi = max
        pshufd  xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
        pminud  xmm1, xmm0
        pshufd  xmm0, xmm1, 229         # xmm0 = xmm1[1,1,2,3]
        pminud  xmm0, xmm1
        movd    r8d, xmm0               # r8d = min
        cmp     r11, r9
        je      .LBB5_13
.LBB5_4:
        mov     eax, esi                # eax carries the max between iterations
        .p2align        4, 0x90
.LBB5_5:                                # scalar loop over [r11, r9)
        mov     esi, dword ptr [rdi + 4*r11]
        cmp     r8d, esi
        cmovae  r8d, esi
        cmp     eax, esi
        cmova   esi, eax
        add     r11, 1
        mov     eax, esi
        cmp     r9, r11
        jne     .LBB5_5
.LBB5_13:
        mov     dword ptr [rcx], esi    # store max
        mov     dword ptr [rdx], r8d    # store min
        mov     rsp, rbp
        pop     rbp
        ret
.LBB5_7:                                # accumulator setup for the single-step case
        pxor    xmm1, xmm1
        pcmpeqd xmm0, xmm0
        xor     eax, eax
        pcmpeqd xmm2, xmm2
        pxor    xmm3, xmm3
        test    r8b, 1
        jne     .LBB5_11
        jmp     .LBB5_12
.Lfunc_end5:
        .size   uint32_max_min_sse4, .Lfunc_end5-uint32_max_min_sse4

# ---------------------------------------------------------------------------
# int64_max_min_sse4
#   Signed 64-bit elements.  Empty input stores min = 0x7fffffffffffffff,
#   max = 0x8000000000000000.  There is no packed 64-bit min/max here, so
#   every lane-wise select is pcmpgtq (mask in xmm0) followed by blendvpd,
#   which uses xmm0 as its implicit mask operand.
#   Clobbers: rax, rsi, r8-r11, xmm0-xmm9, flags.
# ---------------------------------------------------------------------------
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4
.LCPI6_0:
        .quad   -9223372036854775808    # 0x8000000000000000
        .quad   -9223372036854775808    # 0x8000000000000000
.LCPI6_1:
        .quad   9223372036854775807     # 0x7fffffffffffffff
        .quad   9223372036854775807     # 0x7fffffffffffffff
        .text
        .globl  int64_max_min_sse4
        .p2align        4, 0x90
        .type   int64_max_min_sse4,@function
int64_max_min_sse4:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        movabs  r8, 9223372036854775807 # r8 = running min
        test    esi, esi
        jle     .LBB6_1
        mov     r9d, esi                # r9 = count
        cmp     esi, 3
        ja      .LBB6_6                 # 4 or more elements: vector path
        lea     rsi, [r8 + 1]           # rsi = running max (wraps to INT64_MIN)
        xor     r11d, r11d              # r11 = scalar index
        jmp     .LBB6_4
.LBB6_1:
        lea     rsi, [r8 + 1]
        jmp     .LBB6_13
.LBB6_6:
        mov     r11d, r9d
        and     r11d, -4                # r11 = count rounded down to 4
        lea     rax, [r11 - 4]
        mov     r8, rax
        shr     r8, 2
        add     r8, 1                   # r8 = number of 4-element steps
        test    rax, rax
        je      .LBB6_7
        mov     r10, r8
        and     r10, -2
        neg     r10
        movdqa  xmm9, xmmword ptr [rip + .LCPI6_0]     # max accumulator A
        movdqa  xmm8, xmmword ptr [rip + .LCPI6_1]     # min accumulator A
        xor     eax, eax                # rax = element offset
        movdqa  xmm2, xmm8              # min accumulator B
        movdqa  xmm6, xmm9              # max accumulator B
        .p2align        4, 0x90
.LBB6_9:                                # 8 elements per iteration
        movdqu  xmm7, xmmword ptr [rdi + 8*rax]
        movdqa  xmm0, xmm7
        pcmpgtq xmm0, xmm8
        movdqa  xmm4, xmm7
        blendvpd        xmm4, xmm8, xmm0
        movdqu  xmm1, xmmword ptr [rdi + 8*rax + 16]
        movdqa  xmm0, xmm1
        pcmpgtq xmm0, xmm2
        movdqa  xmm5, xmm1
        blendvpd        xmm5, xmm2, xmm0
        movdqa  xmm0, xmm9
        pcmpgtq xmm0, xmm7
        blendvpd        xmm7, xmm9, xmm0
        movdqa  xmm0, xmm6
        pcmpgtq xmm0, xmm1
        blendvpd        xmm1, xmm6, xmm0
        movdqu  xmm3, xmmword ptr [rdi + 8*rax + 32]
        movdqa  xmm0, xmm3
        pcmpgtq xmm0, xmm4
        movdqa  xmm8, xmm3
        blendvpd        xmm8, xmm4, xmm0
        movdqu  xmm4, xmmword ptr [rdi + 8*rax + 48]
        movdqa  xmm0, xmm4
        pcmpgtq xmm0, xmm5
        movdqa  xmm2, xmm4
        blendvpd        xmm2, xmm5, xmm0
        movapd  xmm0, xmm7
        pcmpgtq xmm0, xmm3
        blendvpd        xmm3, xmm7, xmm0
        movapd  xmm0, xmm1
        pcmpgtq xmm0, xmm4
        blendvpd        xmm4, xmm1, xmm0
        add     rax, 8
        movapd  xmm9, xmm3
        movapd  xmm6, xmm4
        add     r10, 2
        jne     .LBB6_9
        test    r8b, 1
        je      .LBB6_12
.LBB6_11:                               # one more 4-element step
        # On entry: xmm3/xmm4 = max accumulators, xmm8/xmm2 = min accumulators.
        movdqu  xmm1, xmmword ptr [rdi + 8*rax + 16]
        movapd  xmm0, xmm4
        pcmpgtq xmm0, xmm1
        movdqa  xmm5, xmm1
        blendvpd        xmm5, xmm4, xmm0
        movdqu  xmm4, xmmword ptr [rdi + 8*rax]
        movapd  xmm0, xmm3
        pcmpgtq xmm0, xmm4
        movdqa  xmm6, xmm4
        blendvpd        xmm6, xmm3, xmm0
        movdqa  xmm0, xmm1
        pcmpgtq xmm0, xmm2
        blendvpd        xmm1, xmm2, xmm0
        movdqa  xmm0, xmm4
        pcmpgtq xmm0, xmm8
        blendvpd        xmm4, xmm8, xmm0
        movapd  xmm8, xmm4
        movapd  xmm2, xmm1
        movapd  xmm3, xmm6
        movapd  xmm4, xmm5
.LBB6_12:                               # horizontal reduction
        movapd  xmm0, xmm3
        pcmpgtq xmm0, xmm4
        blendvpd        xmm4, xmm3, xmm0
        pshufd  xmm1, xmm4, 78          # xmm1 = xmm4[2,3,0,1]
        movdqa  xmm0, xmm4
        pcmpgtq xmm0, xmm1
        blendvpd        xmm1, xmm4, xmm0
        movq    rsi, xmm1               # rsi = max
        movdqa  xmm0, xmm2
        pcmpgtq xmm0, xmm8
        blendvpd        xmm2, xmm8, xmm0
        pshufd  xmm1, xmm2, 78          # xmm1 = xmm2[2,3,0,1]
        movdqa  xmm0, xmm1
        pcmpgtq xmm0, xmm2
        blendvpd        xmm1, xmm2, xmm0
        movq    r8, xmm1                # r8 = min
        cmp     r11, r9
        je      .LBB6_13
.LBB6_4:
        mov     rax, rsi                # rax carries the max between iterations
        .p2align        4, 0x90
.LBB6_5:                                # scalar loop over [r11, r9)
        mov     rsi, qword ptr [rdi + 8*r11]
        cmp     r8, rsi
        cmovg   r8, rsi
        cmp     rax, rsi
        cmovge  rsi, rax
        add     r11, 1
        mov     rax, rsi
        cmp     r9, r11
        jne     .LBB6_5
.LBB6_13:
        mov     qword ptr [rcx], rsi    # store max
        mov     qword ptr [rdx], r8     # store min
        mov     rsp, rbp
        pop     rbp
        ret
.LBB6_7:                                # accumulator setup for the single-step case
        movapd  xmm3, xmmword ptr [rip + .LCPI6_0]
        movdqa  xmm8, xmmword ptr [rip + .LCPI6_1]
        xor     eax, eax
        movdqa  xmm2, xmm8
        movapd  xmm4, xmm3
        test    r8b, 1
        jne     .LBB6_11
        jmp     .LBB6_12
.Lfunc_end6:
        .size   int64_max_min_sse4, .Lfunc_end6-int64_max_min_sse4

# ---------------------------------------------------------------------------
# uint64_max_min_sse4
#   Unsigned 64-bit elements.  Empty input stores min = all ones, max = 0.
#   pcmpgtq compares signed, so both operands are XORed with the sign-bit
#   constant .LCPI7_0 before each comparison; the blends then select from the
#   unmodified values.
#   Clobbers: rax, rsi, r8-r11, xmm0-xmm13, flags.
# ---------------------------------------------------------------------------
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4
.LCPI7_0:
        .quad   -9223372036854775808    # 0x8000000000000000
        .quad   -9223372036854775808    # 0x8000000000000000
        .text
        .globl  uint64_max_min_sse4
        .p2align        4, 0x90
        .type   uint64_max_min_sse4,@function
uint64_max_min_sse4:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        test    esi, esi
        jle     .LBB7_1
        mov     r9d, esi                # r9 = count
        cmp     esi, 3
        ja      .LBB7_6
        mov     r8, -1                  # r8 = running min
        xor     r11d, r11d              # r11 = scalar index
        xor     eax, eax                # rax = running max
        jmp     .LBB7_4
.LBB7_1:
        mov     r8, -1
        xor     eax, eax
        jmp     .LBB7_13
.LBB7_6:
        mov     r11d, r9d
        and     r11d, -4                # r11 = count rounded down to 4
        lea     rax, [r11 - 4]
        mov     r8, rax
        shr     r8, 2
        add     r8, 1                   # r8 = number of 4-element steps
        test    rax, rax
        je      .LBB7_7
        mov     r10, r8
        and     r10, -2
        neg     r10
        pxor    xmm9, xmm9              # max accumulator A = 0
        pcmpeqd xmm10, xmm10            # min accumulator A = all ones
        xor     eax, eax                # rax = element offset
        movdqa  xmm8, xmmword ptr [rip + .LCPI7_0]     # sign-bit mask
        pcmpeqd xmm11, xmm11            # min accumulator B
        pxor    xmm12, xmm12            # max accumulator B
        .p2align        4, 0x90
.LBB7_9:                                # 8 elements per iteration
        movdqa  xmm2, xmm10
        pxor    xmm2, xmm8
        movdqu  xmm4, xmmword ptr [rdi + 8*rax]
        movdqu  xmm5, xmmword ptr [rdi + 8*rax + 16]
        movdqu  xmm13, xmmword ptr [rdi + 8*rax + 32]
        movdqa  xmm0, xmm4
        pxor    xmm0, xmm8
        movdqa  xmm1, xmm9
        pxor    xmm1, xmm8
        pcmpgtq xmm1, xmm0
        pcmpgtq xmm0, xmm2
        movdqa  xmm3, xmm4
        blendvpd        xmm3, xmm10, xmm0
        movdqu  xmm6, xmmword ptr [rdi + 8*rax + 48]
        movdqa  xmm7, xmm11
        pxor    xmm7, xmm8
        movdqa  xmm0, xmm5
        pxor    xmm0, xmm8
        movdqa  xmm2, xmm12
        pxor    xmm2, xmm8
        pcmpgtq xmm2, xmm0
        pcmpgtq xmm0, xmm7
        movdqa  xmm7, xmm5
        blendvpd        xmm7, xmm11, xmm0
        movdqa  xmm0, xmm1
        blendvpd        xmm4, xmm9, xmm0
        movdqa  xmm0, xmm2
        blendvpd        xmm5, xmm12, xmm0
        movapd  xmm2, xmm3
        xorpd   xmm2, xmm8
        movdqa  xmm0, xmm13
        pxor    xmm0, xmm8
        movapd  xmm1, xmm4
        xorpd   xmm1, xmm8
        pcmpgtq xmm1, xmm0
        pcmpgtq xmm0, xmm2
        movdqa  xmm10, xmm13
        blendvpd        xmm10, xmm3, xmm0
        movapd  xmm3, xmm7
        xorpd   xmm3, xmm8
        movdqa  xmm0, xmm6
        pxor    xmm0, xmm8
        movapd  xmm2, xmm5
        xorpd   xmm2, xmm8
        pcmpgtq xmm2, xmm0
        pcmpgtq xmm0, xmm3
        movdqa  xmm11, xmm6
        blendvpd        xmm11, xmm7, xmm0
        movdqa  xmm0, xmm1
        blendvpd        xmm13, xmm4, xmm0
        movdqa  xmm0, xmm2
        blendvpd        xmm6, xmm5, xmm0
        add     rax, 8
        movapd  xmm9, xmm13
        movapd  xmm12, xmm6
        add     r10, 2
        jne     .LBB7_9
        test    r8b, 1
        je      .LBB7_12
.LBB7_11:                               # one more 4-element step
        # On entry: xmm13/xmm6 = max accumulators, xmm10/xmm11 = min accumulators.
        movupd  xmm4, xmmword ptr [rdi + 8*rax]
        movupd  xmm3, xmmword ptr [rdi + 8*rax + 16]
        movapd  xmm5, xmmword ptr [rip + .LCPI7_0]     # sign-bit mask
        movapd  xmm0, xmm6
        xorpd   xmm0, xmm5
        movapd  xmm1, xmm3
        xorpd   xmm1, xmm5
        pcmpgtq xmm0, xmm1
        movapd  xmm7, xmm3
        blendvpd        xmm7, xmm6, xmm0
        movapd  xmm0, xmm13
        xorpd   xmm0, xmm5
        movapd  xmm2, xmm4
        xorpd   xmm2, xmm5
        pcmpgtq xmm0, xmm2
        movapd  xmm6, xmm4
        blendvpd        xmm6, xmm13, xmm0
        movapd  xmm0, xmm11
        xorpd   xmm0, xmm5
        pcmpgtq xmm1, xmm0
        movdqa  xmm0, xmm1
        blendvpd        xmm3, xmm11, xmm0
        xorpd   xmm5, xmm10
        pcmpgtq xmm2, xmm5
        movdqa  xmm0, xmm2
        blendvpd        xmm4, xmm10, xmm0
        movapd  xmm10, xmm4
        movapd  xmm11, xmm3
        movapd  xmm13, xmm6
        movapd  xmm6, xmm7
.LBB7_12:                               # horizontal reduction
        movapd  xmm1, xmmword ptr [rip + .LCPI7_0]     # sign-bit mask
        movapd  xmm2, xmm6
        xorpd   xmm2, xmm1
        movapd  xmm0, xmm13
        xorpd   xmm0, xmm1
        pcmpgtq xmm0, xmm2
        blendvpd        xmm6, xmm13, xmm0
        pshufd  xmm2, xmm6, 78          # xmm2 = xmm6[2,3,0,1]
        movapd  xmm0, xmm6
        xorpd   xmm0, xmm1
        movdqa  xmm3, xmm2
        pxor    xmm3, xmm1
        pcmpgtq xmm0, xmm3
        blendvpd        xmm2, xmm6, xmm0
        movq    rax, xmm2               # rax = max
        movdqa  xmm2, xmm10
        pxor    xmm2, xmm1
        movdqa  xmm0, xmm11
        pxor    xmm0, xmm1
        pcmpgtq xmm0, xmm2
        blendvpd        xmm11, xmm10, xmm0
        pshufd  xmm2, xmm11, 78         # xmm2 = xmm11[2,3,0,1]
        movdqa  xmm0, xmm11
        pxor    xmm0, xmm1
        pxor    xmm1, xmm2
        pcmpgtq xmm1, xmm0
        movdqa  xmm0, xmm1
        blendvpd        xmm2, xmm11, xmm0
        movq    r8, xmm2                # r8 = min
        cmp     r11, r9
        je      .LBB7_13
.LBB7_4:
        mov     rsi, rax                # rsi carries the max between iterations
        .p2align        4, 0x90
.LBB7_5:                                # scalar loop over [r11, r9)
        mov     rax, qword ptr [rdi + 8*r11]
        cmp     r8, rax
        cmovae  r8, rax
        cmp     rsi, rax
        cmova   rax, rsi
        add     r11, 1
        mov     rsi, rax
        cmp     r9, r11
        jne     .LBB7_5
.LBB7_13:
        mov     qword ptr [rcx], rax    # store max
        mov     qword ptr [rdx], r8     # store min
        mov     rsp, rbp
        pop     rbp
        ret
.LBB7_7:                                # accumulator setup for the single-step case
        xorpd   xmm13, xmm13
        pcmpeqd xmm10, xmm10
        xor     eax, eax
        pcmpeqd xmm11, xmm11
        xorpd   xmm6, xmm6
        test    r8b, 1
        jne     .LBB7_11
        jmp     .LBB7_12
.Lfunc_end7:
        .size   uint64_max_min_sse4, .Lfunc_end7-uint64_max_min_sse4

        .ident  "Debian clang version 11.0.1-2"
        .section        ".note.GNU-stack","",@progbits
        .addrsig