#=======================================================================
# Min/max reduction kernels using AVX2.
#
# Syntax:  GNU as, Intel operand order (.intel_syntax noprefix).
# Origin:  compiler output for "min_max.c" (see .file / .ident).
#
# Every routine in this file uses the same register roles:
#   rdi = address of the first element
#   esi = element count, tested as a signed 32-bit value; a count <= 0
#         skips all loads and stores the seed values
#   rdx = address that receives the minimum
#   rcx = address that receives the maximum
# NOTE(review): these roles match the System V AMD64 argument order
# (rdi, rsi, rdx, rcx) - confirm against the C prototypes in min_max.c.
#
# Common shape of each routine:
#   1. count <= 0      -> store the seeds (type max as min, type min as max)
#   2. count < block   -> scalar loop only
#   3. otherwise       -> vector loop over whole blocks, horizontal
#                         reduction, then the scalar loop for the tail
# Input data is read with vmovdqu, so no alignment is required of rdi.
# No stack slots are used; rsp is restored from rbp before returning.
#=======================================================================
        .text
        .intel_syntax noprefix
        .file         "min_max.c"

#-----------------------------------------------------------------------
# Constants for int8_max_min_avx2
#   .LCPI0_0 : 32 x 0x80  (seed for the running maximum, i.e. -128)
#   .LCPI0_1 : 32 x 0x7f  (seed for the running minimum, i.e. +127)
#   .LCPI0_2 : 16 x 0x7f  (xor mask: signed max -> unsigned min problem)
#   .LCPI0_3 : 16 x 0x80  (xor mask: signed min -> unsigned min problem)
#-----------------------------------------------------------------------
        .section      .rodata.cst32,"aM",@progbits,32
        .p2align      5                       # -- Begin function int8_max_min_avx2
.LCPI0_0:
        .zero         32,128
.LCPI0_1:
        .zero         32,127
        .section      .rodata.cst16,"aM",@progbits,16
        .p2align      4
.LCPI0_2:
        .zero         16,127
.LCPI0_3:
        .zero         16,128

#-----------------------------------------------------------------------
# int8_max_min_avx2
# In:    rdi = int8 elements, esi = count, rdx = &min, rcx = &max
# Out:   byte [rdx] = minimum, byte [rcx] = maximum
#        (count <= 0: min = 127, max = -128)
# Clobb: rax, rsi, r8, r9, r10, ymm0-ymm7, flags
#-----------------------------------------------------------------------
        .text
        .globl        int8_max_min_avx2
        .p2align      4, 0x90
        .type         int8_max_min_avx2,@function
int8_max_min_avx2:                            # @int8_max_min_avx2
        push          rbp
        mov           rbp, rsp
        and           rsp, -8
        test          esi, esi
        jle           .LBB0_1                 # empty input -> store seeds

        mov           r9d, esi                # r9 = count (zero-extended)
        cmp           esi, 63
        ja            .LBB0_4                 # >= 64 elements -> vector path

        # Fewer than 64 elements: scalar loop from index 0.
        mov           r8b, -128               # r8b = running max
        mov           sil, 127                # sil = running min
        xor           r10d, r10d              # r10 = element index
        jmp           .LBB0_11

.LBB0_1:                                      # count <= 0
        mov           sil, 127
        mov           r8b, -128
        jmp           .LBB0_12

.LBB0_4:                                      # vector path set-up
        mov           r10d, r9d
        and           r10d, -64               # r10 = count rounded down to 64
        lea           rax, [r10 - 64]
        mov           r8, rax
        shr           r8, 6
        add           r8, 1                   # r8 = number of 64-byte chunks
        test          rax, rax
        je            .LBB0_5                 # exactly one chunk

        mov           rsi, r8
        and           rsi, -2
        neg           rsi                     # rsi = -(chunks & ~1), counts up by 2
        vmovdqa       ymm1, ymmword ptr [rip + .LCPI0_0]    # max accumulator A
        vmovdqa       ymm0, ymmword ptr [rip + .LCPI0_1]    # min accumulator A
        xor           eax, eax                # rax = byte offset
        vmovdqa       ymm2, ymm0              # min accumulator B
        vmovdqa       ymm3, ymm1              # max accumulator B
        .p2align      4, 0x90
.LBB0_7:                                      # two chunks (128 bytes) per pass
        vmovdqu       ymm4, ymmword ptr [rdi + rax]
        vmovdqu       ymm5, ymmword ptr [rdi + rax + 32]
        vmovdqu       ymm6, ymmword ptr [rdi + rax + 64]
        vmovdqu       ymm7, ymmword ptr [rdi + rax + 96]
        vpminsb       ymm0, ymm0, ymm4
        vpminsb       ymm2, ymm2, ymm5
        vpmaxsb       ymm1, ymm1, ymm4
        vpmaxsb       ymm3, ymm3, ymm5
        vpminsb       ymm0, ymm0, ymm6
        vpminsb       ymm2, ymm2, ymm7
        vpmaxsb       ymm1, ymm1, ymm6
        vpmaxsb       ymm3, ymm3, ymm7
        sub           rax, -128
        add           rsi, 2
        jne           .LBB0_7

        test          r8b, 1
        je            .LBB0_10                # even chunk count -> no leftover chunk
.LBB0_9:                                      # one leftover 64-byte chunk
        vmovdqu       ymm4, ymmword ptr [rdi + rax]
        vmovdqu       ymm5, ymmword ptr [rdi + rax + 32]
        vpmaxsb       ymm3, ymm3, ymm5
        vpmaxsb       ymm1, ymm1, ymm4
        vpminsb       ymm2, ymm2, ymm5
        vpminsb       ymm0, ymm0, ymm4
.LBB0_10:                                     # horizontal reduction
        vpmaxsb       ymm1, ymm1, ymm3
        vextracti128  xmm3, ymm1, 1
        vpmaxsb       xmm1, xmm1, xmm3
        vpxor         xmm1, xmm1, xmmword ptr [rip + .LCPI0_2]
        vpminsb       ymm0, ymm0, ymm2
        vpsrlw        xmm2, xmm1, 8
        vpminub       xmm1, xmm1, xmm2        # fold odd bytes onto even bytes
        vphminposuw   xmm1, xmm1
        vmovd         r8d, xmm1
        xor           r8b, 127                # undo the mask -> r8b = max
        vextracti128  xmm1, ymm0, 1
        vpminsb       xmm0, xmm0, xmm1
        vpxor         xmm0, xmm0, xmmword ptr [rip + .LCPI0_3]
        vpsrlw        xmm1, xmm0, 8
        vpminub       xmm0, xmm0, xmm1
        vphminposuw   xmm0, xmm0
        vmovd         esi, xmm0
        xor           sil, -128               # undo the mask -> sil = min
        cmp           r10, r9
        je            .LBB0_12                # no tail elements
        .p2align      4, 0x90
.LBB0_11:                                     # scalar loop over [r10, r9)
        movzx         eax, byte ptr [rdi + r10]
        cmp           sil, al
        movzx         esi, sil                # movzx leaves the flags untouched
        cmovg         esi, eax                # min (signed)
        cmp           r8b, al
        movzx         r8d, r8b
        cmovl         r8d, eax                # max (signed)
        add           r10, 1
        cmp           r9, r10
        jne           .LBB0_11
.LBB0_12:                                     # store results and return
        mov           byte ptr [rcx], r8b
        mov           byte ptr [rdx], sil
        mov           rsp, rbp
        pop           rbp
        vzeroupper
        ret
.LBB0_5:                                      # single-chunk set-up
        vmovdqa       ymm1, ymmword ptr [rip + .LCPI0_0]
        vmovdqa       ymm0, ymmword ptr [rip + .LCPI0_1]
        xor           eax, eax
        vmovdqa       ymm2, ymm0
        vmovdqa       ymm3, ymm1
        test          r8b, 1
        jne           .LBB0_9
        jmp           .LBB0_10
.Lfunc_end0:
        .size         int8_max_min_avx2, .Lfunc_end0-int8_max_min_avx2
                                              # -- End function

#-----------------------------------------------------------------------
# uint8_max_min_avx2
# In:    rdi = uint8 elements, esi = count, rdx = &min, rcx = &max
# Out:   byte [rdx] = minimum, byte [rcx] = maximum
#        (count <= 0: min = 0xff, max = 0)
# Clobb: rax, rsi, r8, r9, r10, ymm0-ymm7, flags
#-----------------------------------------------------------------------
        .globl        uint8_max_min_avx2      # -- Begin function uint8_max_min_avx2
        .p2align      4, 0x90
        .type         uint8_max_min_avx2,@function
uint8_max_min_avx2:                           # @uint8_max_min_avx2
        push          rbp
        mov           rbp, rsp
        and           rsp, -8
        test          esi, esi
        jle           .LBB1_1                 # empty input -> store seeds

        mov           r9d, esi                # r9 = count
        cmp           esi, 63
        ja            .LBB1_4                 # >= 64 elements -> vector path

        # Fewer than 64 elements: scalar loop from index 0.
        mov           sil, -1                 # sil = running min (0xff)
        xor           r10d, r10d              # r10 = element index
        xor           eax, eax                # al  = running max (0)
        jmp           .LBB1_11

.LBB1_1:                                      # count <= 0
        mov           sil, -1
        xor           eax, eax
        jmp           .LBB1_12

.LBB1_4:                                      # vector path set-up
        mov           r10d, r9d
        and           r10d, -64               # r10 = count rounded down to 64
        lea           rax, [r10 - 64]
        mov           r8, rax
        shr           r8, 6
        add           r8, 1                   # r8 = number of 64-byte chunks
        test          rax, rax
        je            .LBB1_5                 # exactly one chunk

        mov           rsi, r8
        and           rsi, -2
        neg           rsi                     # rsi = -(chunks & ~1)
        vpxor         xmm0, xmm0, xmm0        # max accumulator A = 0
        vpcmpeqd      ymm1, ymm1, ymm1        # min accumulator A = all ones
        xor           eax, eax                # rax = byte offset
        vpcmpeqd      ymm2, ymm2, ymm2        # min accumulator B
        vpxor         xmm3, xmm3, xmm3        # max accumulator B
        .p2align      4, 0x90
.LBB1_7:                                      # two chunks (128 bytes) per pass
        vmovdqu       ymm4, ymmword ptr [rdi + rax]
        vmovdqu       ymm5, ymmword ptr [rdi + rax + 32]
        vmovdqu       ymm6, ymmword ptr [rdi + rax + 64]
        vmovdqu       ymm7, ymmword ptr [rdi + rax + 96]
        vpminub       ymm1, ymm1, ymm4
        vpminub       ymm2, ymm2, ymm5
        vpmaxub       ymm0, ymm0, ymm4
        vpmaxub       ymm3, ymm3, ymm5
        vpminub       ymm1, ymm1, ymm6
        vpminub       ymm2, ymm2, ymm7
        vpmaxub       ymm0, ymm0, ymm6
        vpmaxub       ymm3, ymm3, ymm7
        sub           rax, -128
        add           rsi, 2
        jne           .LBB1_7

        test          r8b, 1
        je            .LBB1_10
.LBB1_9:                                      # one leftover 64-byte chunk
        vmovdqu       ymm4, ymmword ptr [rdi + rax]
        vmovdqu       ymm5, ymmword ptr [rdi + rax + 32]
        vpmaxub       ymm3, ymm3, ymm5
        vpmaxub       ymm0, ymm0, ymm4
        vpminub       ymm2, ymm2, ymm5
        vpminub       ymm1, ymm1, ymm4
.LBB1_10:                                     # horizontal reduction
        vpminub       ymm1, ymm1, ymm2
        vpmaxub       ymm0, ymm0, ymm3
        vextracti128  xmm2, ymm0, 1
        vpmaxub       xmm0, xmm0, xmm2
        vpcmpeqd      xmm2, xmm2, xmm2
        vpxor         xmm0, xmm0, xmm2        # complement: max -> min problem
        vpsrlw        xmm2, xmm0, 8
        vpminub       xmm0, xmm0, xmm2
        vphminposuw   xmm0, xmm0
        vmovd         eax, xmm0
        not           al                      # undo the complement -> al = max
        vextracti128  xmm0, ymm1, 1
        vpminub       xmm0, xmm1, xmm0
        vpsrlw        xmm1, xmm0, 8
        vpminub       xmm0, xmm0, xmm1
        vphminposuw   xmm0, xmm0
        vmovd         esi, xmm0               # sil = min
        cmp           r10, r9
        je            .LBB1_12                # no tail elements
        .p2align      4, 0x90
.LBB1_11:                                     # scalar loop over [r10, r9)
        movzx         r8d, byte ptr [rdi + r10]
        cmp           sil, r8b
        movzx         esi, sil
        cmovae        esi, r8d                # min (unsigned)
        cmp           al, r8b
        movzx         eax, al
        cmovbe        eax, r8d                # max (unsigned)
        add           r10, 1
        cmp           r9, r10
        jne           .LBB1_11
.LBB1_12:                                     # store results and return
        mov           byte ptr [rcx], al
        mov           byte ptr [rdx], sil
        mov           rsp, rbp
        pop           rbp
        vzeroupper
        ret
.LBB1_5:                                      # single-chunk set-up
        vpxor         xmm0, xmm0, xmm0
        vpcmpeqd      ymm1, ymm1, ymm1
        xor           eax, eax
        vpcmpeqd      ymm2, ymm2, ymm2
        vpxor         xmm3, xmm3, xmm3
        test          r8b, 1
        jne           .LBB1_9
        jmp           .LBB1_10
.Lfunc_end1:
        .size         uint8_max_min_avx2, .Lfunc_end1-uint8_max_min_avx2
                                              # -- End function

#-----------------------------------------------------------------------
# Constants for int16_max_min_avx2
#   .LCPI2_0 : 16 x 0x8000 (seed for the running maximum, i.e. -32768)
#   .LCPI2_1 : 16 x 0x7fff (seed for the running minimum, i.e. +32767)
#   .LCPI2_2 :  8 x 0x7fff (xor mask used when reducing the maximum)
#   .LCPI2_3 :  8 x 0x8000 (xor mask used when reducing the minimum)
#-----------------------------------------------------------------------
        .section      .rodata.cst32,"aM",@progbits,32
        .p2align      5                       # -- Begin function int16_max_min_avx2
.LCPI2_0:
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
.LCPI2_1:
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .section      .rodata.cst16,"aM",@progbits,16
        .p2align      4
.LCPI2_2:
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
        .short        32767
.LCPI2_3:
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768
        .short        32768

#-----------------------------------------------------------------------
# int16_max_min_avx2
# In:    rdi = int16 elements, esi = count, rdx = &min, rcx = &max
# Out:   word [rdx] = minimum, word [rcx] = maximum
#        (count <= 0: min = 32767, max = -32768)
# Clobb: rax, rsi, r8, r9, r10, ymm0-ymm7, flags
#-----------------------------------------------------------------------
        .text
        .globl        int16_max_min_avx2
        .p2align      4, 0x90
        .type         int16_max_min_avx2,@function
int16_max_min_avx2:                           # @int16_max_min_avx2
        push          rbp
        mov           rbp, rsp
        and           rsp, -8
        test          esi, esi
        jle           .LBB2_1                 # empty input -> store seeds

        mov           r9d, esi                # r9 = count
        cmp           esi, 31
        ja            .LBB2_4                 # >= 32 elements -> vector path

        # Fewer than 32 elements: scalar loop from index 0.
        mov           r8w, -32768             # r8w = running max
        mov           si, 32767               # si  = running min
        xor           r10d, r10d              # r10 = element index
        jmp           .LBB2_11

.LBB2_1:                                      # count <= 0
        mov           si, 32767
        mov           r8w, -32768
        jmp           .LBB2_12

.LBB2_4:                                      # vector path set-up
        mov           r10d, r9d
        and           r10d, -32               # r10 = count rounded down to 32
        lea           rax, [r10 - 32]
        mov           r8, rax
        shr           r8, 5
        add           r8, 1                   # r8 = number of 32-element chunks
        test          rax, rax
        je            .LBB2_5                 # exactly one chunk

        mov           rsi, r8
        and           rsi, -2
        neg           rsi                     # rsi = -(chunks & ~1)
        vmovdqa       ymm1, ymmword ptr [rip + .LCPI2_0]    # max accumulator A
        vmovdqa       ymm0, ymmword ptr [rip + .LCPI2_1]    # min accumulator A
        xor           eax, eax                # rax = element index
        vmovdqa       ymm2, ymm0              # min accumulator B
        vmovdqa       ymm3, ymm1              # max accumulator B
        .p2align      4, 0x90
.LBB2_7:                                      # two chunks (64 elements) per pass
        vmovdqu       ymm4, ymmword ptr [rdi + 2*rax]
        vmovdqu       ymm5, ymmword ptr [rdi + 2*rax + 32]
        vmovdqu       ymm6, ymmword ptr [rdi + 2*rax + 64]
        vmovdqu       ymm7, ymmword ptr [rdi + 2*rax + 96]
        vpminsw       ymm0, ymm0, ymm4
        vpminsw       ymm2, ymm2, ymm5
        vpmaxsw       ymm1, ymm1, ymm4
        vpmaxsw       ymm3, ymm3, ymm5
        vpminsw       ymm0, ymm0, ymm6
        vpminsw       ymm2, ymm2, ymm7
        vpmaxsw       ymm1, ymm1, ymm6
        vpmaxsw       ymm3, ymm3, ymm7
        add           rax, 64
        add           rsi, 2
        jne           .LBB2_7

        test          r8b, 1
        je            .LBB2_10
.LBB2_9:                                      # one leftover 32-element chunk
        vmovdqu       ymm4, ymmword ptr [rdi + 2*rax]
        vmovdqu       ymm5, ymmword ptr [rdi + 2*rax + 32]
        vpmaxsw       ymm3, ymm3, ymm5
        vpmaxsw       ymm1, ymm1, ymm4
        vpminsw       ymm2, ymm2, ymm5
        vpminsw       ymm0, ymm0, ymm4
.LBB2_10:                                     # horizontal reduction
        vpmaxsw       ymm1, ymm1, ymm3
        vextracti128  xmm3, ymm1, 1
        vpmaxsw       xmm1, xmm1, xmm3
        vpxor         xmm1, xmm1, xmmword ptr [rip + .LCPI2_2]
        vpminsw       ymm0, ymm0, ymm2
        vphminposuw   xmm1, xmm1
        vmovd         r8d, xmm1
        xor           r8d, 32767              # undo the mask -> r8w = max
        vextracti128  xmm1, ymm0, 1
        vpminsw       xmm0, xmm0, xmm1
        vpxor         xmm0, xmm0, xmmword ptr [rip + .LCPI2_3]
        vphminposuw   xmm0, xmm0
        vmovd         esi, xmm0
        xor           esi, 32768              # undo the mask -> si = min
        cmp           r10, r9
        je            .LBB2_12                # no tail elements
        .p2align      4, 0x90
.LBB2_11:                                     # scalar loop over [r10, r9)
        movzx         eax, word ptr [rdi + 2*r10]
        cmp           si, ax
        cmovg         esi, eax                # min (signed)
        cmp           r8w, ax
        cmovl         r8d, eax                # max (signed)
        add           r10, 1
        cmp           r9, r10
        jne           .LBB2_11
.LBB2_12:                                     # store results and return
        mov           word ptr [rcx], r8w
        mov           word ptr [rdx], si
        mov           rsp, rbp
        pop           rbp
        vzeroupper
        ret
.LBB2_5:                                      # single-chunk set-up
        vmovdqa       ymm1, ymmword ptr [rip + .LCPI2_0]
        vmovdqa       ymm0, ymmword ptr [rip + .LCPI2_1]
        xor           eax, eax
        vmovdqa       ymm2, ymm0
        vmovdqa       ymm3, ymm1
        test          r8b, 1
        jne           .LBB2_9
        jmp           .LBB2_10
.Lfunc_end2:
        .size         int16_max_min_avx2, .Lfunc_end2-int16_max_min_avx2
                                              # -- End function

#-----------------------------------------------------------------------
# uint16_max_min_avx2
# In:    rdi = uint16 elements, esi = count, rdx = &min, rcx = &max
# Out:   word [rdx] = minimum, word [rcx] = maximum
#        (count <= 0: min = 0xffff, max = 0)
# Clobb: rax, rsi, r8, r9, r10, ymm0-ymm7, flags
# Note:  here r8w carries the minimum and si the maximum, the reverse
#        of int16_max_min_avx2.
#-----------------------------------------------------------------------
        .globl        uint16_max_min_avx2     # -- Begin function uint16_max_min_avx2
        .p2align      4, 0x90
        .type         uint16_max_min_avx2,@function
uint16_max_min_avx2:                          # @uint16_max_min_avx2
        push          rbp
        mov           rbp, rsp
        and           rsp, -8
        test          esi, esi
        jle           .LBB3_1                 # empty input -> store seeds

        mov           r9d, esi                # r9 = count
        cmp           esi, 31
        ja            .LBB3_4                 # >= 32 elements -> vector path

        # Fewer than 32 elements: scalar loop from index 0.
        mov           r8w, -1                 # r8w = running min (0xffff)
        xor           r10d, r10d              # r10 = element index
        xor           esi, esi                # si  = running max (0)
        jmp           .LBB3_11

.LBB3_1:                                      # count <= 0
        mov           r8w, -1
        xor           esi, esi
        jmp           .LBB3_12

.LBB3_4:                                      # vector path set-up
        mov           r10d, r9d
        and           r10d, -32               # r10 = count rounded down to 32
        lea           rax, [r10 - 32]
        mov           r8, rax
        shr           r8, 5
        add           r8, 1                   # r8 = number of 32-element chunks
        test          rax, rax
        je            .LBB3_5                 # exactly one chunk

        mov           rsi, r8
        and           rsi, -2
        neg           rsi                     # rsi = -(chunks & ~1)
        vpxor         xmm0, xmm0, xmm0        # max accumulator A = 0
        vpcmpeqd      ymm1, ymm1, ymm1        # min accumulator A = all ones
        xor           eax, eax                # rax = element index
        vpcmpeqd      ymm2, ymm2, ymm2        # min accumulator B
        vpxor         xmm3, xmm3, xmm3        # max accumulator B
        .p2align      4, 0x90
.LBB3_7:                                      # two chunks (64 elements) per pass
        vmovdqu       ymm4, ymmword ptr [rdi + 2*rax]
        vmovdqu       ymm5, ymmword ptr [rdi + 2*rax + 32]
        vmovdqu       ymm6, ymmword ptr [rdi + 2*rax + 64]
        vmovdqu       ymm7, ymmword ptr [rdi + 2*rax + 96]
        vpminuw       ymm1, ymm1, ymm4
        vpminuw       ymm2, ymm2, ymm5
        vpmaxuw       ymm0, ymm0, ymm4
        vpmaxuw       ymm3, ymm3, ymm5
        vpminuw       ymm1, ymm1, ymm6
        vpminuw       ymm2, ymm2, ymm7
        vpmaxuw       ymm0, ymm0, ymm6
        vpmaxuw       ymm3, ymm3, ymm7
        add           rax, 64
        add           rsi, 2
        jne           .LBB3_7

        test          r8b, 1
        je            .LBB3_10
.LBB3_9:                                      # one leftover 32-element chunk
        vmovdqu       ymm4, ymmword ptr [rdi + 2*rax]
        vmovdqu       ymm5, ymmword ptr [rdi + 2*rax + 32]
        vpmaxuw       ymm3, ymm3, ymm5
        vpmaxuw       ymm0, ymm0, ymm4
        vpminuw       ymm2, ymm2, ymm5
        vpminuw       ymm1, ymm1, ymm4
.LBB3_10:                                     # horizontal reduction
        vpminuw       ymm1, ymm1, ymm2
        vpmaxuw       ymm0, ymm0, ymm3
        vextracti128  xmm2, ymm0, 1
        vpmaxuw       xmm0, xmm0, xmm2
        vpcmpeqd      xmm2, xmm2, xmm2
        vpxor         xmm0, xmm0, xmm2        # complement: max -> min problem
        vphminposuw   xmm0, xmm0
        vmovd         esi, xmm0
        not           esi                     # undo the complement -> si = max
        vextracti128  xmm0, ymm1, 1
        vpminuw       xmm0, xmm1, xmm0
        vphminposuw   xmm0, xmm0
        vmovd         r8d, xmm0               # r8w = min
        cmp           r10, r9
        je            .LBB3_12                # no tail elements
        .p2align      4, 0x90
.LBB3_11:                                     # scalar loop over [r10, r9)
        movzx         eax, word ptr [rdi + 2*r10]
        cmp           r8w, ax
        cmovae        r8d, eax                # min (unsigned)
        cmp           si, ax
        cmovbe        esi, eax                # max (unsigned)
        add           r10, 1
        cmp           r9, r10
        jne           .LBB3_11
.LBB3_12:                                     # store results and return
        mov           word ptr [rcx], si
        mov           word ptr [rdx], r8w
        mov           rsp, rbp
        pop           rbp
        vzeroupper
        ret
.LBB3_5:                                      # single-chunk set-up
        vpxor         xmm0, xmm0, xmm0
        vpcmpeqd      ymm1, ymm1, ymm1
        xor           eax, eax
        vpcmpeqd      ymm2, ymm2, ymm2
        vpxor         xmm3, xmm3, xmm3
        test          r8b, 1
        jne           .LBB3_9
        jmp           .LBB3_10
.Lfunc_end3:
        .size         uint16_max_min_avx2, .Lfunc_end3-uint16_max_min_avx2
                                              # -- End function

#-----------------------------------------------------------------------
# Constants for int32_max_min_avx2 (broadcast at run time)
#   .LCPI4_0 : 0x80000000 (seed for the running maximum)
#   .LCPI4_1 : 0x7fffffff (seed for the running minimum)
#-----------------------------------------------------------------------
        .section      .rodata.cst4,"aM",@progbits,4
        .p2align      2                       # -- Begin function int32_max_min_avx2
.LCPI4_0:
        .long         2147483648              # 0x80000000
.LCPI4_1:
        .long         2147483647              # 0x7fffffff

#-----------------------------------------------------------------------
# int32_max_min_avx2
# In:    rdi = int32 elements, esi = count, rdx = &min, rcx = &max
# Out:   dword [rdx] = minimum, dword [rcx] = maximum
#        (count <= 0: min = 0x7fffffff, max = 0x80000000)
# Clobb: rax, rsi, r8, r9, r10, ymm0-ymm11, flags
#-----------------------------------------------------------------------
        .text
        .globl        int32_max_min_avx2
        .p2align      4, 0x90
        .type         int32_max_min_avx2,@function
int32_max_min_avx2:                           # @int32_max_min_avx2
        push          rbp
        mov           rbp, rsp
        and           rsp, -8
        test          esi, esi
        jle           .LBB4_1                 # empty input -> store seeds

        mov           r8d, esi                # r8 = count
        cmp           esi, 31
        ja            .LBB4_4                 # >= 32 elements -> vector path

        # Fewer than 32 elements: scalar loop from index 0.
        mov           r10d, -2147483648       # r10d = running max
        mov           eax, 2147483647         # eax  = running min
        xor           r9d, r9d                # r9   = element index
        jmp           .LBB4_7

.LBB4_1:                                      # count <= 0
        mov           eax, 2147483647
        mov           esi, -2147483648
        jmp           .LBB4_8

.LBB4_4:                                      # vector path set-up
        mov           r9d, r8d
        vpbroadcastd  ymm4, dword ptr [rip + .LCPI4_0]      # max accumulators: ymm4-ymm7
        and           r9d, -32                # r9 = count rounded down to 32
        vpbroadcastd  ymm0, dword ptr [rip + .LCPI4_1]      # min accumulators: ymm0-ymm3
        xor           eax, eax                # rax = element index
        vmovdqa       ymm1, ymm0
        vmovdqa       ymm2, ymm0
        vmovdqa       ymm3, ymm0
        vmovdqa       ymm5, ymm4
        vmovdqa       ymm6, ymm4
        vmovdqa       ymm7, ymm4
        .p2align      4, 0x90
.LBB4_5:                                      # 32 elements (128 bytes) per pass
        vmovdqu       ymm8, ymmword ptr [rdi + 4*rax]
        vmovdqu       ymm9, ymmword ptr [rdi + 4*rax + 32]
        vmovdqu       ymm10, ymmword ptr [rdi + 4*rax + 64]
        vmovdqu       ymm11, ymmword ptr [rdi + 4*rax + 96]
        vpminsd       ymm0, ymm0, ymm8
        vpminsd       ymm1, ymm1, ymm9
        vpminsd       ymm2, ymm2, ymm10
        vpminsd       ymm3, ymm3, ymm11
        vpmaxsd       ymm4, ymm4, ymm8
        vpmaxsd       ymm5, ymm5, ymm9
        vpmaxsd       ymm6, ymm6, ymm10
        vpmaxsd       ymm7, ymm7, ymm11
        add           rax, 32
        cmp           r9, rax
        jne           .LBB4_5

        # Horizontal reduction of the maximum.
        vpmaxsd       ymm4, ymm4, ymm5
        vpmaxsd       ymm4, ymm4, ymm6
        vpmaxsd       ymm4, ymm4, ymm7
        vextracti128  xmm5, ymm4, 1
        vpmaxsd       xmm4, xmm4, xmm5
        vpshufd       xmm5, xmm4, 78          # xmm5 = xmm4[2,3,0,1]
        vpmaxsd       xmm4, xmm4, xmm5
        vpshufd       xmm5, xmm4, 229         # xmm5 = xmm4[1,1,2,3]
        vpmaxsd       xmm4, xmm4, xmm5
        vmovd         r10d, xmm4              # r10d = max
        # Horizontal reduction of the minimum.
        vpminsd       ymm0, ymm0, ymm1
        vpminsd       ymm0, ymm0, ymm2
        vpminsd       ymm0, ymm0, ymm3
        vextracti128  xmm1, ymm0, 1
        vpminsd       xmm0, xmm0, xmm1
        vpshufd       xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
        vpminsd       xmm0, xmm0, xmm1
        vpshufd       xmm1, xmm0, 229         # xmm1 = xmm0[1,1,2,3]
        vpminsd       xmm0, xmm0, xmm1
        vmovd         eax, xmm0               # eax = min
        mov           esi, r10d               # esi = max (value stored if no tail)
        cmp           r9, r8
        je            .LBB4_8                 # no tail elements
        .p2align      4, 0x90
.LBB4_7:                                      # scalar loop over [r9, r8)
        mov           esi, dword ptr [rdi + 4*r9]
        cmp           eax, esi
        cmovg         eax, esi                # min (signed)
        cmp           r10d, esi
        cmovge        esi, r10d               # esi = max(r10d, element)
        add           r9, 1
        mov           r10d, esi               # carry the max into the next pass
        cmp           r8, r9
        jne           .LBB4_7
.LBB4_8:                                      # store results and return
        mov           dword ptr [rcx], esi
        mov           dword ptr [rdx], eax
        mov           rsp, rbp
        pop           rbp
        vzeroupper
        ret
.Lfunc_end4:
        .size         int32_max_min_avx2, .Lfunc_end4-int32_max_min_avx2
                                              # -- End function

#-----------------------------------------------------------------------
# uint32_max_min_avx2
# In:    rdi = uint32 elements, esi = count, rdx = &min, rcx = &max
# Out:   dword [rdx] = minimum, dword [rcx] = maximum
#        (count <= 0: min = 0xffffffff, max = 0)
# Clobb: rax, rsi, r8, r9, r10, ymm0-ymm11, flags
#-----------------------------------------------------------------------
        .globl        uint32_max_min_avx2     # -- Begin function uint32_max_min_avx2
        .p2align      4, 0x90
        .type         uint32_max_min_avx2,@function
uint32_max_min_avx2:                          # @uint32_max_min_avx2
        push          rbp
        mov           rbp, rsp
        and           rsp, -8
        test          esi, esi
        jle           .LBB5_1                 # empty input -> store seeds

        mov           r8d, esi                # r8 = count
        cmp           esi, 31
        ja            .LBB5_4                 # >= 32 elements -> vector path

        # Fewer than 32 elements: scalar loop from index 0.
        xor           r9d, r9d                # r9   = element index
        mov           eax, -1                 # eax  = running min (0xffffffff)
        xor           r10d, r10d              # r10d = running max (0)
        jmp           .LBB5_7

.LBB5_1:                                      # count <= 0
        mov           eax, -1
        xor           esi, esi
        jmp           .LBB5_8

.LBB5_4:                                      # vector path set-up
        mov           r9d, r8d
        and           r9d, -32                # r9 = count rounded down to 32
        vpxor         xmm4, xmm4, xmm4        # max accumulators: ymm4-ymm7 = 0
        vpcmpeqd      ymm0, ymm0, ymm0        # min accumulators: ymm0-ymm3 = all ones
        xor           eax, eax                # rax = element index
        vpcmpeqd      ymm1, ymm1, ymm1
        vpcmpeqd      ymm2, ymm2, ymm2
        vpcmpeqd      ymm3, ymm3, ymm3
        vpxor         xmm5, xmm5, xmm5
        vpxor         xmm6, xmm6, xmm6
        vpxor         xmm7, xmm7, xmm7
        .p2align      4, 0x90
.LBB5_5:                                      # 32 elements (128 bytes) per pass
        vmovdqu       ymm8, ymmword ptr [rdi + 4*rax]
        vmovdqu       ymm9, ymmword ptr [rdi + 4*rax + 32]
        vmovdqu       ymm10, ymmword ptr [rdi + 4*rax + 64]
        vmovdqu       ymm11, ymmword ptr [rdi + 4*rax + 96]
        vpminud       ymm0, ymm0, ymm8
        vpminud       ymm1, ymm1, ymm9
        vpminud       ymm2, ymm2, ymm10
        vpminud       ymm3, ymm3, ymm11
        vpmaxud       ymm4, ymm4, ymm8
        vpmaxud       ymm5, ymm5, ymm9
        vpmaxud       ymm6, ymm6, ymm10
        vpmaxud       ymm7, ymm7, ymm11
        add           rax, 32
        cmp           r9, rax
        jne           .LBB5_5

        # Horizontal reduction of the maximum.
        vpmaxud       ymm4, ymm4, ymm5
        vpmaxud       ymm4, ymm4, ymm6
        vpmaxud       ymm4, ymm4, ymm7
        vextracti128  xmm5, ymm4, 1
        vpmaxud       xmm4, xmm4, xmm5
        vpshufd       xmm5, xmm4, 78          # xmm5 = xmm4[2,3,0,1]
        vpmaxud       xmm4, xmm4, xmm5
        vpshufd       xmm5, xmm4, 229         # xmm5 = xmm4[1,1,2,3]
        vpmaxud       xmm4, xmm4, xmm5
        vmovd         r10d, xmm4              # r10d = max
        # Horizontal reduction of the minimum.
        vpminud       ymm0, ymm0, ymm1
        vpminud       ymm0, ymm0, ymm2
        vpminud       ymm0, ymm0, ymm3
        vextracti128  xmm1, ymm0, 1
        vpminud       xmm0, xmm0, xmm1
        vpshufd       xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
        vpminud       xmm0, xmm0, xmm1
        vpshufd       xmm1, xmm0, 229         # xmm1 = xmm0[1,1,2,3]
        vpminud       xmm0, xmm0, xmm1
        vmovd         eax, xmm0               # eax = min
        mov           esi, r10d               # esi = max (value stored if no tail)
        cmp           r9, r8
        je            .LBB5_8                 # no tail elements
        .p2align      4, 0x90
.LBB5_7:                                      # scalar loop over [r9, r8)
        mov           esi, dword ptr [rdi + 4*r9]
        cmp           eax, esi
        cmovae        eax, esi                # min (unsigned)
        cmp           r10d, esi
        cmova         esi, r10d               # esi = max(r10d, element)
        add           r9, 1
        mov           r10d, esi               # carry the max into the next pass
        cmp           r8, r9
        jne           .LBB5_7
.LBB5_8:                                      # store results and return
        mov           dword ptr [rcx], esi
        mov           dword ptr [rdx], eax
        mov           rsp, rbp
        pop           rbp
        vzeroupper
        ret
.Lfunc_end5:
        .size         uint32_max_min_avx2, .Lfunc_end5-uint32_max_min_avx2
                                              # -- End function

#-----------------------------------------------------------------------
# Constants for int64_max_min_avx2 (broadcast at run time)
#   .LCPI6_0 : 0x8000000000000000 (seed for the running maximum)
#   .LCPI6_1 : 0x7fffffffffffffff (seed for the running minimum)
#-----------------------------------------------------------------------
        .section      .rodata.cst8,"aM",@progbits,8
        .p2align      3                       # -- Begin function int64_max_min_avx2
.LCPI6_0:
        .quad         -9223372036854775808    # 0x8000000000000000
.LCPI6_1:
        .quad         9223372036854775807     # 0x7fffffffffffffff

#-----------------------------------------------------------------------
# int64_max_min_avx2
# In:    rdi = int64 elements, esi = count, rdx = &min, rcx = &max
# Out:   qword [rdx] = minimum, qword [rcx] = maximum
#        (count <= 0: min = 0x7fffffffffffffff, max = 0x8000000000000000)
# Clobb: rax, rsi, r8, r9, r10, ymm0-ymm12, flags
# Note:  each 64-bit min/max step is a signed compare (vpcmpgtq)
#        followed by a blend on the compare mask (vblendvpd).
#-----------------------------------------------------------------------
        .text
        .globl        int64_max_min_avx2
        .p2align      4, 0x90
        .type         int64_max_min_avx2,@function
int64_max_min_avx2:                           # @int64_max_min_avx2
        push          rbp
        mov           rbp, rsp
        and           rsp, -8
        movabs        rax, 9223372036854775807        # rax = running min seed
        test          esi, esi
        jle           .LBB6_1                 # empty input -> store seeds

        mov           r8d, esi                # r8 = count
        cmp           esi, 15
        ja            .LBB6_4                 # >= 16 elements -> vector path

        # Fewer than 16 elements: scalar loop from index 0.
        lea           r10, [rax + 1]          # r10 = 0x8000000000000000 (max seed)
        xor           r9d, r9d                # r9  = element index
        jmp           .LBB6_7

.LBB6_1:                                      # count <= 0
        lea           rsi, [rax + 1]          # rsi = 0x8000000000000000
        jmp           .LBB6_8

.LBB6_4:                                      # vector path set-up
        mov           r9d, r8d
        vpbroadcastq  ymm4, qword ptr [rip + .LCPI6_0]      # max accumulators: ymm4-ymm7
        and           r9d, -16                # r9 = count rounded down to 16
        vpbroadcastq  ymm0, qword ptr [rip + .LCPI6_1]      # min accumulators: ymm0-ymm3
        xor           eax, eax                # rax = element index
        vmovdqa       ymm3, ymm0
        vmovdqa       ymm2, ymm0
        vmovdqa       ymm1, ymm0
        vmovdqa       ymm7, ymm4
        vmovdqa       ymm6, ymm4
        vmovdqa       ymm5, ymm4
        .p2align      4, 0x90
.LBB6_5:                                      # 16 elements (128 bytes) per pass
        # Minimum update for the four loaded vectors.
        vmovdqu       ymm8, ymmword ptr [rdi + 8*rax]
        vpcmpgtq      ymm9, ymm8, ymm0
        vblendvpd     ymm0, ymm8, ymm0, ymm9
        vmovdqu       ymm9, ymmword ptr [rdi + 8*rax + 32]
        vpcmpgtq      ymm10, ymm9, ymm3
        vblendvpd     ymm3, ymm9, ymm3, ymm10
        vmovdqu       ymm10, ymmword ptr [rdi + 8*rax + 64]
        vpcmpgtq      ymm11, ymm10, ymm2
        vblendvpd     ymm2, ymm10, ymm2, ymm11
        vmovdqu       ymm11, ymmword ptr [rdi + 8*rax + 96]
        vpcmpgtq      ymm12, ymm11, ymm1
        vblendvpd     ymm1, ymm11, ymm1, ymm12
        # Maximum update for the same four vectors.
        vpcmpgtq      ymm12, ymm4, ymm8
        vblendvpd     ymm4, ymm8, ymm4, ymm12
        vpcmpgtq      ymm8, ymm7, ymm9
        vblendvpd     ymm7, ymm9, ymm7, ymm8
        vpcmpgtq      ymm8, ymm6, ymm10
        vblendvpd     ymm6, ymm10, ymm6, ymm8
        vpcmpgtq      ymm8, ymm5, ymm11
        vblendvpd     ymm5, ymm11, ymm5, ymm8
        add           rax, 16
        cmp           r9, rax
        jne           .LBB6_5

        # Horizontal reduction of the maximum.
        vpcmpgtq      ymm8, ymm4, ymm7
        vblendvpd     ymm4, ymm7, ymm4, ymm8
        vpcmpgtq      ymm7, ymm4, ymm6
        vblendvpd     ymm4, ymm6, ymm4, ymm7
        vpcmpgtq      ymm6, ymm4, ymm5
        vblendvpd     ymm4, ymm5, ymm4, ymm6
        vextractf128  xmm5, ymm4, 1
        vpcmpgtq      xmm6, xmm4, xmm5
        vblendvpd     xmm4, xmm5, xmm4, xmm6
        vpermilps     xmm5, xmm4, 78          # xmm5 = xmm4[2,3,0,1]
        vpcmpgtq      xmm6, xmm4, xmm5
        vblendvpd     xmm4, xmm5, xmm4, xmm6
        vmovq         r10, xmm4               # r10 = max
        # Horizontal reduction of the minimum.
        vpcmpgtq      ymm4, ymm3, ymm0
        vblendvpd     ymm0, ymm3, ymm0, ymm4
        vpcmpgtq      ymm3, ymm2, ymm0
        vblendvpd     ymm0, ymm2, ymm0, ymm3
        vpcmpgtq      ymm2, ymm1, ymm0
        vblendvpd     ymm0, ymm1, ymm0, ymm2
        vextractf128  xmm1, ymm0, 1
        vpcmpgtq      xmm2, xmm1, xmm0
        vblendvpd     xmm0, xmm1, xmm0, xmm2
        vpermilps     xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
        vpcmpgtq      xmm2, xmm1, xmm0
        vblendvpd     xmm0, xmm1, xmm0, xmm2
        vmovq         rax, xmm0               # rax = min
        mov           rsi, r10                # rsi = max (value stored if no tail)
        cmp           r9, r8
        je            .LBB6_8                 # no tail elements
        .p2align      4, 0x90
.LBB6_7:                                      # scalar loop over [r9, r8)
        mov           rsi, qword ptr [rdi + 8*r9]
        cmp           rax, rsi
        cmovg         rax, rsi                # min (signed)
        cmp           r10, rsi
        cmovge        rsi, r10                # rsi = max(r10, element)
        add           r9, 1
        mov           r10, rsi                # carry the max into the next pass
        cmp           r8, r9
        jne           .LBB6_7
.LBB6_8:                                      # store results and return
        mov           qword ptr [rcx], rsi
        mov           qword ptr [rdx], rax
        mov           rsp, rbp
        pop           rbp
        vzeroupper
        ret
.Lfunc_end6:
        .size         int64_max_min_avx2, .Lfunc_end6-int64_max_min_avx2
                                              # -- End function

#-----------------------------------------------------------------------
# Constant for uint64_max_min_avx2 (broadcast at run time)
#   .LCPI7_0 : 0x8000000000000000, xor-ed into both operands so the
#              signed vpcmpgtq compare orders them as unsigned values
#-----------------------------------------------------------------------
        .section      .rodata.cst8,"aM",@progbits,8
        .p2align      3                       # -- Begin function uint64_max_min_avx2
.LCPI7_0:
        .quad         -9223372036854775808    # 0x8000000000000000

#-----------------------------------------------------------------------
# uint64_max_min_avx2
# In:    rdi = uint64 elements, esi = count, rdx = &min, rcx = &max
# Out:   qword [rdx] = minimum, qword [rcx] = maximum
#        (count <= 0: min = 0xffffffffffffffff, max = 0)
# Clobb: rax, rsi, r8, r9, r10, ymm0-ymm11, flags
# Regs:  ymm0 = sign-bit mask; ymm1,ymm4,ymm3,ymm2 = min accumulators;
#        ymm5,ymm8,ymm7,ymm6 = max accumulators
#-----------------------------------------------------------------------
        .text
        .globl        uint64_max_min_avx2
        .p2align      4, 0x90
        .type         uint64_max_min_avx2,@function
uint64_max_min_avx2:                          # @uint64_max_min_avx2
        push          rbp
        mov           rbp, rsp
        and           rsp, -8
        test          esi, esi
        jle           .LBB7_1                 # empty input -> store seeds

        mov           r8d, esi                # r8 = count
        cmp           esi, 15
        ja            .LBB7_4                 # >= 16 elements -> vector path

        # Fewer than 16 elements: scalar loop from index 0.
        mov           rax, -1                 # rax = running min (all ones)
        xor           r9d, r9d                # r9  = element index
        xor           r10d, r10d              # r10 = running max (0)
        jmp           .LBB7_7

.LBB7_1:                                      # count <= 0
        mov           rax, -1
        xor           esi, esi
        jmp           .LBB7_8

.LBB7_4:                                      # vector path set-up
        mov           r9d, r8d
        and           r9d, -16                # r9 = count rounded down to 16
        vpxor         xmm5, xmm5, xmm5
        vpcmpeqd      ymm1, ymm1, ymm1
        xor           eax, eax                # rax = element index
        vpbroadcastq  ymm0, qword ptr [rip + .LCPI7_0]      # ymm0 = 4 x sign bit
        vpcmpeqd      ymm4, ymm4, ymm4
        vpcmpeqd      ymm3, ymm3, ymm3
        vpcmpeqd      ymm2, ymm2, ymm2
        vpxor         xmm8, xmm8, xmm8
        vpxor         xmm7, xmm7, xmm7
        vpxor         xmm6, xmm6, xmm6
        .p2align      4, 0x90
.LBB7_5:                                      # 16 elements (128 bytes) per pass
        # Vector 0: min into ymm1, max into ymm5.
        vmovdqu       ymm9, ymmword ptr [rdi + 8*rax]
        vpxor         ymm10, ymm1, ymm0
        vpxor         ymm11, ymm9, ymm0
        vpcmpgtq      ymm10, ymm11, ymm10
        vblendvpd     ymm1, ymm9, ymm1, ymm10
        vpxor         ymm10, ymm5, ymm0
        vpcmpgtq      ymm10, ymm10, ymm11
        vblendvpd     ymm5, ymm9, ymm5, ymm10
        # Vector 1: min into ymm4, max into ymm8.
        vmovdqu       ymm9, ymmword ptr [rdi + 8*rax + 32]
        vpxor         ymm10, ymm4, ymm0
        vpxor         ymm11, ymm9, ymm0
        vpcmpgtq      ymm10, ymm11, ymm10
        vblendvpd     ymm4, ymm9, ymm4, ymm10
        vpxor         ymm10, ymm8, ymm0
        vpcmpgtq      ymm10, ymm10, ymm11
        vmovdqu       ymm11, ymmword ptr [rdi + 8*rax + 64]
        vblendvpd     ymm8, ymm9, ymm8, ymm10
        # Vector 2 (already loaded into ymm11): min into ymm3, max into ymm7.
        vpxor         ymm9, ymm3, ymm0
        vpxor         ymm10, ymm11, ymm0
        vpcmpgtq      ymm9, ymm10, ymm9
        vblendvpd     ymm3, ymm11, ymm3, ymm9
        vpxor         ymm9, ymm7, ymm0
        vpcmpgtq      ymm9, ymm9, ymm10
        vblendvpd     ymm7, ymm11, ymm7, ymm9
        # Vector 3: min into ymm2, max into ymm6.
        vmovdqu       ymm9, ymmword ptr [rdi + 8*rax + 96]
        vpxor         ymm10, ymm2, ymm0
        vpxor         ymm11, ymm9, ymm0
        vpcmpgtq      ymm10, ymm11, ymm10
        vblendvpd     ymm2, ymm9, ymm2, ymm10
        vpxor         ymm10, ymm6, ymm0
        vpcmpgtq      ymm10, ymm10, ymm11
        vblendvpd     ymm6, ymm9, ymm6, ymm10
        add           rax, 16
        cmp           r9, rax
        jne           .LBB7_5

        # Horizontal reduction of the maximum (ymm5, ymm8, ymm7, ymm6).
        vpxor         ymm9, ymm8, ymm0
        vpxor         ymm10, ymm5, ymm0
        vpcmpgtq      ymm9, ymm10, ymm9
        vblendvpd     ymm5, ymm8, ymm5, ymm9
        vxorpd        ymm8, ymm5, ymm0
        vpxor         ymm9, ymm7, ymm0
        vpcmpgtq      ymm8, ymm8, ymm9
        vblendvpd     ymm5, ymm7, ymm5, ymm8
        vxorpd        ymm7, ymm5, ymm0
        vpxor         ymm8, ymm6, ymm0
        vpcmpgtq      ymm7, ymm7, ymm8
        vblendvpd     ymm5, ymm6, ymm5, ymm7
        vextractf128  xmm6, ymm5, 1
        vxorpd        xmm8, xmm6, xmm0
        vxorpd        xmm7, xmm5, xmm0
        vpcmpgtq      xmm7, xmm7, xmm8
        vblendvpd     xmm5, xmm6, xmm5, xmm7
        vpermilps     xmm6, xmm5, 78          # xmm6 = xmm5[2,3,0,1]
        vxorpd        xmm8, xmm5, xmm0
        vxorpd        xmm7, xmm6, xmm0
        vpcmpgtq      xmm7, xmm8, xmm7
        vblendvpd     xmm5, xmm6, xmm5, xmm7
        # Horizontal reduction of the minimum (ymm1, ymm4, ymm3, ymm2).
        vpxor         ymm6, ymm1, ymm0
        vpxor         ymm7, ymm4, ymm0
        vpcmpgtq      ymm6, ymm7, ymm6
        vblendvpd     ymm1, ymm4, ymm1, ymm6
        vxorpd        ymm4, ymm1, ymm0
        vpxor         ymm6, ymm3, ymm0
        vpcmpgtq      ymm4, ymm6, ymm4
        vblendvpd     ymm1, ymm3, ymm1, ymm4
        vmovq         r10, xmm5               # r10 = max
        vxorpd        ymm3, ymm1, ymm0
        vpxor         ymm4, ymm2, ymm0
        vpcmpgtq      ymm3, ymm4, ymm3
        vblendvpd     ymm1, ymm2, ymm1, ymm3
        vextractf128  xmm2, ymm1, 1
        vxorpd        xmm3, xmm1, xmm0
        vxorpd        xmm4, xmm2, xmm0
        vpcmpgtq      xmm3, xmm4, xmm3
        vblendvpd     xmm1, xmm2, xmm1, xmm3
        vpermilps     xmm2, xmm1, 78          # xmm2 = xmm1[2,3,0,1]
        vxorpd        xmm3, xmm1, xmm0
        vxorpd        xmm0, xmm2, xmm0
        vpcmpgtq      xmm0, xmm0, xmm3
        vblendvpd     xmm0, xmm2, xmm1, xmm0
        vmovq         rax, xmm0               # rax = min
        mov           rsi, r10                # rsi = max (value stored if no tail)
        cmp           r9, r8
        je            .LBB7_8                 # no tail elements
        .p2align      4, 0x90
.LBB7_7:                                      # scalar loop over [r9, r8)
        mov           rsi, qword ptr [rdi + 8*r9]
        cmp           rax, rsi
        cmovae        rax, rsi                # min (unsigned)
        cmp           r10, rsi
        cmova         rsi, r10                # rsi = max(r10, element)
        add           r9, 1
        mov           r10, rsi                # carry the max into the next pass
        cmp           r8, r9
        jne           .LBB7_7
.LBB7_8:                                      # store results and return
        mov           qword ptr [rcx], rsi
        mov           qword ptr [rdx], rax
        mov           rsp, rbp
        pop           rbp
        vzeroupper
        ret
.Lfunc_end7:
        .size         uint64_max_min_avx2, .Lfunc_end7-uint64_max_min_avx2
                                              # -- End function
        .ident        "Debian clang version 11.0.1-2"
        .section      ".note.GNU-stack","",@progbits
        .addrsig