github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/utils/_lib/min_max_avx2.s

        .text
        .intel_syntax noprefix
        .file   "min_max.c"
        .section        .rodata.cst4,"aM",@progbits,4
        .p2align        2                               # -- Begin function int32_max_min_avx2
.LCPI0_0:
        .long   2147483648                      # 0x80000000
.LCPI0_1:
        .long   2147483647                      # 0x7fffffff
        .text
# int32_max_min_avx2: minimum and maximum of an int32 array. Judging from the
# loads and the final stores, the SysV arguments are rdi = values, esi = length,
# rdx = min output, rcx = max output. The vector loop handles 32 elements per
# iteration in four pairs of ymm min/max accumulators; a scalar loop finishes
# the remainder.
        .globl  int32_max_min_avx2
        .p2align        4, 0x90
        .type   int32_max_min_avx2,@function
int32_max_min_avx2:                     # @int32_max_min_avx2
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        test    esi, esi
        jle     .LBB0_1
# %bb.2:
        mov     r8d, esi
        cmp     esi, 31
        ja      .LBB0_4
# %bb.3:
        mov     r10d, -2147483648
        mov     eax, 2147483647
        xor     r9d, r9d
        jmp     .LBB0_7
.LBB0_1:
        mov     eax, 2147483647
        mov     esi, -2147483648
        jmp     .LBB0_8
.LBB0_4:
        mov     r9d, r8d
        vpbroadcastd    ymm4, dword ptr [rip + .LCPI0_0] # ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
        and     r9d, -32
        vpbroadcastd    ymm0, dword ptr [rip + .LCPI0_1] # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
        xor     eax, eax
        vmovdqa ymm1, ymm0
        vmovdqa ymm2, ymm0
        vmovdqa ymm3, ymm0
        vmovdqa ymm5, ymm4
        vmovdqa ymm6, ymm4
        vmovdqa ymm7, ymm4
        .p2align        4, 0x90
.LBB0_5:                                # =>This Inner Loop Header: Depth=1
        vmovdqu ymm8, ymmword ptr [rdi + 4*rax]
        vmovdqu ymm9, ymmword ptr [rdi + 4*rax + 32]
        vmovdqu ymm10, ymmword ptr [rdi + 4*rax + 64]
        vmovdqu ymm11, ymmword ptr [rdi + 4*rax + 96]
        vpminsd ymm0, ymm0, ymm8
        vpminsd ymm1, ymm1, ymm9
        vpminsd ymm2, ymm2, ymm10
        vpminsd ymm3, ymm3, ymm11
        vpmaxsd ymm4, ymm4, ymm8
        vpmaxsd ymm5, ymm5, ymm9
        vpmaxsd ymm6, ymm6, ymm10
        vpmaxsd ymm7, ymm7, ymm11
        add     rax, 32
        cmp     r9, rax
        jne     .LBB0_5
# %bb.6:
        vpmaxsd ymm4, ymm4, ymm5
        vpmaxsd ymm4, ymm4, ymm6
        vpmaxsd ymm4, ymm4, ymm7
        vextracti128    xmm5, ymm4, 1
        vpmaxsd xmm4, xmm4, xmm5
        vpshufd xmm5, xmm4, 78          # xmm5 = xmm4[2,3,0,1]
        vpmaxsd xmm4, xmm4, xmm5
        vpshufd xmm5, xmm4, 229         # xmm5 = xmm4[1,1,2,3]
        vpmaxsd xmm4, xmm4, xmm5
        vmovd   r10d, xmm4
        vpminsd ymm0, ymm0, ymm1
        vpminsd ymm0, ymm0, ymm2
        vpminsd ymm0, ymm0, ymm3
        vextracti128    xmm1, ymm0, 1
        vpminsd xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
        vpminsd xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, 229         # xmm1 = xmm0[1,1,2,3]
        vpminsd xmm0, xmm0, xmm1
        vmovd   eax, xmm0
        mov     esi, r10d
        cmp     r9, r8
        je      .LBB0_8
        .p2align        4, 0x90
.LBB0_7:                                # =>This Inner Loop Header: Depth=1
        mov     esi, dword ptr [rdi + 4*r9]
        cmp     eax, esi
        cmovg   eax, esi
        cmp     r10d, esi
        cmovge  esi, r10d
        add     r9, 1
        mov     r10d, esi
        cmp     r8, r9
        jne     .LBB0_7
.LBB0_8:
        mov     dword ptr [rcx], esi
        mov     dword ptr [rdx], eax
        mov     rsp, rbp
        pop     rbp
        vzeroupper
        ret
.Lfunc_end0:
        .size   int32_max_min_avx2, .Lfunc_end0-int32_max_min_avx2
                                        # -- End function
# uint32_max_min_avx2: same reduction for uint32 values using the unsigned
# vpminud/vpmaxud instructions; the min accumulators start at all-ones
# (UINT32_MAX) and the max accumulators start at zero.
        .globl  uint32_max_min_avx2     # -- Begin function uint32_max_min_avx2
        .p2align        4, 0x90
        .type   uint32_max_min_avx2,@function
uint32_max_min_avx2:                    # @uint32_max_min_avx2
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        test    esi, esi
        jle     .LBB1_1
# %bb.2:
        mov     r8d, esi
        cmp     esi, 31
        ja      .LBB1_4
# %bb.3:
        xor     r9d, r9d
        mov     eax, -1
        xor     r10d, r10d
        jmp     .LBB1_7
.LBB1_1:
        mov     eax, -1
        xor     esi, esi
        jmp     .LBB1_8
.LBB1_4:
        mov     r9d, r8d
        and     r9d, -32
        vpxor   xmm4, xmm4, xmm4
        vpcmpeqd        ymm0, ymm0, ymm0
        xor     eax, eax
        vpcmpeqd        ymm1, ymm1, ymm1
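        # The vpcmpeqd reg,reg,reg idiom around here sets every bit of a register,
        # seeding the unsigned-min accumulators (ymm0-ymm3) with UINT32_MAX, while
        # the vpxor zeroing seeds the unsigned-max accumulators (ymm4-ymm7) with 0.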
        vpcmpeqd        ymm2, ymm2, ymm2
        vpcmpeqd        ymm3, ymm3, ymm3
        vpxor   xmm5, xmm5, xmm5
        vpxor   xmm6, xmm6, xmm6
        vpxor   xmm7, xmm7, xmm7
        .p2align        4, 0x90
.LBB1_5:                                # =>This Inner Loop Header: Depth=1
        vmovdqu ymm8, ymmword ptr [rdi + 4*rax]
        vmovdqu ymm9, ymmword ptr [rdi + 4*rax + 32]
        vmovdqu ymm10, ymmword ptr [rdi + 4*rax + 64]
        vmovdqu ymm11, ymmword ptr [rdi + 4*rax + 96]
        vpminud ymm0, ymm0, ymm8
        vpminud ymm1, ymm1, ymm9
        vpminud ymm2, ymm2, ymm10
        vpminud ymm3, ymm3, ymm11
        vpmaxud ymm4, ymm4, ymm8
        vpmaxud ymm5, ymm5, ymm9
        vpmaxud ymm6, ymm6, ymm10
        vpmaxud ymm7, ymm7, ymm11
        add     rax, 32
        cmp     r9, rax
        jne     .LBB1_5
# %bb.6:
        vpmaxud ymm4, ymm4, ymm5
        vpmaxud ymm4, ymm4, ymm6
        vpmaxud ymm4, ymm4, ymm7
        vextracti128    xmm5, ymm4, 1
        vpmaxud xmm4, xmm4, xmm5
        vpshufd xmm5, xmm4, 78          # xmm5 = xmm4[2,3,0,1]
        vpmaxud xmm4, xmm4, xmm5
        vpshufd xmm5, xmm4, 229         # xmm5 = xmm4[1,1,2,3]
        vpmaxud xmm4, xmm4, xmm5
        vmovd   r10d, xmm4
        vpminud ymm0, ymm0, ymm1
        vpminud ymm0, ymm0, ymm2
        vpminud ymm0, ymm0, ymm3
        vextracti128    xmm1, ymm0, 1
        vpminud xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
        vpminud xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, 229         # xmm1 = xmm0[1,1,2,3]
        vpminud xmm0, xmm0, xmm1
        vmovd   eax, xmm0
        mov     esi, r10d
        cmp     r9, r8
        je      .LBB1_8
        .p2align        4, 0x90
.LBB1_7:                                # =>This Inner Loop Header: Depth=1
        mov     esi, dword ptr [rdi + 4*r9]
        cmp     eax, esi
        cmovae  eax, esi
        cmp     r10d, esi
        cmova   esi, r10d
        add     r9, 1
        mov     r10d, esi
        cmp     r8, r9
        jne     .LBB1_7
.LBB1_8:
        mov     dword ptr [rcx], esi
        mov     dword ptr [rdx], eax
        mov     rsp, rbp
        pop     rbp
        vzeroupper
        ret
.Lfunc_end1:
        .size   uint32_max_min_avx2, .Lfunc_end1-uint32_max_min_avx2
                                        # -- End function
        .section        .rodata.cst8,"aM",@progbits,8
        .p2align        3                               # -- Begin function int64_max_min_avx2
.LCPI2_0:
        .quad   -9223372036854775808            # 0x8000000000000000
.LCPI2_1:
        .quad   9223372036854775807             # 0x7fffffffffffffff
        .text
# int64_max_min_avx2: minimum and maximum of an int64 array. AVX2 has no packed
# 64-bit min/max instructions, so every update is a vpcmpgtq compare followed by
# a vblendvpd select; 16 elements are processed per iteration.
        .globl  int64_max_min_avx2
        .p2align        4, 0x90
        .type   int64_max_min_avx2,@function
int64_max_min_avx2:                     # @int64_max_min_avx2
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        movabs  rax, 9223372036854775807
        test    esi, esi
        jle     .LBB2_1
# %bb.2:
        mov     r8d, esi
        cmp     esi, 15
        ja      .LBB2_4
# %bb.3:
        lea     r10, [rax + 1]
        xor     r9d, r9d
        jmp     .LBB2_7
.LBB2_1:
        lea     rsi, [rax + 1]
        jmp     .LBB2_8
.LBB2_4:
        mov     r9d, r8d
        vpbroadcastq    ymm4, qword ptr [rip + .LCPI2_0] # ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
        and     r9d, -16
        vpbroadcastq    ymm0, qword ptr [rip + .LCPI2_1] # ymm0 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
        xor     eax, eax
        vmovdqa ymm3, ymm0
        vmovdqa ymm2, ymm0
        vmovdqa ymm1, ymm0
        vmovdqa ymm7, ymm4
        vmovdqa ymm6, ymm4
        vmovdqa ymm5, ymm4
        .p2align        4, 0x90
.LBB2_5:                                # =>This Inner Loop Header: Depth=1
        vmovdqu ymm8, ymmword ptr [rdi + 8*rax]
        vpcmpgtq        ymm9, ymm8, ymm0
        vblendvpd       ymm0, ymm8, ymm0, ymm9
        vmovdqu ymm9, ymmword ptr [rdi + 8*rax + 32]
        vpcmpgtq        ymm10, ymm9, ymm3
        vblendvpd       ymm3, ymm9, ymm3, ymm10
        vmovdqu ymm10, ymmword ptr [rdi + 8*rax + 64]
        vpcmpgtq        ymm11, ymm10, ymm2
        vblendvpd       ymm2, ymm10, ymm2, ymm11
        vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 96]
        vpcmpgtq        ymm12, ymm11, ymm1
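        # Each vpcmpgtq/vblendvpd pair in this loop is a 64-bit signed min or max:
        # the compare builds a per-lane mask and the blend keeps either the running
        # accumulator or the freshly loaded data.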
        vblendvpd       ymm1, ymm11, ymm1, ymm12
        vpcmpgtq        ymm12, ymm4, ymm8
        vblendvpd       ymm4, ymm8, ymm4, ymm12
        vpcmpgtq        ymm8, ymm7, ymm9
        vblendvpd       ymm7, ymm9, ymm7, ymm8
        vpcmpgtq        ymm8, ymm6, ymm10
        vblendvpd       ymm6, ymm10, ymm6, ymm8
        vpcmpgtq        ymm8, ymm5, ymm11
        vblendvpd       ymm5, ymm11, ymm5, ymm8
        add     rax, 16
        cmp     r9, rax
        jne     .LBB2_5
# %bb.6:
        vpcmpgtq        ymm8, ymm4, ymm7
        vblendvpd       ymm4, ymm7, ymm4, ymm8
        vpcmpgtq        ymm7, ymm4, ymm6
        vblendvpd       ymm4, ymm6, ymm4, ymm7
        vpcmpgtq        ymm6, ymm4, ymm5
        vblendvpd       ymm4, ymm5, ymm4, ymm6
        vextractf128    xmm5, ymm4, 1
        vpcmpgtq        xmm6, xmm4, xmm5
        vblendvpd       xmm4, xmm5, xmm4, xmm6
        vpermilps       xmm5, xmm4, 78          # xmm5 = xmm4[2,3,0,1]
        vpcmpgtq        xmm6, xmm4, xmm5
        vblendvpd       xmm4, xmm5, xmm4, xmm6
        vmovq   r10, xmm4
        vpcmpgtq        ymm4, ymm3, ymm0
        vblendvpd       ymm0, ymm3, ymm0, ymm4
        vpcmpgtq        ymm3, ymm2, ymm0
        vblendvpd       ymm0, ymm2, ymm0, ymm3
        vpcmpgtq        ymm2, ymm1, ymm0
        vblendvpd       ymm0, ymm1, ymm0, ymm2
        vextractf128    xmm1, ymm0, 1
        vpcmpgtq        xmm2, xmm1, xmm0
        vblendvpd       xmm0, xmm1, xmm0, xmm2
        vpermilps       xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
        vpcmpgtq        xmm2, xmm1, xmm0
        vblendvpd       xmm0, xmm1, xmm0, xmm2
        vmovq   rax, xmm0
        mov     rsi, r10
        cmp     r9, r8
        je      .LBB2_8
        .p2align        4, 0x90
.LBB2_7:                                # =>This Inner Loop Header: Depth=1
        mov     rsi, qword ptr [rdi + 8*r9]
        cmp     rax, rsi
        cmovg   rax, rsi
        cmp     r10, rsi
        cmovge  rsi, r10
        add     r9, 1
        mov     r10, rsi
        cmp     r8, r9
        jne     .LBB2_7
.LBB2_8:
        mov     qword ptr [rcx], rsi
        mov     qword ptr [rdx], rax
        mov     rsp, rbp
        pop     rbp
        vzeroupper
        ret
.Lfunc_end2:
        .size   int64_max_min_avx2, .Lfunc_end2-int64_max_min_avx2
                                        # -- End function
        .section        .rodata.cst8,"aM",@progbits,8
        .p2align        3                               # -- Begin function uint64_max_min_avx2
.LCPI3_0:
        .quad   -9223372036854775808            # 0x8000000000000000
        .text
# uint64_max_min_avx2: minimum and maximum of a uint64 array. Unsigned 64-bit
# comparison is emulated by XORing both operands with 0x8000000000000000
# (.LCPI3_0, kept in ymm0) before the signed vpcmpgtq.
        .globl  uint64_max_min_avx2
        .p2align        4, 0x90
        .type   uint64_max_min_avx2,@function
uint64_max_min_avx2:                    # @uint64_max_min_avx2
# %bb.0:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8
        test    esi, esi
        jle     .LBB3_1
# %bb.2:
        mov     r8d, esi
        cmp     esi, 15
        ja      .LBB3_4
# %bb.3:
        mov     rax, -1
        xor     r9d, r9d
        xor     r10d, r10d
        jmp     .LBB3_7
.LBB3_1:
        mov     rax, -1
        xor     esi, esi
        jmp     .LBB3_8
.LBB3_4:
        mov     r9d, r8d
        and     r9d, -16
        vpxor   xmm5, xmm5, xmm5
        vpcmpeqd        ymm1, ymm1, ymm1
        xor     eax, eax
        vpbroadcastq    ymm0, qword ptr [rip + .LCPI3_0] # ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
        vpcmpeqd        ymm4, ymm4, ymm4
        vpcmpeqd        ymm3, ymm3, ymm3
        vpcmpeqd        ymm2, ymm2, ymm2
        vpxor   xmm8, xmm8, xmm8
        vpxor   xmm7, xmm7, xmm7
        vpxor   xmm6, xmm6, xmm6
        .p2align        4, 0x90
.LBB3_5:                                # =>This Inner Loop Header: Depth=1
        vmovdqu ymm9, ymmword ptr [rdi + 8*rax]
        vpxor   ymm10, ymm1, ymm0
        vpxor   ymm11, ymm9, ymm0
        vpcmpgtq        ymm10, ymm11, ymm10
        vblendvpd       ymm1, ymm9, ymm1, ymm10
        vpxor   ymm10, ymm5, ymm0
        vpcmpgtq        ymm10, ymm10, ymm11
        vblendvpd       ymm5, ymm9, ymm5, ymm10
        vmovdqu ymm9, ymmword ptr [rdi + 8*rax + 32]
        vpxor   ymm10, ymm4, ymm0
        vpxor   ymm11, ymm9, ymm0
        vpcmpgtq        ymm10, ymm11, ymm10
        vblendvpd       ymm4, ymm9, ymm4, ymm10
        vpxor   ymm10, ymm8, ymm0
        vpcmpgtq        ymm10, ymm10, ymm11
        vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 64]
        vblendvpd       ymm8, ymm9, ymm8, ymm10
        vpxor   ymm9, ymm3, ymm0
        vpxor   ymm10, ymm11, ymm0
        vpcmpgtq        ymm9, ymm10, ymm9
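        # ymm0 holds 0x8000000000000000 in every lane; flipping the sign bit of
        # both compare operands lets the signed vpcmpgtq order the original values
        # as unsigned 64-bit integers.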
        vblendvpd       ymm3, ymm11, ymm3, ymm9
        vpxor   ymm9, ymm7, ymm0
        vpcmpgtq        ymm9, ymm9, ymm10
        vblendvpd       ymm7, ymm11, ymm7, ymm9
        vmovdqu ymm9, ymmword ptr [rdi + 8*rax + 96]
        vpxor   ymm10, ymm2, ymm0
        vpxor   ymm11, ymm9, ymm0
        vpcmpgtq        ymm10, ymm11, ymm10
        vblendvpd       ymm2, ymm9, ymm2, ymm10
        vpxor   ymm10, ymm6, ymm0
        vpcmpgtq        ymm10, ymm10, ymm11
        vblendvpd       ymm6, ymm9, ymm6, ymm10
        add     rax, 16
        cmp     r9, rax
        jne     .LBB3_5
# %bb.6:
        vpxor   ymm9, ymm8, ymm0
        vpxor   ymm10, ymm5, ymm0
        vpcmpgtq        ymm9, ymm10, ymm9
        vblendvpd       ymm5, ymm8, ymm5, ymm9
        vxorpd  ymm8, ymm5, ymm0
        vpxor   ymm9, ymm7, ymm0
        vpcmpgtq        ymm8, ymm8, ymm9
        vblendvpd       ymm5, ymm7, ymm5, ymm8
        vxorpd  ymm7, ymm5, ymm0
        vpxor   ymm8, ymm6, ymm0
        vpcmpgtq        ymm7, ymm7, ymm8
        vblendvpd       ymm5, ymm6, ymm5, ymm7
        vextractf128    xmm6, ymm5, 1
        vxorpd  xmm8, xmm6, xmm0
        vxorpd  xmm7, xmm5, xmm0
        vpcmpgtq        xmm7, xmm7, xmm8
        vblendvpd       xmm5, xmm6, xmm5, xmm7
        vpermilps       xmm6, xmm5, 78          # xmm6 = xmm5[2,3,0,1]
        vxorpd  xmm8, xmm5, xmm0
        vxorpd  xmm7, xmm6, xmm0
        vpcmpgtq        xmm7, xmm8, xmm7
        vblendvpd       xmm5, xmm6, xmm5, xmm7
        vpxor   ymm6, ymm1, ymm0
        vpxor   ymm7, ymm4, ymm0
        vpcmpgtq        ymm6, ymm7, ymm6
        vblendvpd       ymm1, ymm4, ymm1, ymm6
        vxorpd  ymm4, ymm1, ymm0
        vpxor   ymm6, ymm3, ymm0
        vpcmpgtq        ymm4, ymm6, ymm4
        vblendvpd       ymm1, ymm3, ymm1, ymm4
        vmovq   r10, xmm5
        vxorpd  ymm3, ymm1, ymm0
        vpxor   ymm4, ymm2, ymm0
        vpcmpgtq        ymm3, ymm4, ymm3
        vblendvpd       ymm1, ymm2, ymm1, ymm3
        vextractf128    xmm2, ymm1, 1
        vxorpd  xmm3, xmm1, xmm0
        vxorpd  xmm4, xmm2, xmm0
        vpcmpgtq        xmm3, xmm4, xmm3
        vblendvpd       xmm1, xmm2, xmm1, xmm3
        vpermilps       xmm2, xmm1, 78          # xmm2 = xmm1[2,3,0,1]
        vxorpd  xmm3, xmm1, xmm0
        vxorpd  xmm0, xmm2, xmm0
        vpcmpgtq        xmm0, xmm0, xmm3
        vblendvpd       xmm0, xmm2, xmm1, xmm0
        vmovq   rax, xmm0
        mov     rsi, r10
        cmp     r9, r8
        je      .LBB3_8
        .p2align        4, 0x90
.LBB3_7:                                # =>This Inner Loop Header: Depth=1
        mov     rsi, qword ptr [rdi + 8*r9]
        cmp     rax, rsi
        cmovae  rax, rsi
        cmp     r10, rsi
        cmova   rsi, r10
        add     r9, 1
        mov     r10, rsi
        cmp     r8, r9
        jne     .LBB3_7
.LBB3_8:
        mov     qword ptr [rcx], rsi
        mov     qword ptr [rdx], rax
        mov     rsp, rbp
        pop     rbp
        vzeroupper
        ret
.Lfunc_end3:
        .size   uint64_max_min_avx2, .Lfunc_end3-uint64_max_min_avx2
                                        # -- End function
        .ident  "Debian clang version 11.1.0-++20210428103820+1fdec59bffc1-1~exp1~20210428204437.162"
        .section        ".note.GNU-stack","",@progbits
        .addrsig