// github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/page_min_amd64.s

//go:build !purego

#include "textflag.h"

// func minInt32(data []int32) int32
TEXT ·minInt32(SB), NOSPLIT, $-28
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    XORQ BX, BX

    CMPQ CX, $0
    JE done
    XORQ SI, SI
    MOVLQZX (AX), BX

    CMPB ·hasAVX512VL(SB), $0
    JE loop

    CMPQ CX, $32
    JB loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI
    VPBROADCASTD (AX), Z0
loop32:
    VMOVDQU32 (AX)(SI*4), Z1
    VMOVDQU32 64(AX)(SI*4), Z2
    VPMINSD Z1, Z0, Z0
    VPMINSD Z2, Z0, Z0
    ADDQ $32, SI
    CMPQ SI, DI
    JNE loop32

    VMOVDQU32 swap32+0(SB), Z1
    VPERMI2D Z0, Z0, Z1
    VPMINSD Y1, Y0, Y0

    VMOVDQU32 swap32+32(SB), Y1
    VPERMI2D Y0, Y0, Y1
    VPMINSD X1, X0, X0

    VMOVDQU32 swap32+48(SB), X1
    VPERMI2D X0, X0, X1
    VPMINSD X1, X0, X0
    VZEROUPPER

    MOVQ X0, DX
    MOVL DX, BX
    SHRQ $32, DX
    CMPL DX, BX
    CMOVLLT DX, BX

    CMPQ SI, CX
    JE done
loop:
    MOVLQZX (AX)(SI*4), DX
    CMPL DX, BX
    CMOVLLT DX, BX
    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    MOVL BX, ret+24(FP)
    RET

// func minInt64(data []int64) int64
TEXT ·minInt64(SB), NOSPLIT, $-32
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    XORQ BX, BX

    CMPQ CX, $0
    JE done
    XORQ SI, SI
    MOVQ (AX), BX

    CMPB ·hasAVX512VL(SB), $0
    JE loop

    CMPQ CX, $32
    JB loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI
    VPBROADCASTQ (AX), Z0
loop32:
    VMOVDQU64 (AX)(SI*8), Z1
    VMOVDQU64 64(AX)(SI*8), Z2
    VMOVDQU64 128(AX)(SI*8), Z3
    VMOVDQU64 192(AX)(SI*8), Z4
    VPMINSQ Z1, Z2, Z5
    VPMINSQ Z3, Z4, Z6
    VPMINSQ Z5, Z6, Z1
    VPMINSQ Z1, Z0, Z0
    ADDQ $32, SI
    CMPQ SI, DI
    JNE loop32

    VMOVDQU32 swap32+0(SB), Z1
    VPERMI2D Z0, Z0, Z1
    VPMINSQ Y1, Y0, Y0

    VMOVDQU32 swap32+32(SB), Y1
    VPERMI2D Y0, Y0, Y1
    VPMINSQ X1, X0, X0

    VMOVDQU32 swap32+48(SB), X1
    VPERMI2D X0, X0, X1
    VPMINSQ X1, X0, X0
    VZEROUPPER

    MOVQ X0, BX
    CMPQ SI, CX
    JE done
loop:
    MOVQ (AX)(SI*8), DX
    CMPQ DX, BX
    CMOVQLT DX, BX
    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    MOVQ BX, ret+24(FP)
    RET

// func minUint32(data []uint32) uint32
TEXT ·minUint32(SB), NOSPLIT, $-28
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    XORQ BX, BX

    CMPQ CX, $0
    JE done
    XORQ SI, SI
    MOVLQZX (AX), BX

    CMPB ·hasAVX512VL(SB), $0
    JE loop

    CMPQ CX, $32
    JB loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI
    VPBROADCASTD (AX), Z0
loop32:
    VMOVDQU32 (AX)(SI*4), Z1
    VMOVDQU32 64(AX)(SI*4), Z2
    VPMINUD Z1, Z0, Z0
    VPMINUD Z2, Z0, Z0
    ADDQ $32, SI
    CMPQ SI, DI
    JNE loop32

    VMOVDQU32 swap32+0(SB), Z1
    VPERMI2D Z0, Z0, Z1
    VPMINUD Y1, Y0, Y0

    VMOVDQU32 swap32+32(SB), Y1
    VPERMI2D Y0, Y0, Y1
    VPMINUD X1, X0, X0

    VMOVDQU32 swap32+48(SB), X1
    VPERMI2D X0, X0, X1
    VPMINUD X1, X0, X0
    VZEROUPPER

    MOVQ X0, DX
    MOVL DX, BX
    SHRQ $32, DX
    CMPL DX, BX
    CMOVLCS DX, BX

    CMPQ SI, CX
    JE done
loop:
    MOVLQZX (AX)(SI*4), DX
    CMPL DX, BX
    CMOVLCS DX, BX
    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    MOVL BX, ret+24(FP)
    RET

// func minUint64(data []uint64) uint64
TEXT ·minUint64(SB), NOSPLIT, $-32
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    XORQ BX, BX

    CMPQ CX, $0
    JE done
    XORQ SI, SI
    MOVQ (AX), BX

    CMPB ·hasAVX512VL(SB), $0
    JE loop

    CMPQ CX, $32
    JB loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI
    VPBROADCASTQ (AX), Z0
loop32:
    VMOVDQU64 (AX)(SI*8), Z1
    VMOVDQU64 64(AX)(SI*8), Z2
    VMOVDQU64 128(AX)(SI*8), Z3
    VMOVDQU64 192(AX)(SI*8), Z4
    VPMINUQ Z1, Z2, Z5
    VPMINUQ Z3, Z4, Z6
    VPMINUQ Z5, Z6, Z1
    VPMINUQ Z1, Z0, Z0
    ADDQ $32, SI
    CMPQ SI, DI
    JNE loop32

    VMOVDQU32 swap32+0(SB), Z1
    VPERMI2D Z0, Z0, Z1
    VPMINUQ Y1, Y0, Y0

    VMOVDQU32 swap32+32(SB), Y1
    VPERMI2D Y0, Y0, Y1
    VPMINUQ X1, X0, X0

    VMOVDQU32 swap32+48(SB), X1
    VPERMI2D X0, X0, X1
    VPMINUQ X1, X0, X0
    VZEROUPPER

    MOVQ X0, BX
    CMPQ SI, CX
    JE done
loop:
    MOVQ (AX)(SI*8), DX
    CMPQ DX, BX
    CMOVQCS DX, BX
    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    MOVQ BX, ret+24(FP)
    RET
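// The four integer kernels above share the same structure: broadcast the
// first element as the running minimum, fold blocks of the input with
// VPMINSD/VPMINSQ/VPMINUD/VPMINUQ, reduce across lanes using the swap32
// permutations, then finish any remainder with a scalar CMOV loop. For
// reference, a scalar Go equivalent of minUint64 might look like the
// following (an illustrative sketch, not part of the package source):
//
//	func minUint64(data []uint64) uint64 {
//		var min uint64 // stays zero for empty input, like BX above
//		if len(data) > 0 {
//			min = data[0]
//			for _, v := range data[1:] {
//				if v < min {
//					min = v
//				}
//			}
//		}
//		return min
//	}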
// func minFloat32(data []float32) float32
TEXT ·minFloat32(SB), NOSPLIT, $-28
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    XORQ BX, BX

    CMPQ CX, $0
    JE done
    XORPS X0, X0
    XORPS X1, X1
    XORQ SI, SI
    MOVLQZX (AX), BX
    MOVQ BX, X0

    CMPB ·hasAVX512VL(SB), $0
    JE loop

    CMPQ CX, $64
    JB loop

    MOVQ CX, DI
    SHRQ $6, DI
    SHLQ $6, DI
    VPBROADCASTD (AX), Z0
loop64:
    VMOVDQU32 (AX)(SI*4), Z1
    VMOVDQU32 64(AX)(SI*4), Z2
    VMOVDQU32 128(AX)(SI*4), Z3
    VMOVDQU32 192(AX)(SI*4), Z4
    VMINPS Z1, Z2, Z5
    VMINPS Z3, Z4, Z6
    VMINPS Z5, Z6, Z1
    VMINPS Z1, Z0, Z0
    ADDQ $64, SI
    CMPQ SI, DI
    JNE loop64

    VMOVDQU32 swap32+0(SB), Z1
    VPERMI2D Z0, Z0, Z1
    VMINPS Y1, Y0, Y0

    VMOVDQU32 swap32+32(SB), Y1
    VPERMI2D Y0, Y0, Y1
    VMINPS X1, X0, X0

    VMOVDQU32 swap32+48(SB), X1
    VPERMI2D X0, X0, X1
    VMINPS X1, X0, X0
    VZEROUPPER

    MOVAPS X0, X1
    PSRLQ $32, X1
    MOVQ X0, BX
    MOVQ X1, DX
    UCOMISS X0, X1
    CMOVLCS DX, BX

    CMPQ SI, CX
    JE done
    MOVQ BX, X0
loop:
    MOVLQZX (AX)(SI*4), DX
    MOVQ DX, X1
    UCOMISS X0, X1
    CMOVLCS DX, BX
    MOVQ BX, X0
    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    MOVL BX, ret+24(FP)
    RET

// func minFloat64(data []float64) float64
TEXT ·minFloat64(SB), NOSPLIT, $-32
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    XORQ BX, BX

    CMPQ CX, $0
    JE done
    XORPD X0, X0
    XORPD X1, X1
    XORQ SI, SI
    MOVQ (AX), BX
    MOVQ BX, X0

    CMPB ·hasAVX512VL(SB), $0
    JE loop

    CMPQ CX, $32
    JB loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI
    VPBROADCASTQ (AX), Z0
loop32:
    VMOVDQU64 (AX)(SI*8), Z1
    VMOVDQU64 64(AX)(SI*8), Z2
    VMOVDQU64 128(AX)(SI*8), Z3
    VMOVDQU64 192(AX)(SI*8), Z4
    VMINPD Z1, Z2, Z5
    VMINPD Z3, Z4, Z6
    VMINPD Z5, Z6, Z1
    VMINPD Z1, Z0, Z0
    ADDQ $32, SI
    CMPQ SI, DI
    JNE loop32

    VMOVDQU64 swap32+0(SB), Z1
    VPERMI2D Z0, Z0, Z1
    VMINPD Y1, Y0, Y0

    VMOVDQU64 swap32+32(SB), Y1
    VPERMI2D Y0, Y0, Y1
    VMINPD X1, X0, X0

    VMOVDQU64 swap32+48(SB), X1
    VPERMI2D X0, X0, X1
    VMINPD X1, X0, X0
    VZEROUPPER

    MOVQ X0, BX
    CMPQ SI, CX
    JE done
loop:
    MOVQ (AX)(SI*8), DX
    MOVQ DX, X1
    UCOMISD X0, X1
    CMOVQCS DX, BX
    MOVQ BX, X0
    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    MOVQ BX, ret+24(FP)
    RET
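// The float kernels follow the same shape as the integer ones, with
// VMINPS/VMINPD for the vectorized folds and UCOMISS/UCOMISD feeding the
// CMOV in the scalar tail (the carry flag is set when the candidate compares
// below the current minimum). A scalar Go sketch of minFloat64, for
// illustration only (NaN handling in the SIMD instructions may differ from a
// plain Go comparison):
//
//	func minFloat64(data []float64) float64 {
//		var min float64
//		if len(data) > 0 {
//			min = data[0]
//			for _, v := range data[1:] {
//				if v < min {
//					min = v
//				}
//			}
//		}
//		return min
//	}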
// vpminu128 is a macro comparing unsigned 128 bits values held in the
// `srcValues` and `minValues` vectors. The `srcIndexes` and `minIndexes`
// vectors contain the indexes of elements in the value vectors. Remaining
// K and R arguments are mask and general purpose registers needed to hold
// temporary values during the computation. The last M argument is a mask
// generated by vpminu128mask.
//
// The routine uses AVX-512 instructions (VPCMPUQ, VPBLENDMQ) to implement
// the comparison of 128 bits values. The values are expected to be stored
// in the vectors as a little-endian pair of two consecutive quad words.
//
// The results are written to the `minValues` and `minIndexes` vectors,
// overwriting the inputs. `srcValues` and `srcIndexes` are read-only
// parameters.
//
// At a high level, for two pairs of quad words forming two 128 bits values
// A and B, the test implemented by this macro is:
//
//	A[1] < B[1] || (A[1] == B[1] && A[0] < B[0])
//
// Values in the source vector that evaluate to true on this expression are
// written to the vector of minimum values, and their indexes are written to
// the vector of indexes.
#define vpminu128(srcValues, srcIndexes, minValues, minIndexes, K1, K2, R1, R2, R3, M) \
    VPCMPUQ $0, minValues, srcValues, K1 \
    VPCMPUQ $1, minValues, srcValues, K2 \
    KMOVB K1, R1 \
    KMOVB K2, R2 \
    MOVB R2, R3 \
    SHLB $1, R3 \
    ANDB R3, R1 \
    ORB R2, R1 \
    ANDB M, R1 \
    MOVB R1, R2 \
    SHRB $1, R2 \
    ORB R2, R1 \
    KMOVB R1, K1 \
    VPBLENDMQ srcValues, minValues, K1, minValues \
    VPBLENDMQ srcIndexes, minIndexes, K1, minIndexes

// vpminu128mask is a macro used to initialize the mask passed as last argument
// to vpminu128. The argument M is intended to be a general purpose register.
//
// The bit mask is used to merge the results of the "less than" and "equal"
// comparisons that are performed on each lane of the minimum vectors. The
// upper bits are used to compute the results of the operation that determines
// which of the pairs of quad words representing the 128 bits elements are the
// minimums.
#define vpminu128mask(M) MOVB $0b10101010, M
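// Expressed in Go, the predicate evaluated by vpminu128 for each pair of
// 128 bits values is the following (illustrative sketch; hi and lo are the
// little-endian quad words of a value, i.e. A[1] and A[0] in the comment
// above):
//
//	func less128(aHi, aLo, bHi, bLo uint64) bool {
//		return aHi < bHi || (aHi == bHi && aLo < bLo)
//	}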
// func minBE128(data [][16]byte) []byte
TEXT ·minBE128(SB), NOSPLIT, $-48
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    CMPQ CX, $0
    JE null

    SHLQ $4, CX
    MOVQ CX, DX // len
    MOVQ AX, BX // min
    ADDQ AX, CX // end

    CMPQ DX, $256
    JB loop

    CMPB ·hasAVX512MinMaxBE128(SB), $0
    JE loop

    // Z19 holds a vector of the count by which we increment the vectors of
    // indexes at each loop iteration.
    MOVQ $16, DI
    VPBROADCASTQ DI, Z19

    // Z31 holds the shuffle mask used to convert 128 bits elements from big to
    // little endian so we can apply vectorized comparison instructions.
    VMOVDQU64 bswap128(SB), Z31

    // These vectors hold four lanes of minimum values found in the input.
    VBROADCASTI64X2 (AX), Z0
    VPSHUFB Z31, Z0, Z0
    VMOVDQU64 Z0, Z5
    VMOVDQU64 Z0, Z10
    VMOVDQU64 Z0, Z15

    // These vectors hold four lanes of indexes of minimum values.
    //
    // We initialize them at zero because we broadcast the first value of the
    // input in the vectors that track the minimums of each lane; in other
    // words, we assume the minimum value is at the first offset and work our
    // way up from there.
    VPXORQ Z2, Z2, Z2
    VPXORQ Z7, Z7, Z7
    VPXORQ Z12, Z12, Z12
    VPXORQ Z17, Z17, Z17

    // These vectors are used to compute the indexes of minimum values held
    // in [Z0, Z5, Z10, Z15]. Each vector holds a contiguous sequence of
    // indexes; for example, Z3 is initialized with [0, 1, 2, 3]. At each
    // loop iteration, the indexes are incremented by the number of elements
    // consumed from the input (4x4=16).
    VMOVDQU64 indexes128(SB), Z3
    VPXORQ Z8, Z8, Z8
    VPXORQ Z13, Z13, Z13
    VPXORQ Z18, Z18, Z18
    MOVQ $4, DI
    VPBROADCASTQ DI, Z1
    VPADDQ Z1, Z3, Z8
    VPADDQ Z1, Z8, Z13
    VPADDQ Z1, Z13, Z18

    vpminu128mask(DI)
    SHRQ $8, DX
    SHLQ $8, DX
    ADDQ AX, DX
loop16:
    // Compute 4x4 minimum values in vector registers, along with their indexes
    // in the input array.
    VMOVDQU64 (AX), Z1
    VMOVDQU64 64(AX), Z6
    VMOVDQU64 128(AX), Z11
    VMOVDQU64 192(AX), Z16
    VPSHUFB Z31, Z1, Z1
    VPSHUFB Z31, Z6, Z6
    VPSHUFB Z31, Z11, Z11
    VPSHUFB Z31, Z16, Z16
    vpminu128(Z1, Z3, Z0, Z2, K1, K2, R8, R9, R10, DI)
    vpminu128(Z6, Z8, Z5, Z7, K3, K4, R11, R12, R13, DI)
    vpminu128(Z11, Z13, Z10, Z12, K1, K2, R8, R9, R10, DI)
    vpminu128(Z16, Z18, Z15, Z17, K3, K4, R11, R12, R13, DI)
    VPADDQ Z19, Z3, Z3
    VPADDQ Z19, Z8, Z8
    VPADDQ Z19, Z13, Z13
    VPADDQ Z19, Z18, Z18
    ADDQ $256, AX
    CMPQ AX, DX
    JB loop16

    // After the loop completes, we need to merge the lanes that each contain
    // 4 minimum values (so 16 total candidates at this stage). The results are
    // reduced into 4 candidates in Z0, with their indexes in Z2.
    vpminu128(Z10, Z12, Z0, Z2, K1, K2, R8, R9, R10, DI)
    vpminu128(Z15, Z17, Z5, Z7, K3, K4, R11, R12, R13, DI)
    vpminu128(Z5, Z7, Z0, Z2, K1, K2, R8, R9, R10, DI)

    // Further reduce the results by swapping the upper and lower parts of the
    // vector registers, and comparing them to determine which values are the
    // smallest. We compare 2x2 values at this step, then 2x1 values at the next
    // to find the index of the minimum.
    VMOVDQU64 swap64+0(SB), Z1
    VMOVDQU64 swap64+0(SB), Z3
    VPERMI2Q Z0, Z0, Z1
    VPERMI2Q Z2, Z2, Z3
    vpminu128(Y1, Y3, Y0, Y2, K1, K2, R8, R9, R10, DI)

    VMOVDQU64 swap64+32(SB), Y1
    VMOVDQU64 swap64+32(SB), Y3
    VPERMI2Q Y0, Y0, Y1
    VPERMI2Q Y2, Y2, Y3
    vpminu128(X1, X3, X0, X2, K1, K2, R8, R9, R10, DI)
    VZEROUPPER

    // Extract the index of the minimum value computed in the lower 64 bits of
    // X2, and advance the BX pointer to the address of the minimum value.
    MOVQ X2, DX
    SHLQ $4, DX
    ADDQ DX, BX
    CMPQ AX, CX
    JE done

    // Unless the input size was a multiple of 256 bytes, we need to perform a
    // few more iterations on the remaining elements.
    //
    // This loop is also taken if the CPU has no support for AVX-512.
loop:
    MOVQ (AX), R8
    MOVQ (BX), R9
    BSWAPQ R8
    BSWAPQ R9
    CMPQ R8, R9
    JB less
    JA next
    MOVQ 8(AX), R8
    MOVQ 8(BX), R9
    BSWAPQ R8
    BSWAPQ R9
    CMPQ R8, R9
    JAE next
less:
    MOVQ AX, BX
next:
    ADDQ $16, AX
    CMPQ AX, CX
    JB loop
done:
    MOVQ BX, ret_base+24(FP)
    MOVQ $16, ret_len+32(FP)
    MOVQ $16, ret_cap+40(FP)
    RET
null:
    XORQ BX, BX
    MOVQ BX, ret_base+24(FP)
    MOVQ BX, ret_len+32(FP)
    MOVQ BX, ret_cap+40(FP)
    RET
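// For reference, a scalar Go sketch of what minBE128 computes (illustrative
// only, assuming the standard library bytes package): comparing the raw
// big-endian 16-byte values lexicographically is equivalent to the
// BSWAPQ-based unsigned comparison performed in the loop above.
//
//	func minBE128(data [][16]byte) []byte {
//		if len(data) == 0 {
//			return nil
//		}
//		min := &data[0]
//		for i := 1; i < len(data); i++ {
//			if bytes.Compare(data[i][:], min[:]) < 0 {
//				min = &data[i]
//			}
//		}
//		return min[:]
//	}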