github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/page_max_amd64.s

//go:build !purego

#include "textflag.h"

// func maxInt32(data []int32) int32
TEXT ·maxInt32(SB), NOSPLIT, $-28
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    XORQ BX, BX

    CMPQ CX, $0
    JE done
    XORQ SI, SI
    MOVLQZX (AX), BX

    CMPB ·hasAVX512VL(SB), $0
    JE loop

    CMPQ CX, $32
    JB loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI
    VPBROADCASTD (AX), Z0
loop32:
    VMOVDQU32 (AX)(SI*4), Z1
    VMOVDQU32 64(AX)(SI*4), Z2
    VPMAXSD Z1, Z0, Z0
    VPMAXSD Z2, Z0, Z0
    ADDQ $32, SI
    CMPQ SI, DI
    JNE loop32

    VMOVDQU32 swap32+0(SB), Z1
    VPERMI2D Z0, Z0, Z1
    VPMAXSD Y1, Y0, Y0

    VMOVDQU32 swap32+32(SB), Y1
    VPERMI2D Y0, Y0, Y1
    VPMAXSD X1, X0, X0

    VMOVDQU32 swap32+48(SB), X1
    VPERMI2D X0, X0, X1
    VPMAXSD X1, X0, X0
    VZEROUPPER

    MOVQ X0, DX
    MOVL DX, BX
    SHRQ $32, DX
    CMPL DX, BX
    CMOVLGT DX, BX

    CMPQ SI, CX
    JE done
loop:
    MOVLQZX (AX)(SI*4), DX
    CMPL DX, BX
    CMOVLGT DX, BX
    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    MOVL BX, ret+24(FP)
    RET

// func maxInt64(data []int64) int64
TEXT ·maxInt64(SB), NOSPLIT, $-32
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    XORQ BX, BX

    CMPQ CX, $0
    JE done
    XORQ SI, SI
    MOVQ (AX), BX

    CMPB ·hasAVX512VL(SB), $0
    JE loop

    CMPQ CX, $32
    JB loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI
    VPBROADCASTQ (AX), Z0
loop32:
    VMOVDQU64 (AX)(SI*8), Z1
    VMOVDQU64 64(AX)(SI*8), Z2
    VMOVDQU64 128(AX)(SI*8), Z3
    VMOVDQU64 192(AX)(SI*8), Z4
    VPMAXSQ Z1, Z2, Z5
    VPMAXSQ Z3, Z4, Z6
    VPMAXSQ Z5, Z6, Z1
    VPMAXSQ Z1, Z0, Z0
    ADDQ $32, SI
    CMPQ SI, DI
    JNE loop32

    VMOVDQU32 swap32+0(SB), Z1
    VPERMI2D Z0, Z0, Z1
    VPMAXSQ Y1, Y0, Y0

    VMOVDQU32 swap32+32(SB), Y1
    VPERMI2D Y0, Y0, Y1
    VPMAXSQ X1, X0, X0

    VMOVDQU32 swap32+48(SB), X1
    VPERMI2D X0, X0, X1
    VPMAXSQ X1, X0, X0
    VZEROUPPER

    MOVQ X0, BX
    CMPQ SI, CX
    JE done
loop:
    MOVQ (AX)(SI*8), DX
    CMPQ DX, BX
    CMOVQGT DX, BX
    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    MOVQ BX, ret+24(FP)
    RET

// func maxUint32(data []uint32) uint32
TEXT ·maxUint32(SB), NOSPLIT, $-28
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    XORQ BX, BX

    CMPQ CX, $0
    JE done
    XORQ SI, SI
    MOVLQZX (AX), BX

    CMPB ·hasAVX512VL(SB), $0
    JE loop

    CMPQ CX, $32
    JB loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI
    VPBROADCASTD (AX), Z0
loop32:
    VMOVDQU32 (AX)(SI*4), Z1
    VMOVDQU32 64(AX)(SI*4), Z2
    VPMAXUD Z1, Z0, Z0
    VPMAXUD Z2, Z0, Z0
    ADDQ $32, SI
    CMPQ SI, DI
    JNE loop32

    VMOVDQU32 swap32+0(SB), Z1
    VPERMI2D Z0, Z0, Z1
    VPMAXUD Y1, Y0, Y0

    VMOVDQU32 swap32+32(SB), Y1
    VPERMI2D Y0, Y0, Y1
    VPMAXUD X1, X0, X0

    VMOVDQU32 swap32+48(SB), X1
    VPERMI2D X0, X0, X1
    VPMAXUD X1, X0, X0
    VZEROUPPER

    MOVQ X0, DX
    MOVL DX, BX
    SHRQ $32, DX
    CMPL DX, BX
    CMOVLHI DX, BX

    CMPQ SI, CX
    JE done
loop:
    MOVLQZX (AX)(SI*4), DX
    CMPL DX, BX
    CMOVLHI DX, BX
    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    MOVL BX, ret+24(FP)
    RET

// func maxUint64(data []uint64) uint64
TEXT ·maxUint64(SB), NOSPLIT, $-32
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    XORQ BX, BX

    CMPQ CX, $0
    JE done
    XORQ SI, SI
    MOVQ (AX), BX

    CMPB ·hasAVX512VL(SB), $0
    JE loop

    CMPQ CX, $32
    JB loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI
    VPBROADCASTQ (AX), Z0
loop32:
    VMOVDQU64 (AX)(SI*8), Z1
    VMOVDQU64 64(AX)(SI*8), Z2
    VMOVDQU64 128(AX)(SI*8), Z3
    VMOVDQU64 192(AX)(SI*8), Z4
    VPMAXUQ Z1, Z2, Z5
    VPMAXUQ Z3, Z4, Z6
    VPMAXUQ Z5, Z6, Z1
    VPMAXUQ Z1, Z0, Z0
    ADDQ $32, SI
    CMPQ SI, DI
    JNE loop32

    VMOVDQU32 swap32+0(SB), Z1
    VPERMI2D Z0, Z0, Z1
    VPMAXUQ Y1, Y0, Y0

    VMOVDQU32 swap32+32(SB), Y1
    VPERMI2D Y0, Y0, Y1
    VPMAXUQ X1, X0, X0

    VMOVDQU32 swap32+48(SB), X1
    VPERMI2D X0, X0, X1
    VPMAXUQ X1, X0, X0
    VZEROUPPER

    MOVQ X0, BX
    CMPQ SI, CX
    JE done
loop:
    MOVQ (AX)(SI*8), DX
    CMPQ DX, BX
    CMOVQHI DX, BX
    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    MOVQ BX, ret+24(FP)
    RET

// func maxFloat32(data []float32) float32
TEXT ·maxFloat32(SB), NOSPLIT, $-28
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    XORQ BX, BX

    CMPQ CX, $0
    JE done
    XORPS X0, X0
    XORPS X1, X1
    XORQ SI, SI
    MOVLQZX (AX), BX
    MOVQ BX, X0

    CMPB ·hasAVX512VL(SB), $0
    JE loop

    CMPQ CX, $64
    JB loop

    MOVQ CX, DI
    SHRQ $6, DI
    SHLQ $6, DI
    VPBROADCASTD (AX), Z0
loop64:
    VMOVDQU32 (AX)(SI*4), Z1
    VMOVDQU32 64(AX)(SI*4), Z2
    VMOVDQU32 128(AX)(SI*4), Z3
    VMOVDQU32 192(AX)(SI*4), Z4
    VMAXPS Z1, Z2, Z5
    VMAXPS Z3, Z4, Z6
    VMAXPS Z5, Z6, Z1
    VMAXPS Z1, Z0, Z0
    ADDQ $64, SI
    CMPQ SI, DI
    JNE loop64

    VMOVDQU32 swap32+0(SB), Z1
    VPERMI2D Z0, Z0, Z1
    VMAXPS Y1, Y0, Y0

    VMOVDQU32 swap32+32(SB), Y1
    VPERMI2D Y0, Y0, Y1
    VMAXPS X1, X0, X0

    VMOVDQU32 swap32+48(SB), X1
    VPERMI2D X0, X0, X1
    VMAXPS X1, X0, X0
    VZEROUPPER

    MOVAPS X0, X1
    PSRLQ $32, X1
    MOVQ X0, BX
    MOVQ X1, DX
    UCOMISS X0, X1
    CMOVLHI DX, BX

    CMPQ SI, CX
    JE done
    MOVQ BX, X0
loop:
    MOVLQZX (AX)(SI*4), DX
    MOVQ DX, X1
    UCOMISS X0, X1
    CMOVLHI DX, BX
    MOVQ BX, X0
    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    MOVL BX, ret+24(FP)
    RET

// func maxFloat64(data []float64) float64
TEXT ·maxFloat64(SB), NOSPLIT, $-32
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    XORQ BX, BX

    CMPQ CX, $0
    JE done
    XORPD X0, X0
    XORPD X1, X1
    XORQ SI, SI
    MOVQ (AX), BX
    MOVQ BX, X0

    CMPB ·hasAVX512VL(SB), $0
    JE loop

    CMPQ CX, $32
    JB loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI
    VPBROADCASTQ (AX), Z0
loop32:
    VMOVDQU64 (AX)(SI*8), Z1
    VMOVDQU64 64(AX)(SI*8), Z2
    VMOVDQU64 128(AX)(SI*8), Z3
    VMOVDQU64 192(AX)(SI*8), Z4
    VMAXPD Z1, Z2, Z5
    VMAXPD Z3, Z4, Z6
    VMAXPD Z5, Z6, Z1
    VMAXPD Z1, Z0, Z0
    ADDQ $32, SI
    CMPQ SI, DI
    JNE loop32

    VMOVDQU64 swap32+0(SB), Z1
    VPERMI2D Z0, Z0, Z1
    VMAXPD Y1, Y0, Y0

    VMOVDQU64 swap32+32(SB), Y1
    VPERMI2D Y0, Y0, Y1
    VMAXPD X1, X0, X0

    VMOVDQU64 swap32+48(SB), X1
    VPERMI2D X0, X0, X1
    VMAXPD X1, X0, X0
    VZEROUPPER

    MOVQ X0, BX
    CMPQ SI, CX
    JE done
loop:
    MOVQ (AX)(SI*8), DX
    MOVQ DX, X1
    UCOMISD X0, X1
    CMOVQHI DX, BX
    MOVQ BX, X0
    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    MOVQ BX, ret+24(FP)
    RET

// vpmaxu128 is a macro comparing unsigned 128-bit values held in the
// `srcValues` and `maxValues` vectors.
// The `srcIndexes` and `maxIndexes` vectors contain the indexes of elements
// in the value vectors. The remaining K and R arguments are mask and general
// purpose registers needed to hold temporary values during the computation.
// The last M argument is a mask generated by vpmaxu128mask.
//
// The routine uses AVX-512 instructions (VPCMPUQ, VPBLENDMQ) to implement
// the comparison of 128-bit values. The values are expected to be stored
// in the vectors as a little-endian pair of two consecutive quad words.
//
// The results are written to the `maxValues` and `maxIndexes` vectors,
// overwriting the inputs. `srcValues` and `srcIndexes` are read-only
// parameters.
//
// At a high level, for two pairs of quad words forming two 128-bit values
// A and B, the test implemented by this macro is:
//
//      A[1] > B[1] || (A[1] == B[1] && A[0] > B[0])
//
// Values in the source vector for which this expression evaluates to true
// are written to the vector of maximum values, and their indexes are written
// to the vector of indexes.
#define vpmaxu128(srcValues, srcIndexes, maxValues, maxIndexes, K1, K2, R1, R2, R3, M) \
    VPCMPUQ $0, maxValues, srcValues, K1 \
    VPCMPUQ $6, maxValues, srcValues, K2 \
    KMOVB K1, R1 \
    KMOVB K2, R2 \
    MOVB R2, R3 \
    SHLB $1, R3 \
    ANDB R3, R1 \
    ORB R2, R1 \
    ANDB M, R1 \
    MOVB R1, R2 \
    SHRB $1, R2 \
    ORB R2, R1 \
    KMOVB R1, K1 \
    VPBLENDMQ srcValues, maxValues, K1, maxValues \
    VPBLENDMQ srcIndexes, maxIndexes, K1, maxIndexes

// vpmaxu128mask is a macro used to initialize the mask passed as last argument
// to vpmaxu128. The argument M is intended to be a general purpose register.
//
// The bit mask is used to merge the results of the "greater than" and "equal"
// comparisons that are performed on each lane of the maximum vectors. The
// upper bits are used to compute the results of the operation that determines
// which of the pairs of quad words representing the 128-bit elements are the
// maximums.
#define vpmaxu128mask(M) MOVB $0b10101010, M

// func maxBE128(data [][16]byte) []byte
TEXT ·maxBE128(SB), NOSPLIT, $-48
    MOVQ data_base+0(FP), AX
    MOVQ data_len+8(FP), CX
    CMPQ CX, $0
    JE null

    SHLQ $4, CX
    MOVQ CX, DX // len
    MOVQ AX, BX // max
    ADDQ AX, CX // end

    CMPQ DX, $256
    JB loop

    CMPB ·hasAVX512MinMaxBE128(SB), $0
    JE loop

    // Z19 holds a vector of the count by which we increment the vectors of
    // indexes at each loop iteration.
    MOVQ $16, DI
    VPBROADCASTQ DI, Z19

    // Z31 holds the shuffle mask used to convert 128-bit elements from big
    // to little endian so we can apply vectorized comparison instructions.
    VMOVDQU64 bswap128(SB), Z31

    // These vectors hold four lanes of maximum values found in the input.
    VBROADCASTI64X2 (AX), Z0
    VPSHUFB Z31, Z0, Z0
    VMOVDQU64 Z0, Z5
    VMOVDQU64 Z0, Z10
    VMOVDQU64 Z0, Z15

    // These vectors hold four lanes of indexes of the maximum values.
    //
    // We initialize them at zero because we broadcast the first value of the
    // input in the vectors that track the maximums of each lane; in other
    // words, we assume the maximum value is at the first offset and work our
    // way up from there.
    VPXORQ Z2, Z2, Z2
    VPXORQ Z7, Z7, Z7
    VPXORQ Z12, Z12, Z12
    VPXORQ Z17, Z17, Z17

    // These vectors are used to compute the indexes of the maximum values
    // held in [Z0, Z5, Z10, Z15]. Each vector holds a contiguous sequence of
    // indexes; for example, Z3 is initialized with [0, 1, 2, 3]. At each
    // loop iteration, the indexes are incremented by the number of elements
    // consumed from the input (4x4=16).
    VMOVDQU64 indexes128(SB), Z3
    VPXORQ Z8, Z8, Z8
    VPXORQ Z13, Z13, Z13
    VPXORQ Z18, Z18, Z18
    MOVQ $4, DI
    VPBROADCASTQ DI, Z1
    VPADDQ Z1, Z3, Z8
    VPADDQ Z1, Z8, Z13
    VPADDQ Z1, Z13, Z18

    // This bit mask is used to merge the results of the "greater than" and
    // "equal" comparisons that we perform on each lane of the maximum vectors.
    // We use the upper bits to compute four results of the operation which
    // determines which of the pairs of quad words representing the 128-bit
    // elements is the maximum.
    vpmaxu128mask(DI)
    SHRQ $8, DX
    SHLQ $8, DX
    ADDQ AX, DX
loop16:
    // Compute 4x4 maximum values in vector registers, along with their
    // indexes in the input array.
    VMOVDQU64 (AX), Z1
    VMOVDQU64 64(AX), Z6
    VMOVDQU64 128(AX), Z11
    VMOVDQU64 192(AX), Z16
    VPSHUFB Z31, Z1, Z1
    VPSHUFB Z31, Z6, Z6
    VPSHUFB Z31, Z11, Z11
    VPSHUFB Z31, Z16, Z16
    vpmaxu128(Z1, Z3, Z0, Z2, K1, K2, R8, R9, R10, DI)
    vpmaxu128(Z6, Z8, Z5, Z7, K3, K4, R11, R12, R13, DI)
    vpmaxu128(Z11, Z13, Z10, Z12, K1, K2, R8, R9, R10, DI)
    vpmaxu128(Z16, Z18, Z15, Z17, K3, K4, R11, R12, R13, DI)
    VPADDQ Z19, Z3, Z3
    VPADDQ Z19, Z8, Z8
    VPADDQ Z19, Z13, Z13
    VPADDQ Z19, Z18, Z18
    ADDQ $256, AX
    CMPQ AX, DX
    JB loop16

    // After the loop completes, we need to merge the lanes, each of which
    // holds 4 maximum values (so 16 candidates at this stage). The results
    // are reduced into 4 candidates in Z0, with their indexes in Z2.
    vpmaxu128(Z10, Z12, Z0, Z2, K1, K2, R8, R9, R10, DI)
    vpmaxu128(Z15, Z17, Z5, Z7, K3, K4, R11, R12, R13, DI)
    vpmaxu128(Z5, Z7, Z0, Z2, K1, K2, R8, R9, R10, DI)

    // Further reduce the results by swapping the upper and lower parts of the
    // vector registers, and comparing them to determine which values are the
    // greatest. We compare 2x2 values at this step, then 2x1 values at the
    // next to find the index of the maximum.
    VMOVDQU64 swap64+0(SB), Z1
    VMOVDQU64 swap64+0(SB), Z3
    VPERMI2Q Z0, Z0, Z1
    VPERMI2Q Z2, Z2, Z3
    vpmaxu128(Y1, Y3, Y0, Y2, K1, K2, R8, R9, R10, DI)

    VMOVDQU64 swap64+32(SB), Y1
    VMOVDQU64 swap64+32(SB), Y3
    VPERMI2Q Y0, Y0, Y1
    VPERMI2Q Y2, Y2, Y3
    vpmaxu128(X1, X3, X0, X2, K1, K2, R8, R9, R10, DI)
    VZEROUPPER

    // Extract the index of the maximum value computed in the lower 64 bits
    // of X2, and advance the BX pointer to the address of the maximum value.
    MOVQ X2, DX
    SHLQ $4, DX
    ADDQ DX, BX
    CMPQ AX, CX
    JE done

    // Unless the input size was a multiple of 256 bytes, we need to perform
    // a few more iterations on the remaining elements.
    //
    // This loop is also taken if the CPU has no support for AVX-512.
loop:
    MOVQ (AX), R8
    MOVQ (BX), R9
    BSWAPQ R8
    BSWAPQ R9
    CMPQ R8, R9
    JA more
    JB next
    MOVQ 8(AX), R8
    MOVQ 8(BX), R9
    BSWAPQ R8
    BSWAPQ R9
    CMPQ R8, R9
    JBE next
more:
    MOVQ AX, BX
next:
    ADDQ $16, AX
    CMPQ AX, CX
    JB loop
done:
    MOVQ BX, ret_base+24(FP)
    MOVQ $16, ret_len+32(FP)
    MOVQ $16, ret_cap+40(FP)
    RET
null:
    XORQ BX, BX
    MOVQ BX, ret_base+24(FP)
    MOVQ BX, ret_len+32(FP)
    MOVQ BX, ret_cap+40(FP)
    RET
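
// For readers cross-checking the big-endian comparison above, the following
// is a minimal reference sketch in Go of what maxBE128 computes, assuming
// that ties keep the earliest element (as the scalar loop does) and that
// encoding/binary is imported. The helper name maxBE128ref is purely
// illustrative and is not part of this package:
//
//      func maxBE128ref(data [][16]byte) []byte {
//              if len(data) == 0 {
//                      return nil // mirrors the `null` branch above
//              }
//              max := &data[0]
//              for i := 1; i < len(data); i++ {
//                      hi, maxHi := binary.BigEndian.Uint64(data[i][:8]), binary.BigEndian.Uint64(max[:8])
//                      lo, maxLo := binary.BigEndian.Uint64(data[i][8:]), binary.BigEndian.Uint64(max[8:])
//                      // A > B when the high quad word is greater, or equal
//                      // with a greater low quad word.
//                      if hi > maxHi || (hi == maxHi && lo > maxLo) {
//                              max = &data[i]
//                      }
//              }
//              return max[:] // len and cap are 16, matching ret_len/ret_cap above
//      }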