//go:build !purego

// Source: github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/page_bounds_amd64.s

#include "textflag.h"

// bswap128 holds four copies of a 16-byte pattern listing the byte indexes
// 15, 14, ..., 1, 0 (in memory order), i.e. a full reversal of the bytes of a
// 128-bit value. The low quadword must therefore encode the indexes
// 0F 0E 0D 0C 0B 0A 09 08; the previous value 0x08080A0B0C0D0E0F repeated
// index 8 and omitted index 9.
// NOTE(review): no function in this section reads bswap128; it is presumably
// consumed as a byte-shuffle mask by code elsewhere in the file - confirm.
#define bswap128lo 0x08090A0B0C0D0E0F
#define bswap128hi 0x0001020304050607

DATA bswap128+0(SB)/8, $bswap128lo
DATA bswap128+8(SB)/8, $bswap128hi
DATA bswap128+16(SB)/8, $bswap128lo
DATA bswap128+24(SB)/8, $bswap128hi
DATA bswap128+32(SB)/8, $bswap128lo
DATA bswap128+40(SB)/8, $bswap128hi
DATA bswap128+48(SB)/8, $bswap128lo
DATA bswap128+56(SB)/8, $bswap128hi
GLOBL bswap128(SB), RODATA|NOPTR, $64

// indexes128 holds the quadwords 0,0,1,1,2,2,3,3.
// NOTE(review): not referenced by any function in this section.
DATA indexes128+0(SB)/8, $0
DATA indexes128+8(SB)/8, $0
DATA indexes128+16(SB)/8, $1
DATA indexes128+24(SB)/8, $1
DATA indexes128+32(SB)/8, $2
DATA indexes128+40(SB)/8, $2
DATA indexes128+48(SB)/8, $3
DATA indexes128+56(SB)/8, $3
GLOBL indexes128(SB), RODATA|NOPTR, $64

// swap64 holds the quadword indexes 4,5,6,7,2,3,0,1.
// NOTE(review): not referenced by any function in this section.
DATA swap64+0(SB)/8, $4
DATA swap64+8(SB)/8, $5
DATA swap64+16(SB)/8, $6
DATA swap64+24(SB)/8, $7
DATA swap64+32(SB)/8, $2
DATA swap64+40(SB)/8, $3
DATA swap64+48(SB)/8, $0
DATA swap64+56(SB)/8, $1
GLOBL swap64(SB), RODATA|NOPTR, $64

// swap32 holds doubleword permutation indexes used with VPERMI2D by the
// reductions below:
//   offset  0: 8..15  brings the upper 256 bits of a ZMM register down
//   offset 32: 4..7   brings the upper 128 bits of a YMM register down
//   offset 48: 2,3,0,1 swaps the two 64-bit halves of an XMM register
DATA swap32+0(SB)/4, $8
DATA swap32+4(SB)/4, $9
DATA swap32+8(SB)/4, $10
DATA swap32+12(SB)/4, $11
DATA swap32+16(SB)/4, $12
DATA swap32+20(SB)/4, $13
DATA swap32+24(SB)/4, $14
DATA swap32+28(SB)/4, $15
DATA swap32+32(SB)/4, $4
DATA swap32+36(SB)/4, $5
DATA swap32+40(SB)/4, $6
DATA swap32+44(SB)/4, $7
DATA swap32+48(SB)/4, $2
DATA swap32+52(SB)/4, $3
DATA swap32+56(SB)/4, $0
DATA swap32+60(SB)/4, $1
GLOBL swap32(SB), RODATA|NOPTR, $64

// func combinedBoundsInt32(data []int32) (min, max int32)
//
// Computes the signed minimum and maximum of data in a single pass.
// Returns (0, 0) when data is empty. When ·hasAVX512VL (declared outside this
// file) is non-zero and len(data) >= 32, whole groups of 32 values are
// processed with 512-bit vectors; remaining values go through the scalar loop.
TEXT ·combinedBoundsInt32(SB), NOSPLIT, $-32
	MOVQ data_base+0(FP), AX
	MOVQ data_len+8(FP), CX
	XORQ R8, R8 // results default to zero for empty input
	XORQ R9, R9

	CMPQ CX, $0
	JE done
	XORQ SI, SI // SI = index of the next element to examine
	MOVLQZX (AX), R8 // min
	MOVLQZX (AX), R9 // max

	CMPB ·hasAVX512VL(SB), $0
	JE loop

	CMPQ CX, $32
	JB loop

	// DI = len(data) rounded down to a multiple of 32.
	MOVQ CX, DI
	SHRQ $5, DI
	SHLQ $5, DI
	// Seed every lane of the running min (Z0) and max (Z3) with data[0].
	VPBROADCASTD (AX), Z0
	VPBROADCASTD (AX), Z3
loop32:
	VMOVDQU32 (AX)(SI*4), Z1
	VMOVDQU32 64(AX)(SI*4), Z2
	VPMINSD Z1, Z0, Z0
	VPMINSD Z2, Z0, Z0
	VPMAXSD Z1, Z3, Z3
	VPMAXSD Z2, Z3, Z3
	ADDQ $32, SI
	CMPQ SI, DI
	JNE loop32

	// Horizontal reduction: 16 lanes -> 8 lanes.
	VMOVDQU32 swap32+0(SB), Z1
	VMOVDQU32 swap32+0(SB), Z2
	VPERMI2D Z0, Z0, Z1
	VPERMI2D Z3, Z3, Z2
	VPMINSD Y1, Y0, Y0
	VPMAXSD Y2, Y3, Y3

	// 8 lanes -> 4 lanes.
	VMOVDQU32 swap32+32(SB), Y1
	VMOVDQU32 swap32+32(SB), Y2
	VPERMI2D Y0, Y0, Y1
	VPERMI2D Y3, Y3, Y2
	VPMINSD X1, X0, X0
	VPMAXSD X2, X3, X3

	// 4 lanes -> 2 lanes (results in the low 64 bits).
	VMOVDQU32 swap32+48(SB), X1
	VMOVDQU32 swap32+48(SB), X2
	VPERMI2D X0, X0, X1
	VPERMI2D X3, X3, X2
	VPMINSD X1, X0, X0
	VPMAXSD X2, X3, X3
	VZEROUPPER

	// Combine the two remaining 32-bit lanes with scalar compares.
	MOVQ X0, BX
	MOVQ X3, DX
	MOVL BX, R8
	MOVL DX, R9
	SHRQ $32, BX
	SHRQ $32, DX
	CMPL BX, R8
	CMOVLLT BX, R8
	CMPL DX, R9
	CMOVLGT DX, R9

	CMPQ SI, CX
	JE done
loop:
	// Scalar path: one element per iteration, signed comparisons.
	MOVLQZX (AX)(SI*4), DX
	CMPL DX, R8
	CMOVLLT DX, R8
	CMPL DX, R9
	CMOVLGT DX, R9
	INCQ SI
	CMPQ SI, CX
	JNE loop
done:
	MOVL R8, min+24(FP)
	MOVL R9, max+28(FP)
	RET

// func combinedBoundsInt64(data []int64) (min, max int64)
//
// Computes the signed minimum and maximum of data in a single pass.
// Returns (0, 0) when data is empty. When ·hasAVX512VL is non-zero and
// len(data) >= 16, whole groups of 16 values are processed with 512-bit
// vectors; remaining values go through the scalar loop.
TEXT ·combinedBoundsInt64(SB), NOSPLIT, $-40
	MOVQ data_base+0(FP), AX
	MOVQ data_len+8(FP), CX
	XORQ R8, R8 // results default to zero for empty input
	XORQ R9, R9

	CMPQ CX, $0
	JE done
	XORQ SI, SI // SI = index of the next element to examine
	MOVQ (AX), R8 // min
	MOVQ (AX), R9 // max

	CMPB ·hasAVX512VL(SB), $0
	JE loop

	CMPQ CX, $16
	JB loop

	// DI = len(data) rounded down to a multiple of 16.
	MOVQ CX, DI
	SHRQ $4, DI
	SHLQ $4, DI
	// Seed every lane of the running min (Z0) and max (Z3) with data[0].
	VPBROADCASTQ (AX), Z0
	VPBROADCASTQ (AX), Z3
loop16:
	VMOVDQU64 (AX)(SI*8), Z1
	VMOVDQU64 64(AX)(SI*8), Z2
	VPMINSQ Z1, Z0, Z0
	VPMINSQ Z2, Z0, Z0
	VPMAXSQ Z1, Z3, Z3
	VPMAXSQ Z2, Z3, Z3
	ADDQ $16, SI
	CMPQ SI, DI
	JNE loop16

	// Horizontal reduction. The swap32 doubleword indexes move adjacent
	// doubleword pairs together, so 64-bit lanes stay intact.
	VMOVDQU32 swap32+0(SB), Z1
	VMOVDQU32 swap32+0(SB), Z2
	VPERMI2D Z0, Z0, Z1
	VPERMI2D Z3, Z3, Z2
	VPMINSQ Y1, Y0, Y0
	VPMAXSQ Y2, Y3, Y3

	VMOVDQU32 swap32+32(SB), Y1
	VMOVDQU32 swap32+32(SB), Y2
	VPERMI2D Y0, Y0, Y1
	VPERMI2D Y3, Y3, Y2
	VPMINSQ X1, X0, X0
	VPMAXSQ X2, X3, X3

	VMOVDQU32 swap32+48(SB), X1
	VMOVDQU32 swap32+48(SB), X2
	VPERMI2D X0, X0, X1
	VPERMI2D X3, X3, X2
	VPMINSQ X1, X0, X0
	VPMAXSQ X2, X3, X3
	VZEROUPPER

	// The low 64-bit lane now holds the final vector result.
	MOVQ X0, R8
	MOVQ X3, R9
	CMPQ SI, CX
	JE done
loop:
	// Scalar path: one element per iteration, signed comparisons.
	MOVQ (AX)(SI*8), DX
	CMPQ DX, R8
	CMOVQLT DX, R8
	CMPQ DX, R9
	CMOVQGT DX, R9
	INCQ SI
	CMPQ SI, CX
	JNE loop
done:
	MOVQ R8, min+24(FP)
	MOVQ R9, max+32(FP)
	RET

// func combinedBoundsUint32(data []uint32) (min, max uint32)
//
// Computes the unsigned minimum and maximum of data in a single pass.
// Returns (0, 0) when data is empty. When ·hasAVX512VL is non-zero and
// len(data) >= 32, whole groups of 32 values are processed with 512-bit
// vectors; remaining values go through the scalar loop.
TEXT ·combinedBoundsUint32(SB), NOSPLIT, $-32
	MOVQ data_base+0(FP), AX
	MOVQ data_len+8(FP), CX
	XORQ R8, R8 // results default to zero for empty input
	XORQ R9, R9

	CMPQ CX, $0
	JE done
	XORQ SI, SI // SI = index of the next element to examine
	MOVLQZX (AX), R8 // min
	MOVLQZX (AX), R9 // max

	CMPB ·hasAVX512VL(SB), $0
	JE loop

	CMPQ CX, $32
	JB loop

	// DI = len(data) rounded down to a multiple of 32.
	MOVQ CX, DI
	SHRQ $5, DI
	SHLQ $5, DI
	// Seed every lane of the running min (Z0) and max (Z3) with data[0].
	VPBROADCASTD (AX), Z0
	VPBROADCASTD (AX), Z3
loop32:
	VMOVDQU32 (AX)(SI*4), Z1
	VMOVDQU32 64(AX)(SI*4), Z2
	VPMINUD Z1, Z0, Z0
	VPMINUD Z2, Z0, Z0
	VPMAXUD Z1, Z3, Z3
	VPMAXUD Z2, Z3, Z3
	ADDQ $32, SI
	CMPQ SI, DI
	JNE loop32

	// Horizontal reduction: 16 lanes -> 8 lanes.
	VMOVDQU32 swap32+0(SB), Z1
	VMOVDQU32 swap32+0(SB), Z2
	VPERMI2D Z0, Z0, Z1
	VPERMI2D Z3, Z3, Z2
	VPMINUD Y1, Y0, Y0
	VPMAXUD Y2, Y3, Y3

	// 8 lanes -> 4 lanes.
	VMOVDQU32 swap32+32(SB), Y1
	VMOVDQU32 swap32+32(SB), Y2
	VPERMI2D Y0, Y0, Y1
	VPERMI2D Y3, Y3, Y2
	VPMINUD X1, X0, X0
	VPMAXUD X2, X3, X3

	// 4 lanes -> 2 lanes (results in the low 64 bits).
	VMOVDQU32 swap32+48(SB), X1
	VMOVDQU32 swap32+48(SB), X2
	VPERMI2D X0, X0, X1
	VPERMI2D X3, X3, X2
	VPMINUD X1, X0, X0
	VPMAXUD X2, X3, X3
	VZEROUPPER

	// Combine the two remaining 32-bit lanes with unsigned scalar compares.
	MOVQ X0, BX
	MOVQ X3, DX
	MOVL BX, R8
	MOVL DX, R9
	SHRQ $32, BX
	SHRQ $32, DX
	CMPL BX, R8
	CMOVLCS BX, R8
	CMPL DX, R9
	CMOVLHI DX, R9

	CMPQ SI, CX
	JE done
loop:
	// Scalar path: one element per iteration, unsigned comparisons.
	MOVLQZX (AX)(SI*4), DX
	CMPL DX, R8
	CMOVLCS DX, R8
	CMPL DX, R9
	CMOVLHI DX, R9
	INCQ SI
	CMPQ SI, CX
	JNE loop
done:
	MOVL R8, min+24(FP)
	MOVL R9, max+28(FP)
	RET

// func combinedBoundsUint64(data []uint64) (min, max uint64)
//
// Computes the unsigned minimum and maximum of data in a single pass.
// Returns (0, 0) when data is empty. When ·hasAVX512VL is non-zero and
// len(data) >= 16, whole groups of 16 values are processed with 512-bit
// vectors; remaining values go through the scalar loop.
TEXT ·combinedBoundsUint64(SB), NOSPLIT, $-40
	MOVQ data_base+0(FP), AX
	MOVQ data_len+8(FP), CX
	XORQ R8, R8 // results default to zero for empty input
	XORQ R9, R9

	CMPQ CX, $0
	JE done
	XORQ SI, SI // SI = index of the next element to examine
	MOVQ (AX), R8 // min
	MOVQ (AX), R9 // max

	CMPB ·hasAVX512VL(SB), $0
	JE loop

	CMPQ CX, $16
	JB loop

	// DI = len(data) rounded down to a multiple of 16.
	MOVQ CX, DI
	SHRQ $4, DI
	SHLQ $4, DI
	// Seed every lane of the running min (Z0) and max (Z3) with data[0].
	VPBROADCASTQ (AX), Z0
	VPBROADCASTQ (AX), Z3
loop16:
	VMOVDQU64 (AX)(SI*8), Z1
	VMOVDQU64 64(AX)(SI*8), Z2
	VPMINUQ Z1, Z0, Z0
	VPMINUQ Z2, Z0, Z0
	VPMAXUQ Z1, Z3, Z3
	VPMAXUQ Z2, Z3, Z3
	ADDQ $16, SI
	CMPQ SI, DI
	JNE loop16

	// Horizontal reduction. The swap32 doubleword indexes move adjacent
	// doubleword pairs together, so 64-bit lanes stay intact.
	VMOVDQU32 swap32+0(SB), Z1
	VMOVDQU32 swap32+0(SB), Z2
	VPERMI2D Z0, Z0, Z1
	VPERMI2D Z3, Z3, Z2
	VPMINUQ Y1, Y0, Y0
	VPMAXUQ Y2, Y3, Y3

	VMOVDQU32 swap32+32(SB), Y1
	VMOVDQU32 swap32+32(SB), Y2
	VPERMI2D Y0, Y0, Y1
	VPERMI2D Y3, Y3, Y2
	VPMINUQ X1, X0, X0
	VPMAXUQ X2, X3, X3

	VMOVDQU32 swap32+48(SB), X1
	VMOVDQU32 swap32+48(SB), X2
	VPERMI2D X0, X0, X1
	VPERMI2D X3, X3, X2
	VPMINUQ X1, X0, X0
	VPMAXUQ X2, X3, X3
	VZEROUPPER

	// The low 64-bit lane now holds the final vector result.
	MOVQ X0, R8
	MOVQ X3, R9
	CMPQ SI, CX
	JE done
loop:
	// Scalar path: one element per iteration, unsigned comparisons.
	MOVQ (AX)(SI*8), DX
	CMPQ DX, R8
	CMOVQCS DX, R8
	CMPQ DX, R9
	CMOVQHI DX, R9
	INCQ SI
	CMPQ SI, CX
	JNE loop
done:
	MOVQ R8, min+24(FP)
	MOVQ R9, max+32(FP)
	RET

// func combinedBoundsFloat32(data []float32) (min, max float32)
//
// Computes the minimum and maximum of data in a single pass, carrying the
// results as raw 32-bit patterns in R8/R9 and mirroring them in X0/X1 for
// floating-point comparison. Returns (0, 0) when data is empty. When
// ·hasAVX512VL is non-zero and len(data) >= 32, whole groups of 32 values are
// processed with 512-bit vectors; remaining values go through the scalar loop.
TEXT ·combinedBoundsFloat32(SB), NOSPLIT, $-32
	MOVQ data_base+0(FP), AX
	MOVQ data_len+8(FP), CX
	XORQ R8, R8 // results default to zero for empty input
	XORQ R9, R9

	CMPQ CX, $0
	JE done
	XORPS X0, X0
	XORPS X1, X1
	XORQ SI, SI // SI = index of the next element to examine
	MOVLQZX (AX), R8 // min
	MOVLQZX (AX), R9 // max
	MOVQ R8, X0
	MOVQ R9, X1

	CMPB ·hasAVX512VL(SB), $0
	JE loop

	CMPQ CX, $32
	JB loop

	// DI = len(data) rounded down to a multiple of 32.
	MOVQ CX, DI
	SHRQ $5, DI
	SHLQ $5, DI
	// Seed every lane of the running min (Z0) and max (Z3) with data[0].
	VPBROADCASTD (AX), Z0
	VPBROADCASTD (AX), Z3
loop32:
	VMOVDQU32 (AX)(SI*4), Z1
	VMOVDQU32 64(AX)(SI*4), Z2
	VMINPS Z1, Z0, Z0
	VMINPS Z2, Z0, Z0
	VMAXPS Z1, Z3, Z3
	VMAXPS Z2, Z3, Z3
	ADDQ $32, SI
	CMPQ SI, DI
	JNE loop32

	// Horizontal reduction: 16 lanes -> 8 lanes.
	VMOVDQU32 swap32+0(SB), Z1
	VMOVDQU32 swap32+0(SB), Z2
	VPERMI2D Z0, Z0, Z1
	VPERMI2D Z3, Z3, Z2
	VMINPS Y1, Y0, Y0
	VMAXPS Y2, Y3, Y3

	// 8 lanes -> 4 lanes.
	VMOVDQU32 swap32+32(SB), Y1
	VMOVDQU32 swap32+32(SB), Y2
	VPERMI2D Y0, Y0, Y1
	VPERMI2D Y3, Y3, Y2
	VMINPS X1, X0, X0
	VMAXPS X2, X3, X3

	// 4 lanes -> 2 lanes (results in the low 64 bits).
	VMOVDQU32 swap32+48(SB), X1
	VMOVDQU32 swap32+48(SB), X2
	VPERMI2D X0, X0, X1
	VPERMI2D X3, X3, X2
	VMINPS X1, X0, X0
	VMAXPS X2, X3, X3
	VZEROUPPER

	MOVAPS X0, X1
	MOVAPS X3, X2

	// Min: compare lane 1 (shifted into X1) against lane 0 (X0).
	PSRLQ $32, X1
	MOVQ X0, R8
	MOVQ X1, R10
	UCOMISS X0, X1
	CMOVLCS R10, R8

	// Max: compare lane 1 (shifted into X2) against lane 0 (X3).
	PSRLQ $32, X2
	MOVQ X3, R9
	MOVQ X2, R11
	UCOMISS X3, X2
	CMOVLHI R11, R9

	CMPQ SI, CX
	JE done
	// Reload the comparison registers for the scalar tail.
	MOVQ R8, X0
	MOVQ R9, X1
loop:
	// Scalar path: X0/X1 mirror the current min/max held in R8/R9.
	MOVLQZX (AX)(SI*4), DX
	MOVQ DX, X2
	UCOMISS X0, X2
	CMOVLCS DX, R8
	UCOMISS X1, X2
	CMOVLHI DX, R9
	MOVQ R8, X0
	MOVQ R9, X1
	INCQ SI
	CMPQ SI, CX
	JNE loop
done:
	MOVL R8, min+24(FP)
	MOVL R9, max+28(FP)
	RET

// func combinedBoundsFloat64(data []float64) (min, max float64)
//
// Computes the minimum and maximum of data in a single pass, carrying the
// results as raw 64-bit patterns in R8/R9 and mirroring them in X0/X1 for
// floating-point comparison. Returns (0, 0) when data is empty. When
// ·hasAVX512VL is non-zero and len(data) >= 16, whole groups of 16 values are
// processed with 512-bit vectors; remaining values go through the scalar loop.
TEXT ·combinedBoundsFloat64(SB), NOSPLIT, $-40
	MOVQ data_base+0(FP), AX
	MOVQ data_len+8(FP), CX
	XORQ R8, R8 // results default to zero for empty input
	XORQ R9, R9

	CMPQ CX, $0
	JE done
	XORPD X0, X0
	XORPD X1, X1
	XORQ SI, SI // SI = index of the next element to examine
	MOVQ (AX), R8 // min
	MOVQ (AX), R9 // max
	MOVQ R8, X0
	MOVQ R9, X1

	CMPB ·hasAVX512VL(SB), $0
	JE loop

	CMPQ CX, $16
	JB loop

	// DI = len(data) rounded down to a multiple of 16.
	MOVQ CX, DI
	SHRQ $4, DI
	SHLQ $4, DI
	// Seed every lane of the running min (Z0) and max (Z3) with data[0].
	VPBROADCASTQ (AX), Z0
	VPBROADCASTQ (AX), Z3
loop16:
	VMOVDQU64 (AX)(SI*8), Z1
	VMOVDQU64 64(AX)(SI*8), Z2
	VMINPD Z1, Z0, Z0
	VMINPD Z2, Z0, Z0
	VMAXPD Z1, Z3, Z3
	VMAXPD Z2, Z3, Z3
	ADDQ $16, SI
	CMPQ SI, DI
	JNE loop16

	// Horizontal reduction. The swap32 doubleword indexes move adjacent
	// doubleword pairs together, so 64-bit lanes stay intact.
	VMOVDQU64 swap32+0(SB), Z1
	VMOVDQU64 swap32+0(SB), Z2
	VPERMI2D Z0, Z0, Z1
	VPERMI2D Z3, Z3, Z2
	VMINPD Y1, Y0, Y0
	VMAXPD Y2, Y3, Y3

	VMOVDQU64 swap32+32(SB), Y1
	VMOVDQU64 swap32+32(SB), Y2
	VPERMI2D Y0, Y0, Y1
	VPERMI2D Y3, Y3, Y2
	VMINPD X1, X0, X0
	VMAXPD X2, X3, X3

	VMOVDQU64 swap32+48(SB), X1
	VMOVDQU64 swap32+48(SB), X2
	VPERMI2D X0, X0, X1
	VPERMI2D X3, X3, X2
	VMINPD X1, X0, X0
	// The max lands in X1 so that X0/X1 are ready for the scalar tail.
	VMAXPD X2, X3, X1
	VZEROUPPER

	MOVQ X0, R8
	MOVQ X1, R9
	CMPQ SI, CX
	JE done
loop:
	// Scalar path: X0/X1 mirror the current min/max held in R8/R9.
	MOVQ (AX)(SI*8), DX
	MOVQ DX, X2
	UCOMISD X0, X2
	CMOVQCS DX, R8
	UCOMISD X1, X2
	CMOVQHI DX, R9
	MOVQ R8, X0
	MOVQ R9, X1
	INCQ SI
	CMPQ SI, CX
	JNE loop
done:
	MOVQ R8, min+24(FP)
	MOVQ R9, max+32(FP)
	RET