// Source: github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/dictionary_amd64.s

//go:build !purego

#include "textflag.h"

#define errnoIndexOutOfBounds 1

// The dictionaryBounds* functions compute the minimum and maximum of the
// dictionary values selected by a list of indexes, i.e. of dict[indexes[i]].
//
// Shared contract, as implemented below:
//   - When indexes is empty, min and max are returned zeroed and err is 0.
//   - Every index is compared against len(dict) with an unsigned 32-bit
//     comparison, so negative indexes are rejected as well. On the first
//     invalid index err is set to errnoIndexOutOfBounds and the function
//     returns; min and max are not meaningful in that case.
//   - When there are at least 8 indexes and ·hasAVX512VL is non-zero, the
//     largest multiple of 8 indexes is processed 8 at a time with gather
//     instructions; the remainder goes through the scalar loop.
//   - The vector path always executes VZEROUPPER before leaving, including on
//     the out-of-bounds error path (label indexOutOfBoundsAVX512).

// func dictionaryBoundsInt32(dict []int32, indexes []int32) (min, max int32, err errno)
TEXT ·dictionaryBoundsInt32(SB), NOSPLIT, $0-64
    MOVQ dict_base+0(FP), AX
    MOVQ dict_len+8(FP), BX

    MOVQ indexes_base+24(FP), CX
    MOVQ indexes_len+32(FP), DX

    XORQ R10, R10 // min
    XORQ R11, R11 // max
    XORQ R12, R12 // err
    XORQ SI, SI   // position in indexes

    CMPQ DX, $0
    JE return

    // Seed min and max with the value of the first index.
    MOVL (CX), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVL (AX)(DI*4), R10
    MOVL R10, R11

    CMPQ DX, $8
    JB test

    CMPB ·hasAVX512VL(SB), $0
    JE test

    // DI = len(indexes) rounded down to a multiple of 8.
    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI

    MOVQ $0xFFFF, R8
    KMOVW R8, K1

    VPBROADCASTD BX, Y2  // [len(dict)...]
    VPBROADCASTD R10, Y3 // [min...]
    VMOVDQU32 Y3, Y4     // [max...]
loopAVX512:
    VMOVDQU32 (CX)(SI*4), Y0
    // K2 gets one bit per lane where index < len(dict) (unsigned); all 8
    // bits must be set for the gather to be safe.
    VPCMPUD $1, Y2, Y0, K2
    KMOVW K2, R9
    CMPB R9, $0xFF
    JNE indexOutOfBoundsAVX512
    VPGATHERDD (AX)(Y0*4), K1, Y1
    VPMINSD Y1, Y3, Y3
    VPMAXSD Y1, Y4, Y4
    KMOVW R8, K1 // the gather clears its mask register, reload it
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loopAVX512

    // Horizontal reduction: fold the upper 128 bits onto the lower ones, then
    // lanes 2-3 onto lanes 0-1, then lane 1 onto lane 0.
    VPERM2I128 $1, Y3, Y3, Y0
    VPERM2I128 $1, Y4, Y4, Y1
    VPMINSD Y0, Y3, Y3
    VPMAXSD Y1, Y4, Y4

    VPSHUFD $0b1110, Y3, Y0
    VPSHUFD $0b1110, Y4, Y1
    VPMINSD Y0, Y3, Y3
    VPMAXSD Y1, Y4, Y4

    VPSHUFD $1, Y3, Y0
    VPSHUFD $1, Y4, Y1
    VPMINSD Y0, Y3, Y3
    VPMAXSD Y1, Y4, Y4

    // Only the low 32 bits of R10/R11 are consumed afterwards (CMPL/MOVL).
    MOVQ X3, R10
    MOVQ X4, R11
    ANDQ $0xFFFFFFFF, R10
    ANDQ $0xFFFFFFFF, R11

    VZEROUPPER
    JMP test
loop:
    MOVL (CX)(SI*4), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVL (AX)(DI*4), DI
    CMPL DI, R10
    CMOVLLT DI, R10 // signed less-than
    CMPL DI, R11
    CMOVLGT DI, R11 // signed greater-than
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
return:
    MOVL R10, min+48(FP)
    MOVL R11, max+52(FP)
    MOVQ R12, err+56(FP)
    RET
indexOutOfBoundsAVX512:
    // Leaving the vector loop early: clear the upper vector state first.
    VZEROUPPER
indexOutOfBounds:
    MOVQ $errnoIndexOutOfBounds, R12
    JMP return

// func dictionaryBoundsInt64(dict []int64, indexes []int32) (min, max int64, err errno)
TEXT ·dictionaryBoundsInt64(SB), NOSPLIT, $0-72
    MOVQ dict_base+0(FP), AX
    MOVQ dict_len+8(FP), BX

    MOVQ indexes_base+24(FP), CX
    MOVQ indexes_len+32(FP), DX

    XORQ R10, R10 // min
    XORQ R11, R11 // max
    XORQ R12, R12 // err
    XORQ SI, SI   // position in indexes

    CMPQ DX, $0
    JE return

    // Seed min and max with the value of the first index.
    MOVL (CX), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVQ (AX)(DI*8), R10
    MOVQ R10, R11

    CMPQ DX, $8
    JB test

    CMPB ·hasAVX512VL(SB), $0
    JE test

    // DI = len(indexes) rounded down to a multiple of 8.
    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI

    MOVQ $0xFFFF, R8
    KMOVW R8, K1

    VPBROADCASTD BX, Y2  // [len(dict)...]
    VPBROADCASTQ R10, Z3 // [min...]
    VMOVDQU64 Z3, Z4     // [max...]
loopAVX512:
    VMOVDQU32 (CX)(SI*4), Y0
    // K2 has a bit set for every lane where index < len(dict) (unsigned).
    VPCMPUD $1, Y2, Y0, K2
    KMOVW K2, R9
    CMPB R9, $0xFF
    JNE indexOutOfBoundsAVX512
    VPGATHERDQ (AX)(Y0*8), K1, Z1
    VPMINSQ Z1, Z3, Z3
    VPMAXSQ Z1, Z4, Z4
    KMOVW R8, K1 // the gather clears its mask register, reload it
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loopAVX512

    // Horizontal reduction: within each 256-bit half fold lanes 2-3 onto
    // lanes 0-1 and lane 1 onto lane 0, then fold the upper half onto the
    // lower one.
    VPERMQ $0b1110, Z3, Z0
    VPERMQ $0b1110, Z4, Z1
    VPMINSQ Z0, Z3, Z3
    VPMAXSQ Z1, Z4, Z4

    VPERMQ $1, Z3, Z0
    VPERMQ $1, Z4, Z1
    VPMINSQ Z0, Z3, Z3
    VPMAXSQ Z1, Z4, Z4

    VSHUFF64X2 $2, Z3, Z3, Z0
    VSHUFF64X2 $2, Z4, Z4, Z1
    VPMINSQ Z0, Z3, Z3
    VPMAXSQ Z1, Z4, Z4

    MOVQ X3, R10
    MOVQ X4, R11

    VZEROUPPER
    JMP test
loop:
    MOVL (CX)(SI*4), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVQ (AX)(DI*8), DI
    CMPQ DI, R10
    CMOVQLT DI, R10 // signed less-than
    CMPQ DI, R11
    CMOVQGT DI, R11 // signed greater-than
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
return:
    MOVQ R10, min+48(FP)
    MOVQ R11, max+56(FP)
    MOVQ R12, err+64(FP)
    RET
indexOutOfBoundsAVX512:
    // Leaving the vector loop early: clear the upper vector state first.
    VZEROUPPER
indexOutOfBounds:
    MOVQ $errnoIndexOutOfBounds, R12
    JMP return

// func dictionaryBoundsFloat32(dict []float32, indexes []int32) (min, max float32, err errno)
TEXT ·dictionaryBoundsFloat32(SB), NOSPLIT, $0-64
    MOVQ dict_base+0(FP), AX
    MOVQ dict_len+8(FP), BX

    MOVQ indexes_base+24(FP), CX
    MOVQ indexes_len+32(FP), DX

    PXOR X3, X3   // min
    PXOR X4, X4   // max
    XORQ R12, R12 // err
    XORQ SI, SI   // position in indexes

    CMPQ DX, $0
    JE return

    // Seed min and max with the value of the first index.
    MOVL (CX), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVSS (AX)(DI*4), X3
    MOVAPS X3, X4

    CMPQ DX, $8
    JB test

    CMPB ·hasAVX512VL(SB), $0
    JE test

    // DI = len(indexes) rounded down to a multiple of 8.
    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI

    MOVQ $0xFFFF, R8
    KMOVW R8, K1

    VPBROADCASTD BX, Y2 // [len(dict)...]
    VPBROADCASTD X3, Y3 // [min...]
    VMOVDQU32 Y3, Y4    // [max...]
loopAVX512:
    VMOVDQU32 (CX)(SI*4), Y0
    // K2 has a bit set for every lane where index < len(dict) (unsigned).
    VPCMPUD $1, Y2, Y0, K2
    KMOVW K2, R9
    CMPB R9, $0xFF
    JNE indexOutOfBoundsAVX512
    VPGATHERDD (AX)(Y0*4), K1, Y1
    VMINPS Y1, Y3, Y3
    VMAXPS Y1, Y4, Y4
    KMOVW R8, K1 // the gather clears its mask register, reload it
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loopAVX512

    // Horizontal reduction down to lane 0 of X3 (min) and X4 (max).
    VPERM2I128 $1, Y3, Y3, Y0
    VPERM2I128 $1, Y4, Y4, Y1
    VMINPS Y0, Y3, Y3
    VMAXPS Y1, Y4, Y4

    VPSHUFD $0b1110, Y3, Y0
    VPSHUFD $0b1110, Y4, Y1
    VMINPS Y0, Y3, Y3
    VMAXPS Y1, Y4, Y4

    VPSHUFD $1, Y3, Y0
    VPSHUFD $1, Y4, Y1
    VMINPS Y0, Y3, Y3
    VMAXPS Y1, Y4, Y4

    VZEROUPPER
    JMP test
loop:
    MOVL (CX)(SI*4), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVSS (AX)(DI*4), X1
    UCOMISS X3, X1
    JAE skipAssignMin
    MOVAPS X1, X3
skipAssignMin:
    UCOMISS X4, X1
    JBE skipAssignMax
    MOVAPS X1, X4
skipAssignMax:
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
return:
    MOVSS X3, min+48(FP)
    MOVSS X4, max+52(FP)
    MOVQ R12, err+56(FP)
    RET
indexOutOfBoundsAVX512:
    // Leaving the vector loop early: clear the upper vector state first.
    VZEROUPPER
indexOutOfBounds:
    MOVQ $errnoIndexOutOfBounds, R12
    JMP return

// func dictionaryBoundsFloat64(dict []float64, indexes []int32) (min, max float64, err errno)
TEXT ·dictionaryBoundsFloat64(SB), NOSPLIT, $0-72
    MOVQ dict_base+0(FP), AX
    MOVQ dict_len+8(FP), BX

    MOVQ indexes_base+24(FP), CX
    MOVQ indexes_len+32(FP), DX

    PXOR X3, X3   // min
    PXOR X4, X4   // max
    XORQ R12, R12 // err
    XORQ SI, SI   // position in indexes

    CMPQ DX, $0
    JE return

    // Seed min and max with the value of the first index.
    MOVL (CX), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVSD (AX)(DI*8), X3
    MOVAPS X3, X4

    CMPQ DX, $8
    JB test

    CMPB ·hasAVX512VL(SB), $0
    JE test

    // DI = len(indexes) rounded down to a multiple of 8.
    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI

    MOVQ $0xFFFF, R8
    KMOVW R8, K1

    VPBROADCASTD BX, Y2 // [len(dict)...]
    VPBROADCASTQ X3, Z3 // [min...]
    VMOVDQU64 Z3, Z4    // [max...]
loopAVX512:
    VMOVDQU32 (CX)(SI*4), Y0
    // K2 has a bit set for every lane where index < len(dict) (unsigned).
    VPCMPUD $1, Y2, Y0, K2
    KMOVW K2, R9
    CMPB R9, $0xFF
    JNE indexOutOfBoundsAVX512
    VPGATHERDQ (AX)(Y0*8), K1, Z1
    VMINPD Z1, Z3, Z3
    VMAXPD Z1, Z4, Z4
    KMOVW R8, K1 // the gather clears its mask register, reload it
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loopAVX512

    // Horizontal reduction down to lane 0 of X3 (min) and X4 (max).
    VPERMQ $0b1110, Z3, Z0
    VPERMQ $0b1110, Z4, Z1
    VMINPD Z0, Z3, Z3
    VMAXPD Z1, Z4, Z4

    VPERMQ $1, Z3, Z0
    VPERMQ $1, Z4, Z1
    VMINPD Z0, Z3, Z3
    VMAXPD Z1, Z4, Z4

    VSHUFF64X2 $2, Z3, Z3, Z0
    VSHUFF64X2 $2, Z4, Z4, Z1
    VMINPD Z0, Z3, Z3
    VMAXPD Z1, Z4, Z4

    VZEROUPPER
    JMP test
loop:
    MOVL (CX)(SI*4), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVSD (AX)(DI*8), X1
    UCOMISD X3, X1
    JAE skipAssignMin
    MOVAPD X1, X3
skipAssignMin:
    UCOMISD X4, X1
    JBE skipAssignMax
    MOVAPD X1, X4
skipAssignMax:
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
return:
    MOVSD X3, min+48(FP)
    MOVSD X4, max+56(FP)
    MOVQ R12, err+64(FP)
    RET
indexOutOfBoundsAVX512:
    // Leaving the vector loop early: clear the upper vector state first.
    VZEROUPPER
indexOutOfBounds:
    MOVQ $errnoIndexOutOfBounds, R12
    JMP return

// func dictionaryBoundsUint32(dict []uint32, indexes []int32) (min, max uint32, err errno)
TEXT ·dictionaryBoundsUint32(SB), NOSPLIT, $0-64
    MOVQ dict_base+0(FP), AX
    MOVQ dict_len+8(FP), BX

    MOVQ indexes_base+24(FP), CX
    MOVQ indexes_len+32(FP), DX

    XORQ R10, R10 // min
    XORQ R11, R11 // max
    XORQ R12, R12 // err
    XORQ SI, SI   // position in indexes

    CMPQ DX, $0
    JE return

    // Seed min and max with the value of the first index.
    MOVL (CX), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVL (AX)(DI*4), R10
    MOVL R10, R11

    CMPQ DX, $8
    JB test

    CMPB ·hasAVX512VL(SB), $0
    JE test

    // DI = len(indexes) rounded down to a multiple of 8.
    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI

    MOVQ $0xFFFF, R8
    KMOVW R8, K1

    VPBROADCASTD BX, Y2  // [len(dict)...]
    VPBROADCASTD R10, Y3 // [min...]
    VMOVDQU32 Y3, Y4     // [max...]
loopAVX512:
    VMOVDQU32 (CX)(SI*4), Y0
    // K2 has a bit set for every lane where index < len(dict) (unsigned).
    VPCMPUD $1, Y2, Y0, K2
    KMOVW K2, R9
    CMPB R9, $0xFF
    JNE indexOutOfBoundsAVX512
    VPGATHERDD (AX)(Y0*4), K1, Y1
    VPMINUD Y1, Y3, Y3
    VPMAXUD Y1, Y4, Y4
    KMOVW R8, K1 // the gather clears its mask register, reload it
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loopAVX512

    // Horizontal reduction down to lane 0 of X3 (min) and X4 (max).
    VPERM2I128 $1, Y3, Y3, Y0
    VPERM2I128 $1, Y4, Y4, Y1
    VPMINUD Y0, Y3, Y3
    VPMAXUD Y1, Y4, Y4

    VPSHUFD $0b1110, Y3, Y0
    VPSHUFD $0b1110, Y4, Y1
    VPMINUD Y0, Y3, Y3
    VPMAXUD Y1, Y4, Y4

    VPSHUFD $1, Y3, Y0
    VPSHUFD $1, Y4, Y1
    VPMINUD Y0, Y3, Y3
    VPMAXUD Y1, Y4, Y4

    // Only the low 32 bits of R10/R11 are consumed afterwards (CMPL/MOVL).
    MOVQ X3, R10
    MOVQ X4, R11
    ANDQ $0xFFFFFFFF, R10
    ANDQ $0xFFFFFFFF, R11

    VZEROUPPER
    JMP test
loop:
    MOVL (CX)(SI*4), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVL (AX)(DI*4), DI
    CMPL DI, R10
    CMOVLCS DI, R10 // unsigned below
    CMPL DI, R11
    CMOVLHI DI, R11 // unsigned above
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
return:
    MOVL R10, min+48(FP)
    MOVL R11, max+52(FP)
    MOVQ R12, err+56(FP)
    RET
indexOutOfBoundsAVX512:
    // Leaving the vector loop early: clear the upper vector state first.
    VZEROUPPER
indexOutOfBounds:
    MOVQ $errnoIndexOutOfBounds, R12
    JMP return

// func dictionaryBoundsUint64(dict []uint64, indexes []int32) (min, max uint64, err errno)
TEXT ·dictionaryBoundsUint64(SB), NOSPLIT, $0-72
    MOVQ dict_base+0(FP), AX
    MOVQ dict_len+8(FP), BX

    MOVQ indexes_base+24(FP), CX
    MOVQ indexes_len+32(FP), DX

    XORQ R10, R10 // min
    XORQ R11, R11 // max
    XORQ R12, R12 // err
    XORQ SI, SI   // position in indexes

    CMPQ DX, $0
    JE return

    // Seed min and max with the value of the first index (SI is zero here).
    MOVL (CX)(SI*4), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVQ (AX)(DI*8), R10
    MOVQ R10, R11

    CMPQ DX, $8
    JB test

    CMPB ·hasAVX512VL(SB), $0
    JE test

    // DI = len(indexes) rounded down to a multiple of 8.
    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI

    MOVQ $0xFFFF, R8
    KMOVW R8, K1

    VPBROADCASTD BX, Y2  // [len(dict)...]
    VPBROADCASTQ R10, Z3 // [min...]
    VMOVDQU64 Z3, Z4     // [max...]
loopAVX512:
    VMOVDQU32 (CX)(SI*4), Y0
    // K2 has a bit set for every lane where index < len(dict) (unsigned).
    VPCMPUD $1, Y2, Y0, K2
    KMOVW K2, R9
    CMPB R9, $0xFF
    JNE indexOutOfBoundsAVX512
    VPGATHERDQ (AX)(Y0*8), K1, Z1
    VPMINUQ Z1, Z3, Z3
    VPMAXUQ Z1, Z4, Z4
    KMOVW R8, K1 // the gather clears its mask register, reload it
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loopAVX512

    // Horizontal reduction down to lane 0 of X3 (min) and X4 (max).
    VPERMQ $0b1110, Z3, Z0
    VPERMQ $0b1110, Z4, Z1
    VPMINUQ Z0, Z3, Z3
    VPMAXUQ Z1, Z4, Z4

    VPERMQ $1, Z3, Z0
    VPERMQ $1, Z4, Z1
    VPMINUQ Z0, Z3, Z3
    VPMAXUQ Z1, Z4, Z4

    VSHUFF64X2 $2, Z3, Z3, Z0
    VSHUFF64X2 $2, Z4, Z4, Z1
    VPMINUQ Z0, Z3, Z3
    VPMAXUQ Z1, Z4, Z4

    MOVQ X3, R10
    MOVQ X4, R11

    VZEROUPPER
    JMP test
loop:
    MOVL (CX)(SI*4), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVQ (AX)(DI*8), DI
    CMPQ DI, R10
    CMOVQCS DI, R10 // unsigned below
    CMPQ DI, R11
    CMOVQHI DI, R11 // unsigned above
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
return:
    MOVQ R10, min+48(FP)
    MOVQ R11, max+56(FP)
    MOVQ R12, err+64(FP)
    RET
indexOutOfBoundsAVX512:
    // Leaving the vector loop early: clear the upper vector state first.
    VZEROUPPER
indexOutOfBounds:
    MOVQ $errnoIndexOutOfBounds, R12
    JMP return

// func dictionaryBoundsBE128(dict [][16]byte, indexes []int32) (min, max *[16]byte, err errno)
//
// Scalar only. Each 16-byte dictionary entry is compared as a big-endian
// unsigned 128-bit integer: both 8-byte halves are byte-swapped and compared
// with unsigned conditions, high half first. The results are pointers to the
// selected entries inside dict; both are nil when indexes is empty.
TEXT ·dictionaryBoundsBE128(SB), NOSPLIT, $0-72
    MOVQ dict_base+0(FP), AX
    MOVQ dict_len+8(FP), BX

    MOVQ indexes_base+24(FP), CX
    MOVQ indexes_len+32(FP), DX
    SHLQ $2, DX // x 4
    ADDQ CX, DX // end

    XORQ R8, R8 // min (pointer)
    XORQ R9, R9 // max (pointer)
    XORQ SI, SI // err
    XORQ DI, DI

    // DX holds the end pointer at this point, so the input is empty when the
    // cursor already equals it. Comparing DX against zero would only detect
    // an empty slice with a nil base pointer.
    CMPQ CX, DX
    JE return

    // Seed min and max with the entry of the first index.
    MOVL (CX), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    SHLQ $4, DI // the dictionary contains 16 byte words
    LEAQ (AX)(DI*1), R8
    MOVQ R8, R9
    MOVQ 0(AX)(DI*1), R10 // min (high)
    MOVQ 8(AX)(DI*1), R11 // min (low)
    BSWAPQ R10
    BSWAPQ R11
    MOVQ R10, R12 // max (high)
    MOVQ R11, R13 // max (low)

    JMP next
loop:
    MOVL (CX), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    SHLQ $4, DI
    MOVQ 0(AX)(DI*1), R14 // candidate (high)
    MOVQ 8(AX)(DI*1), R15 // candidate (low)
    BSWAPQ R14
    BSWAPQ R15
testLessThan:
    CMPQ R14, R10
    JA testGreaterThan
    JB lessThan
    CMPQ R15, R11
    JAE testGreaterThan
lessThan:
    LEAQ (AX)(DI*1), R8
    MOVQ R14, R10
    MOVQ R15, R11
    JMP next
testGreaterThan:
    CMPQ R14, R12
    JB next
    JA greaterThan
    CMPQ R15, R13
    JBE next
greaterThan:
    LEAQ (AX)(DI*1), R9
    MOVQ R14, R12
    MOVQ R15, R13
next:
    ADDQ $4, CX
    CMPQ CX, DX
    JNE loop
return:
    MOVQ R8, min+48(FP)
    MOVQ R9, max+56(FP)
    MOVQ SI, err+64(FP)
    RET
indexOutOfBounds:
    MOVQ $errnoIndexOutOfBounds, SI
    JMP return

// The lookup functions provide optimized versions of the dictionary index
// lookup logic.
//
// When AVX512 is available, the AVX512 versions of the functions are used
// which use the VPGATHER* instructions to perform 8 parallel lookups of the
// values in the dictionary, then VPSCATTER* to do 8 parallel writes to the
// sparse output buffer.

// func dictionaryLookup32(dict []uint32, indexes []int32, rows sparse.Array) errno
//
// Writes dict[indexes[i]] as a 32-bit value to the i-th output slot. The first
// slot is at rows_array_ptr and consecutive slots are rows_array_off bytes
// apart. Returns 0 on success, or errnoIndexOutOfBounds as soon as an index
// fails the unsigned comparison against len(dict); slots written before the
// failure keep their new values.
TEXT ·dictionaryLookup32(SB), NOSPLIT, $0-80
    MOVQ dict_base+0(FP), AX
    MOVQ dict_len+8(FP), BX

    MOVQ indexes_base+24(FP), CX
    MOVQ indexes_len+32(FP), DX

    MOVQ rows_array_ptr+48(FP), R8
    MOVQ rows_array_off+64(FP), R9

    XORQ SI, SI // position in indexes

    CMPQ DX, $8
    JB test

    CMPB ·hasAVX512VL(SB), $0
    JE test

    // DI = len(indexes) rounded down to a multiple of 8.
    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI

    MOVQ R9, R10
    SHLQ $3, R10 // 8 * size

    MOVW $0xFFFF, R11
    KMOVW R11, K1
    KMOVW R11, K2

    VPBROADCASTD R9, Y2           // [size...]
    VPMULLD ·range0n8(SB), Y2, Y2 // [0*size,1*size,...]
    VPBROADCASTD BX, Y3           // [len(dict)...]
loopAVX512:
    VMOVDQU32 (CX)(SI*4), Y0
    // K3 has a bit set for every lane where index < len(dict) (unsigned).
    VPCMPUD $1, Y3, Y0, K3
    KMOVW K3, R11
    CMPB R11, $0xFF
    JNE indexOutOfBoundsAVX512
    VPGATHERDD (AX)(Y0*4), K1, Y1
    VPSCATTERDD Y1, K2, (R8)(Y2*1)
    // Gather and scatter clear their mask registers. R11 holds the compare
    // mask, which has the 8 lane bits set once the check above has passed.
    KMOVW R11, K1
    KMOVW R11, K2
    ADDQ R10, R8
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loopAVX512
    VZEROUPPER
    JMP test
loop:
    MOVL (CX)(SI*4), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVL (AX)(DI*4), DI
    MOVL DI, (R8)
    ADDQ R9, R8
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
    XORQ AX, AX
return:
    MOVQ AX, ret+72(FP)
    RET
indexOutOfBoundsAVX512:
    // Leaving the vector loop early: clear the upper vector state first.
    VZEROUPPER
indexOutOfBounds:
    MOVQ $errnoIndexOutOfBounds, AX
    JMP return

// func dictionaryLookup64(dict []uint64, indexes []int32, rows sparse.Array) errno
//
// Same as dictionaryLookup32 but for 64-bit dictionary values: each output
// slot receives an 8-byte value.
TEXT ·dictionaryLookup64(SB), NOSPLIT, $0-80
    MOVQ dict_base+0(FP), AX
    MOVQ dict_len+8(FP), BX

    MOVQ indexes_base+24(FP), CX
    MOVQ indexes_len+32(FP), DX

    MOVQ rows_array_ptr+48(FP), R8
    MOVQ rows_array_off+64(FP), R9

    XORQ SI, SI // position in indexes

    CMPQ DX, $8
    JB test

    CMPB ·hasAVX512VL(SB), $0
    JE test

    // DI = len(indexes) rounded down to a multiple of 8.
    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI

    MOVQ R9, R10
    SHLQ $3, R10 // 8 * size

    MOVW $0xFFFF, R11
    KMOVW R11, K1
    KMOVW R11, K2

    VPBROADCASTD R9, Y2           // [size...]
    VPMULLD ·range0n8(SB), Y2, Y2 // [0*size,1*size,...]
    VPBROADCASTD BX, Y3           // [len(dict)...]
loopAVX512:
    VMOVDQU32 (CX)(SI*4), Y0
    // K3 has a bit set for every lane where index < len(dict) (unsigned).
    VPCMPUD $1, Y3, Y0, K3
    KMOVW K3, R11
    CMPB R11, $0xFF
    JNE indexOutOfBoundsAVX512
    VPGATHERDQ (AX)(Y0*8), K1, Z1
    VPSCATTERDQ Z1, K2, (R8)(Y2*1)
    // Gather and scatter clear their mask registers. R11 holds the compare
    // mask, which has the 8 lane bits set once the check above has passed.
    KMOVW R11, K1
    KMOVW R11, K2
    ADDQ R10, R8
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loopAVX512
    VZEROUPPER
    JMP test
loop:
    MOVL (CX)(SI*4), DI
    CMPL DI, BX
    JAE indexOutOfBounds
    MOVQ (AX)(DI*8), DI
    MOVQ DI, (R8)
    ADDQ R9, R8
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
    XORQ AX, AX
return:
    MOVQ AX, ret+72(FP)
    RET
indexOutOfBoundsAVX512:
    // Leaving the vector loop early: clear the upper vector state first.
    VZEROUPPER
indexOutOfBounds:
    MOVQ $errnoIndexOutOfBounds, AX
    JMP return

// func dictionaryLookupByteArrayString(dict []uint32, page []byte, indexes []int32, rows sparse.Array) errno
//
// For each index, stores a (pointer, length) pair describing the byte array
// found in page at offset dict[index]. Returns 0 on success (including when
// indexes is empty) or errnoIndexOutOfBounds on the first invalid index.
TEXT ·dictionaryLookupByteArrayString(SB), NOSPLIT, $0-104
    MOVQ dict_base+0(FP), AX
    MOVQ dict_len+8(FP), BX

    MOVQ page+24(FP), CX

    MOVQ indexes_base+48(FP), R8
    MOVQ indexes_len+56(FP), R9

    MOVQ rows_array_ptr+72(FP), R10
    MOVQ rows_array_off+88(FP), R11

    XORQ DI, DI
    XORQ SI, SI
    // Check the count before the first iteration so that an empty list of
    // indexes neither reads indexes[0] nor writes to rows.
    JMP test
loop:
    // Load the index that we want to read the value from. This may come from
    // user input so we must validate that the indexes are within the bounds of
    // the dictionary.
    MOVL (R8)(SI*4), DI
    CMPL DI, BX
    JAE indexOutOfBounds

    // Load the offset within the dictionary page where the value is stored.
    // We trust the offsets to be correct since they are generated internally by
    // the dictionary code, there is no need to check that they are within the
    // bounds of the dictionary page.
    MOVL (AX)(DI*4), DI

    // Load the value from the dictionary page. The page uses the PLAIN encoding
    // where each byte array is prefixed with a 4 bytes little endian length.
    LEAQ 4(CX)(DI*1), DX
    MOVL (CX)(DI*1), DI

    // Store the length and pointer to the value into the output location.
    // The memory layout is expected to hold a pointer and length, which are
    // both 64 bits words. This is the layout used by parquet.Value and the Go
    // string value type.
    MOVQ DX, (R10)
    MOVQ DI, 8(R10)

    ADDQ R11, R10
    INCQ SI
test:
    CMPQ SI, R9
    JNE loop
    XORQ AX, AX
return:
    MOVQ AX, ret+96(FP)
    RET
indexOutOfBounds:
    MOVQ $errnoIndexOutOfBounds, AX
    JMP return

// func dictionaryLookupFixedLenByteArrayString(dict []byte, len int, indexes []int32, rows sparse.Array) errno
//
// For each index, stores the pair (dict base + index*len, len) into the output
// slot. Returns 0 on success (including when indexes is empty) or
// errnoIndexOutOfBounds when index*len is not below len(dict).
TEXT ·dictionaryLookupFixedLenByteArrayString(SB), NOSPLIT, $0-88
    MOVQ dict_base+0(FP), AX
    MOVQ dict_len+8(FP), BX

    MOVQ len+24(FP), CX

    MOVQ indexes_base+32(FP), DX
    MOVQ indexes_len+40(FP), R8

    MOVQ rows_array_ptr+56(FP), R9
    MOVQ rows_array_off+72(FP), R10

    XORQ DI, DI
    XORQ SI, SI
    // Check the count before the first iteration so that an empty list of
    // indexes neither reads indexes[0] nor writes to rows.
    JMP test
loop:
    MOVL (DX)(SI*4), DI
    IMULQ CX, DI
    // The byte offset is a 64-bit product: compare all 64 bits so that an
    // offset of 2^32 or more cannot wrap around and pass the bounds check.
    CMPQ DI, BX
    JAE indexOutOfBounds

    ADDQ AX, DI
    MOVQ DI, (R9)
    MOVQ CX, 8(R9)

    ADDQ R10, R9
    INCQ SI
test:
    CMPQ SI, R8
    JNE loop
    XORQ AX, AX
return:
    MOVQ AX, ret+80(FP)
    RET
indexOutOfBounds:
    MOVQ $errnoIndexOutOfBounds, AX
    JMP return

// This is the same algorithm as dictionaryLookupFixedLenByteArrayString but we
// only store the pointer to the location holding the value instead of storing
// the pair of pointer and length. Since the length is fixed for this dictionary
// type, the application can assume it at the call site.
//
// func dictionaryLookupFixedLenByteArrayPointer(dict []byte, len int, indexes []int32, rows sparse.Array) errno
//
// For each index, stores the address dict base + index*len into the output
// slot; slots are rows_array_off bytes apart starting at rows_array_ptr.
// Returns 0 on success (including when indexes is empty) or
// errnoIndexOutOfBounds when index*len is not below len(dict).
TEXT ·dictionaryLookupFixedLenByteArrayPointer(SB), NOSPLIT, $0-88
    MOVQ dict_base+0(FP), AX
    MOVQ dict_len+8(FP), BX

    MOVQ len+24(FP), CX

    MOVQ indexes_base+32(FP), DX
    MOVQ indexes_len+40(FP), R8

    MOVQ rows_array_ptr+56(FP), R9
    MOVQ rows_array_off+72(FP), R10

    XORQ DI, DI
    XORQ SI, SI
    // Check the count before the first iteration so that an empty list of
    // indexes neither reads indexes[0] nor writes to rows.
    JMP test
loop:
    MOVL (DX)(SI*4), DI
    IMULQ CX, DI
    // The byte offset is a 64-bit product: compare all 64 bits so that an
    // offset of 2^32 or more cannot wrap around and pass the bounds check.
    CMPQ DI, BX
    JAE indexOutOfBounds

    ADDQ AX, DI
    MOVQ DI, (R9)

    ADDQ R10, R9
    INCQ SI
test:
    CMPQ SI, R8
    JNE loop
    XORQ AX, AX
return:
    MOVQ AX, ret+80(FP)
    RET
indexOutOfBounds:
    MOVQ $errnoIndexOutOfBounds, AX
    JMP return

// range0n8 holds the 32-bit integers 0 through 8. The symbol is used as a
// 256-bit memory operand of VPMULLD, which reads its first eight entries
// (0..7) to turn a broadcast stride into per-lane byte offsets.
GLOBL ·range0n8(SB), RODATA|NOPTR, $40
DATA ·range0n8+0(SB)/4, $0
DATA ·range0n8+4(SB)/4, $1
DATA ·range0n8+8(SB)/4, $2
DATA ·range0n8+12(SB)/4, $3
DATA ·range0n8+16(SB)/4, $4
DATA ·range0n8+20(SB)/4, $5
DATA ·range0n8+24(SB)/4, $6
DATA ·range0n8+28(SB)/4, $7
DATA ·range0n8+32(SB)/4, $8