github.com/parquet-go/parquet-go@v0.20.0/encoding/delta/binary_packed_amd64.s

//go:build !purego

#include "textflag.h"

#define blockSize 128
#define numMiniBlocks 4
#define miniBlockSize 32

// -----------------------------------------------------------------------------
// 32 bits
// -----------------------------------------------------------------------------

#define deltaInt32AVX2x8(baseAddr) \
    VMOVDQU baseAddr, Y1    \ // [0,1,2,3,4,5,6,7]
    VPERMD Y1, Y3, Y2       \ // [7,0,1,2,3,4,5,6]
    VPBLENDD $1, Y0, Y2, Y2 \ // [x,0,1,2,3,4,5,6]
    VPSUBD Y2, Y1, Y2       \ // [0,1,2,...] - [x,0,1,...]
    VMOVDQU Y2, baseAddr    \
    VPERMD Y1, Y3, Y0

// func blockDeltaInt32AVX2(block *[blockSize]int32, lastValue int32) int32
TEXT ·blockDeltaInt32AVX2(SB), NOSPLIT, $0-20
    MOVQ block+0(FP), AX
    MOVL 4*blockSize-4(AX), CX
    MOVL CX, ret+16(FP)

    VPBROADCASTD lastValue+8(FP), Y0
    VMOVDQU ·rotateLeft32(SB), Y3

    XORQ SI, SI
loop:
    deltaInt32AVX2x8(0(AX)(SI*4))
    deltaInt32AVX2x8(32(AX)(SI*4))
    deltaInt32AVX2x8(64(AX)(SI*4))
    deltaInt32AVX2x8(96(AX)(SI*4))
    ADDQ $32, SI
    CMPQ SI, $blockSize
    JNE loop
    VZEROUPPER
    RET

// func blockMinInt32AVX2(block *[blockSize]int32) int32
TEXT ·blockMinInt32AVX2(SB), NOSPLIT, $0-12
    MOVQ block+0(FP), AX
    VPBROADCASTD (AX), Y15

    VPMINSD 0(AX), Y15, Y0
    VPMINSD 32(AX), Y15, Y1
    VPMINSD 64(AX), Y15, Y2
    VPMINSD 96(AX), Y15, Y3
    VPMINSD 128(AX), Y15, Y4
    VPMINSD 160(AX), Y15, Y5
    VPMINSD 192(AX), Y15, Y6
    VPMINSD 224(AX), Y15, Y7
    VPMINSD 256(AX), Y15, Y8
    VPMINSD 288(AX), Y15, Y9
    VPMINSD 320(AX), Y15, Y10
    VPMINSD 352(AX), Y15, Y11
    VPMINSD 384(AX), Y15, Y12
    VPMINSD 416(AX), Y15, Y13
    VPMINSD 448(AX), Y15, Y14
    VPMINSD 480(AX), Y15, Y15

    VPMINSD Y1, Y0, Y0
    VPMINSD Y3, Y2, Y2
    VPMINSD Y5, Y4, Y4
    VPMINSD Y7, Y6, Y6
    VPMINSD Y9, Y8, Y8
    VPMINSD Y11, Y10, Y10
    VPMINSD Y13, Y12, Y12
    VPMINSD Y15, Y14, Y14

    VPMINSD Y2, Y0, Y0
    VPMINSD Y6, Y4, Y4
    VPMINSD Y10, Y8, Y8
    VPMINSD Y14, Y12, Y12

    VPMINSD Y4, Y0, Y0
    VPMINSD Y12, Y8, Y8

    VPMINSD Y8, Y0, Y0

    VPERM2I128 $1, Y0, Y0, Y1
    VPMINSD Y1, Y0, Y0

    VPSHUFD $0b00011011, Y0, Y1
    VPMINSD Y1, Y0, Y0
    VZEROUPPER

    MOVQ X0, CX
    MOVL CX, BX
    SHRQ $32, CX
    CMPL CX, BX
    CMOVLLT CX, BX
    MOVL BX, ret+8(FP)
    RET

#define subInt32AVX2x32(baseAddr, offset) \
    VMOVDQU offset+0(baseAddr), Y1  \
    VMOVDQU offset+32(baseAddr), Y2 \
    VMOVDQU offset+64(baseAddr), Y3 \
    VMOVDQU offset+96(baseAddr), Y4 \
    VPSUBD Y0, Y1, Y1               \
    VPSUBD Y0, Y2, Y2               \
    VPSUBD Y0, Y3, Y3               \
    VPSUBD Y0, Y4, Y4               \
    VMOVDQU Y1, offset+0(baseAddr)  \
    VMOVDQU Y2, offset+32(baseAddr) \
    VMOVDQU Y3, offset+64(baseAddr) \
    VMOVDQU Y4, offset+96(baseAddr)

// func blockSubInt32AVX2(block *[blockSize]int32, value int32)
TEXT ·blockSubInt32AVX2(SB), NOSPLIT, $0-12
    MOVQ block+0(FP), AX
    VPBROADCASTD value+8(FP), Y0
    subInt32AVX2x32(AX, 0)
    subInt32AVX2x32(AX, 128)
    subInt32AVX2x32(AX, 256)
    subInt32AVX2x32(AX, 384)
    VZEROUPPER
    RET
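
// The three routines above compute the per-block steps of the delta encoding:
// turn values into deltas, find the smallest delta, then subtract it so every
// delta becomes non-negative. A rough scalar sketch of the same logic in Go
// (illustrative only, not the package's actual generic fallback):
//
//    func blockDeltaInt32(block *[blockSize]int32, lastValue int32) int32 {
//        last := block[blockSize-1]
//        for i := blockSize - 1; i > 0; i-- {
//            block[i] -= block[i-1]
//        }
//        block[0] -= lastValue
//        return last
//    }
//
//    func blockMinInt32(block *[blockSize]int32) int32 {
//        min := block[0]
//        for _, v := range block[1:] {
//            if v < min {
//                min = v
//            }
//        }
//        return min
//    }
//
//    func blockSubInt32(block *[blockSize]int32, value int32) {
//        for i := range block {
//            block[i] -= value
//        }
//    }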

// func blockBitWidthsInt32AVX2(bitWidths *[numMiniBlocks]byte, block *[blockSize]int32)
TEXT ·blockBitWidthsInt32AVX2(SB), NOSPLIT, $0-16
    MOVQ bitWidths+0(FP), AX
    MOVQ block+8(FP), BX

    // AVX2 only has signed comparisons (and min/max), so we emulate working on
    // unsigned values by adding -2^31 to the values. Y5 is a vector of -2^31
    // used to offset 8 packed 32 bits integers in other YMM registers where
    // the block data are loaded.
    VPCMPEQD Y5, Y5, Y5
    VPSLLD $31, Y5, Y5

    XORQ DI, DI
loop:
    VPBROADCASTD (BX), Y0 // max
    VPADDD Y5, Y0, Y0

    VMOVDQU (BX), Y1
    VMOVDQU 32(BX), Y2
    VMOVDQU 64(BX), Y3
    VMOVDQU 96(BX), Y4

    VPADDD Y5, Y1, Y1
    VPADDD Y5, Y2, Y2
    VPADDD Y5, Y3, Y3
    VPADDD Y5, Y4, Y4

    VPMAXSD Y2, Y1, Y1
    VPMAXSD Y4, Y3, Y3
    VPMAXSD Y3, Y1, Y1
    VPMAXSD Y1, Y0, Y0

    VPERM2I128 $1, Y0, Y0, Y1
    VPMAXSD Y1, Y0, Y0

    VPSHUFD $0b00011011, Y0, Y1
    VPMAXSD Y1, Y0, Y0
    VPSUBD Y5, Y0, Y0

    MOVQ X0, CX
    MOVL CX, DX
    SHRQ $32, CX
    CMPL CX, DX
    CMOVLHI CX, DX

    LZCNTL DX, DX
    NEGL DX
    ADDL $32, DX
    MOVB DX, (AX)(DI*1)

    ADDQ $128, BX
    INCQ DI
    CMPQ DI, $numMiniBlocks
    JNE loop
    VZEROUPPER
    RET

// encodeMiniBlockInt32Default is the generic implementation of the algorithm to
// pack 32 bit integers into values of a given bit width (<=32).
//
// This algorithm is much slower than the vectorized versions, but is useful
// as a reference implementation to run the tests against, and as a fallback when
// the code runs on a CPU which does not support the AVX2 instruction set.
//
// func encodeMiniBlockInt32Default(dst *byte, src *[miniBlockSize]int32, bitWidth uint)
TEXT ·encodeMiniBlockInt32Default(SB), NOSPLIT, $0-24
    MOVQ dst+0(FP), AX
    MOVQ src+8(FP), BX
    MOVQ bitWidth+16(FP), R9

    XORQ DI, DI // bitOffset
    XORQ SI, SI
loop:
    MOVQ DI, CX
    MOVQ DI, DX

    ANDQ $0b11111, CX // bitOffset % 32
    SHRQ $5, DX       // bitOffset / 32

    MOVLQZX (BX)(SI*4), R8
    SHLQ CX, R8
    ORQ R8, (AX)(DX*4)

    ADDQ R9, DI
    INCQ SI
    CMPQ SI, $miniBlockSize
    JNE loop
    RET

// encodeMiniBlockInt32x1bitAVX2 packs 32 bit integers into 1 bit values in the
// output buffer.
//
// The algorithm uses MOVMSKPS to extract the 8 relevant bits from the 8 values
// packed in YMM registers, then combines 4 of these into a 32 bit word which is
// written to the output. The result is 32 bits because each mini block has 32
// values (the block size is 128 and there are 4 mini blocks per block).
//
// func encodeMiniBlockInt32x1bitAVX2(dst *byte, src *[miniBlockSize]int32)
TEXT ·encodeMiniBlockInt32x1bitAVX2(SB), NOSPLIT, $0-16
    MOVQ dst+0(FP), AX
    MOVQ src+8(FP), BX

    VMOVDQU 0(BX), Y0
    VMOVDQU 32(BX), Y1
    VMOVDQU 64(BX), Y2
    VMOVDQU 96(BX), Y3

    VPSLLD $31, Y0, Y0
    VPSLLD $31, Y1, Y1
    VPSLLD $31, Y2, Y2
    VPSLLD $31, Y3, Y3

    VMOVMSKPS Y0, R8
    VMOVMSKPS Y1, R9
    VMOVMSKPS Y2, R10
    VMOVMSKPS Y3, R11

    SHLL $8, R9
    SHLL $16, R10
    SHLL $24, R11

    ORL R9, R8
    ORL R10, R8
    ORL R11, R8
    MOVL R8, (AX)
    VZEROUPPER
    RET
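
// In scalar Go terms, the 1 bit packing performed above amounts to collecting
// the least significant bit of each of the 32 input values into one 32 bit
// word (a simplified sketch with dst as a byte slice rather than a *byte, and
// assuming encoding/binary is imported):
//
//    func encodeMiniBlockInt32x1bit(dst []byte, src *[miniBlockSize]int32) {
//        word := uint32(0)
//        for i, v := range src {
//            word |= (uint32(v) & 1) << uint(i)
//        }
//        binary.LittleEndian.PutUint32(dst, word)
//    }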

// encodeMiniBlockInt32x2bitsAVX2 implements an algorithm for packing 32 bit
// integers into 2 bit values.
//
// The algorithm is derived from the one employed in encodeMiniBlockInt32x1bitAVX2
// but needs to perform a bit of extra work since MOVMSKPS can only extract one bit
// per packed integer of each YMM vector. We run two passes to extract the two
// bits needed to compose each item of the result, and merge the values by
// interleaving the first and second bits with PDEP.
//
// func encodeMiniBlockInt32x2bitsAVX2(dst *byte, src *[miniBlockSize]int32)
TEXT ·encodeMiniBlockInt32x2bitsAVX2(SB), NOSPLIT, $0-16
    MOVQ dst+0(FP), AX
    MOVQ src+8(FP), BX

    VMOVDQU 0(BX), Y0
    VMOVDQU 32(BX), Y1
    VMOVDQU 64(BX), Y2
    VMOVDQU 96(BX), Y3

    VPSLLD $31, Y0, Y4
    VPSLLD $31, Y1, Y5
    VPSLLD $31, Y2, Y6
    VPSLLD $31, Y3, Y7

    VMOVMSKPS Y4, R8
    VMOVMSKPS Y5, R9
    VMOVMSKPS Y6, R10
    VMOVMSKPS Y7, R11

    SHLQ $8, R9
    SHLQ $16, R10
    SHLQ $24, R11
    ORQ R9, R8
    ORQ R10, R8
    ORQ R11, R8

    MOVQ $0x5555555555555555, DX // 0b010101...
    PDEPQ DX, R8, R8

    VPSLLD $30, Y0, Y8
    VPSLLD $30, Y1, Y9
    VPSLLD $30, Y2, Y10
    VPSLLD $30, Y3, Y11

    VMOVMSKPS Y8, R12
    VMOVMSKPS Y9, R13
    VMOVMSKPS Y10, R14
    VMOVMSKPS Y11, R15

    SHLQ $8, R13
    SHLQ $16, R14
    SHLQ $24, R15
    ORQ R13, R12
    ORQ R14, R12
    ORQ R15, R12

    MOVQ $0xAAAAAAAAAAAAAAAA, DI // 0b101010...
    PDEPQ DI, R12, R12

    ORQ R12, R8
    MOVQ R8, (AX)
    VZEROUPPER
    RET

// encodeMiniBlockInt32x32bitsAVX2 is a specialization of the bit packing logic
// for 32 bit integers when the output bit width is also 32, in which case a
// simple copy of the mini block to the output buffer produces the result.
//
// func encodeMiniBlockInt32x32bitsAVX2(dst *byte, src *[miniBlockSize]int32)
TEXT ·encodeMiniBlockInt32x32bitsAVX2(SB), NOSPLIT, $0-16
    MOVQ dst+0(FP), AX
    MOVQ src+8(FP), BX
    VMOVDQU 0(BX), Y0
    VMOVDQU 32(BX), Y1
    VMOVDQU 64(BX), Y2
    VMOVDQU 96(BX), Y3
    VMOVDQU Y0, 0(AX)
    VMOVDQU Y1, 32(AX)
    VMOVDQU Y2, 64(AX)
    VMOVDQU Y3, 96(AX)
    VZEROUPPER
    RET
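
// The PDEP steps in the 2 bit packing above spread the gathered bits into
// alternating positions: the first pass deposits bit 0 of every value into the
// even bit positions of the result, the second pass deposits bit 1 into the odd
// positions, and the two words are OR-ed together. A software model of PDEP in
// Go, for illustration only (the hardware instruction does this in one step):
//
//    func pdep(src, mask uint64) uint64 {
//        out := uint64(0)
//        for i := uint(0); mask != 0; i++ {
//            low := mask & -mask // lowest set bit of mask
//            if src&(1<<i) != 0 {
//                out |= low
//            }
//            mask &^= low
//        }
//        return out
//    }
//
//    // result = pdep(firstBits, 0x5555555555555555) | pdep(secondBits, 0xAAAAAAAAAAAAAAAA)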

// encodeMiniBlockInt32x3to16bitsAVX2 is the algorithm used to bit-pack 32 bit
// integers into values of width 3 to 16 bits.
//
// This function has a small overhead due to having to initialize registers with
// values that depend on the bit width. We measured this cost at roughly 10% of
// throughput in synthetic benchmarks compared to generating constant shifts and
// offsets with a macro. Using a single function rather than generating one for
// each bit width has the benefit of reducing the code size, which in practice
// can also yield benefits like reducing CPU cache misses. Not using a macro also
// has other advantages like providing accurate line numbers in stack traces and
// enabling the use of breakpoints when debugging. Overall, this approach seemed
// to be the right trade-off between performance and maintainability.
//
// The algorithm processes chunks of 8 values over 4 iterations to cover all 32
// values of the mini block. Writes to the output buffer are aligned on 128 bits
// since we may write up to 128 bits (8 x 16 bits). Padding is therefore
// required in the output buffer to avoid triggering a segfault.
// The encodeInt32AVX2 method adds enough padding when sizing the output buffer
// to account for this requirement.
//
// We leverage the two lanes of YMM registers to work on two sets of 4 values
// (in the sequence of VMOVDQU/VPSHUFD, VPAND, VPSLLVQ, VPOR), resulting in two
// sets of bit-packed values in the lower 64 bits of each YMM lane.
// The upper lane is then permuted into a lower lane to merge the two results,
// which may not be aligned on byte boundaries, so we shift the lower and upper
// bits and compose two 128 bit sequences (VPSLLVQ, VPSRLQ, VBLENDPD), merge
// them, and write the 16 byte result to the output buffer.
//
// func encodeMiniBlockInt32x3to16bitsAVX2(dst *byte, src *[miniBlockSize]int32, bitWidth uint)
TEXT ·encodeMiniBlockInt32x3to16bitsAVX2(SB), NOSPLIT, $0-24
    MOVQ dst+0(FP), AX
    MOVQ src+8(FP), BX
    MOVQ bitWidth+16(FP), CX

    VPBROADCASTQ bitWidth+16(FP), Y6 // [1*bitWidth...]
    VPSLLQ $1, Y6, Y7                // [2*bitWidth...]
    VPADDQ Y6, Y7, Y8                // [3*bitWidth...]
    VPSLLQ $2, Y6, Y9                // [4*bitWidth...]

    VPBROADCASTQ sixtyfour<>(SB), Y10
    VPSUBQ Y6, Y10, Y11 // [64-1*bitWidth...]
    VPSUBQ Y9, Y10, Y12 // [64-4*bitWidth...]
    VPCMPEQQ Y4, Y4, Y4
    VPSRLVQ Y11, Y4, Y4

    VPXOR Y5, Y5, Y5
    XORQ SI, SI
loop:
    VMOVDQU (BX)(SI*4), Y0
    VPSHUFD $0b01010101, Y0, Y1
    VPSHUFD $0b10101010, Y0, Y2
    VPSHUFD $0b11111111, Y0, Y3

    VPAND Y4, Y0, Y0
    VPAND Y4, Y1, Y1
    VPAND Y4, Y2, Y2
    VPAND Y4, Y3, Y3

    VPSLLVQ Y6, Y1, Y1
    VPSLLVQ Y7, Y2, Y2
    VPSLLVQ Y8, Y3, Y3

    VPOR Y1, Y0, Y0
    VPOR Y3, Y2, Y2
    VPOR Y2, Y0, Y0

    VPERMQ $0b00001010, Y0, Y1

    VPSLLVQ X9, X1, X2
    VPSRLQ X12, X1, X3
    VBLENDPD $0b10, X3, X2, X1
    VBLENDPD $0b10, X5, X0, X0
    VPOR X1, X0, X0

    VMOVDQU X0, (AX)

    ADDQ CX, AX
    ADDQ $8, SI
    CMPQ SI, $miniBlockSize
    JNE loop
    VZEROUPPER
    RET

GLOBL sixtyfour<>(SB), RODATA|NOPTR, $32
DATA sixtyfour<>+0(SB)/8, $64
DATA sixtyfour<>+8(SB)/8, $64
DATA sixtyfour<>+16(SB)/8, $64
DATA sixtyfour<>+24(SB)/8, $64

// func decodeBlockInt32Default(dst []int32, minDelta, lastValue int32) int32
TEXT ·decodeBlockInt32Default(SB), NOSPLIT, $0-36
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), BX
    MOVLQZX minDelta+24(FP), CX
    MOVLQZX lastValue+28(FP), DX
    XORQ SI, SI
    JMP test
loop:
    MOVL (AX)(SI*4), DI
    ADDL CX, DI
    ADDL DI, DX
    MOVL DX, (AX)(SI*4)
    INCQ SI
test:
    CMPQ SI, BX
    JNE loop
done:
    MOVL DX, ret+32(FP)
    RET

// func decodeBlockInt32AVX2(dst []int32, minDelta, lastValue int32) int32
TEXT ·decodeBlockInt32AVX2(SB), NOSPLIT, $0-36
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), BX
    MOVLQZX minDelta+24(FP), CX
    MOVLQZX lastValue+28(FP), DX
    XORQ SI, SI

    CMPQ BX, $8
    JB test

    MOVQ BX, DI
    SHRQ $3, DI
    SHLQ $3, DI

    VPXOR X1, X1, X1
    MOVQ CX, X0
    MOVQ DX, X1
    VPBROADCASTD X0, Y0
loopAVX2:
    VMOVDQU (AX)(SI*4), Y2
    VPADDD Y0, Y2, Y2 // Y2[:] += minDelta
    VPADDD Y1, Y2, Y2 // Y2[0] += lastValue

    VPSLLDQ $4, Y2, Y3
    VPADDD Y3, Y2, Y2

    VPSLLDQ $8, Y2, Y3
    VPADDD Y3, Y2, Y2

    VPSHUFD $0xFF, X2, X1
    VPERM2I128 $1, Y2, Y2, Y3
    VPADDD X1, X3, X3

    VMOVDQU X2, (AX)(SI*4)
    VMOVDQU X3, 16(AX)(SI*4)
    VPSRLDQ $12, X3, X1 // lastValue

    ADDQ $8, SI
    CMPQ SI, DI
    JNE loopAVX2
    VZEROUPPER
    MOVQ X1, DX
    JMP test
loop:
    MOVL (AX)(SI*4), DI
    ADDL CX, DI
    ADDL DI, DX
    MOVL DX, (AX)(SI*4)
    INCQ SI
test:
    CMPQ SI, BX
    JNE loop
done:
    MOVL DX, ret+32(FP)
    RET
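
// Both decode routines above reconstruct the original values from the stored
// deltas: each output is the previous output plus minDelta plus the stored
// delta. The scalar version corresponds to this Go loop (a sketch of the same
// recurrence; the AVX2 version computes it 8 lanes at a time with a log-step
// prefix sum built from VPSLLDQ shifts and adds):
//
//    func decodeBlockInt32(dst []int32, minDelta, lastValue int32) int32 {
//        for i := range dst {
//            lastValue += minDelta + dst[i]
//            dst[i] = lastValue
//        }
//        return lastValue
//    }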

// -----------------------------------------------------------------------------
// 64 bits
// -----------------------------------------------------------------------------

#define deltaInt64AVX2x4(baseAddr) \
    VMOVDQU baseAddr, Y1       \ // [0,1,2,3]
    VPERMQ $0b10010011, Y1, Y2 \ // [3,0,1,2]
    VPBLENDD $3, Y0, Y2, Y2    \ // [x,0,1,2]
    VPSUBQ Y2, Y1, Y2          \ // [0,1,2,3] - [x,0,1,2]
    VMOVDQU Y2, baseAddr       \
    VPERMQ $0b10010011, Y1, Y0

// func blockDeltaInt64AVX2(block *[blockSize]int64, lastValue int64) int64
TEXT ·blockDeltaInt64AVX2(SB), NOSPLIT, $0-24
    MOVQ block+0(FP), AX
    MOVQ 8*blockSize-8(AX), CX
    MOVQ CX, ret+16(FP)

    VPBROADCASTQ lastValue+8(FP), Y0
    XORQ SI, SI
loop:
    deltaInt64AVX2x4((AX)(SI*8))
    deltaInt64AVX2x4(32(AX)(SI*8))
    deltaInt64AVX2x4(64(AX)(SI*8))
    deltaInt64AVX2x4(96(AX)(SI*8))
    ADDQ $16, SI
    CMPQ SI, $blockSize
    JNE loop
    VZEROUPPER
    RET

// vpminsq is an emulation of the AVX-512 VPMINSQ instruction with AVX2.
#define vpminsq(ones, tmp, arg2, arg1, ret) \
    VPCMPGTQ arg1, arg2, tmp \
    VPBLENDVB tmp, arg1, arg2, ret

// func blockMinInt64AVX2(block *[blockSize]int64) int64
TEXT ·blockMinInt64AVX2(SB), NOSPLIT, $0-16
    MOVQ block+0(FP), AX
    XORQ SI, SI
    VPCMPEQQ Y9, Y9, Y9 // ones
    VPBROADCASTQ (AX), Y0
loop:
    VMOVDQU 0(AX)(SI*8), Y1
    VMOVDQU 32(AX)(SI*8), Y2
    VMOVDQU 64(AX)(SI*8), Y3
    VMOVDQU 96(AX)(SI*8), Y4
    VMOVDQU 128(AX)(SI*8), Y5
    VMOVDQU 160(AX)(SI*8), Y6
    VMOVDQU 192(AX)(SI*8), Y7
    VMOVDQU 224(AX)(SI*8), Y8

    vpminsq(Y9, Y10, Y0, Y1, Y1)
    vpminsq(Y9, Y11, Y0, Y2, Y2)
    vpminsq(Y9, Y12, Y0, Y3, Y3)
    vpminsq(Y9, Y13, Y0, Y4, Y4)
    vpminsq(Y9, Y14, Y0, Y5, Y5)
    vpminsq(Y9, Y15, Y0, Y6, Y6)
    vpminsq(Y9, Y10, Y0, Y7, Y7)
    vpminsq(Y9, Y11, Y0, Y8, Y8)

    vpminsq(Y9, Y12, Y2, Y1, Y1)
    vpminsq(Y9, Y13, Y4, Y3, Y3)
    vpminsq(Y9, Y14, Y6, Y5, Y5)
    vpminsq(Y9, Y15, Y8, Y7, Y7)

    vpminsq(Y9, Y10, Y3, Y1, Y1)
    vpminsq(Y9, Y11, Y7, Y5, Y5)
    vpminsq(Y9, Y12, Y5, Y1, Y0)

    ADDQ $32, SI
    CMPQ SI, $blockSize
    JNE loop

    VPERM2I128 $1, Y0, Y0, Y1
    vpminsq(Y9, Y10, Y1, Y0, Y0)

    MOVQ X0, CX
    VPEXTRQ $1, X0, BX
    CMPQ CX, BX
    CMOVQLT CX, BX
    MOVQ BX, ret+8(FP)
    VZEROUPPER
    RET

#define subInt64AVX2x32(baseAddr, offset) \
    VMOVDQU offset+0(baseAddr), Y1   \
    VMOVDQU offset+32(baseAddr), Y2  \
    VMOVDQU offset+64(baseAddr), Y3  \
    VMOVDQU offset+96(baseAddr), Y4  \
    VMOVDQU offset+128(baseAddr), Y5 \
    VMOVDQU offset+160(baseAddr), Y6 \
    VMOVDQU offset+192(baseAddr), Y7 \
    VMOVDQU offset+224(baseAddr), Y8 \
    VPSUBQ Y0, Y1, Y1                \
    VPSUBQ Y0, Y2, Y2                \
    VPSUBQ Y0, Y3, Y3                \
    VPSUBQ Y0, Y4, Y4                \
    VPSUBQ Y0, Y5, Y5                \
    VPSUBQ Y0, Y6, Y6                \
    VPSUBQ Y0, Y7, Y7                \
    VPSUBQ Y0, Y8, Y8                \
    VMOVDQU Y1, offset+0(baseAddr)   \
    VMOVDQU Y2, offset+32(baseAddr)  \
    VMOVDQU Y3, offset+64(baseAddr)  \
    VMOVDQU Y4, offset+96(baseAddr)  \
    VMOVDQU Y5, offset+128(baseAddr) \
    VMOVDQU Y6, offset+160(baseAddr) \
    VMOVDQU Y7, offset+192(baseAddr) \
    VMOVDQU Y8, offset+224(baseAddr)

// func blockSubInt64AVX2(block *[blockSize]int64, value int64)
TEXT ·blockSubInt64AVX2(SB), NOSPLIT, $0-16
    MOVQ block+0(FP), AX
    VPBROADCASTQ value+8(FP), Y0
    subInt64AVX2x32(AX, 0)
    subInt64AVX2x32(AX, 256)
    subInt64AVX2x32(AX, 512)
    subInt64AVX2x32(AX, 768)
    VZEROUPPER
    RET
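
// The vpminsq macro above and the vpmaxsq macro below both emulate the missing
// AVX2 64 bit min/max with a compare followed by a byte blend, which per 64 bit
// lane is equivalent to this scalar Go (a sketch):
//
//    func minInt64(a, b int64) int64 {
//        if b > a {
//            return a
//        }
//        return b
//    }
//
//    func maxInt64(a, b int64) int64 {
//        if a > b {
//            return a
//        }
//        return b
//    }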

// vpmaxsq is an emulation of the AVX-512 VPMAXSQ instruction with AVX2.
#define vpmaxsq(tmp, arg2, arg1, ret) \
    VPCMPGTQ arg2, arg1, tmp \
    VPBLENDVB tmp, arg1, arg2, ret

// func blockBitWidthsInt64AVX2(bitWidths *[numMiniBlocks]byte, block *[blockSize]int64)
TEXT ·blockBitWidthsInt64AVX2(SB), NOSPLIT, $0-16
    MOVQ bitWidths+0(FP), AX
    MOVQ block+8(FP), BX

    // AVX2 only has signed comparisons (and min/max), so we emulate working on
    // unsigned values by adding -2^63 to the values. Y9 is a vector of -2^63
    // used to offset 4 packed 64 bits integers in other YMM registers where
    // the block data are loaded.
    VPCMPEQQ Y9, Y9, Y9
    VPSLLQ $63, Y9, Y9

    XORQ DI, DI
loop:
    VPBROADCASTQ (BX), Y0 // max
    VPADDQ Y9, Y0, Y0

    VMOVDQU (BX), Y1
    VMOVDQU 32(BX), Y2
    VMOVDQU 64(BX), Y3
    VMOVDQU 96(BX), Y4
    VMOVDQU 128(BX), Y5
    VMOVDQU 160(BX), Y6
    VMOVDQU 192(BX), Y7
    VMOVDQU 224(BX), Y8

    VPADDQ Y9, Y1, Y1
    VPADDQ Y9, Y2, Y2
    VPADDQ Y9, Y3, Y3
    VPADDQ Y9, Y4, Y4
    VPADDQ Y9, Y5, Y5
    VPADDQ Y9, Y6, Y6
    VPADDQ Y9, Y7, Y7
    VPADDQ Y9, Y8, Y8

    vpmaxsq(Y10, Y2, Y1, Y1)
    vpmaxsq(Y11, Y4, Y3, Y3)
    vpmaxsq(Y12, Y6, Y5, Y5)
    vpmaxsq(Y13, Y8, Y7, Y7)

    vpmaxsq(Y10, Y3, Y1, Y1)
    vpmaxsq(Y11, Y7, Y5, Y5)
    vpmaxsq(Y12, Y5, Y1, Y1)
    vpmaxsq(Y13, Y1, Y0, Y0)

    VPERM2I128 $1, Y0, Y0, Y1
    vpmaxsq(Y10, Y1, Y0, Y0)
    VPSUBQ Y9, Y0, Y0

    MOVQ X0, CX
    VPEXTRQ $1, X0, DX
    CMPQ CX, DX
    CMOVQHI CX, DX

    LZCNTQ DX, DX
    NEGQ DX
    ADDQ $64, DX
    MOVB DX, (AX)(DI*1)

    ADDQ $256, BX
    INCQ DI
    CMPQ DI, $numMiniBlocks
    JNE loop
    VZEROUPPER
    RET
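
// The bit width of a mini block is the number of bits needed to represent its
// largest value, i.e. 64 minus the number of leading zeros of the (unsigned)
// maximum, which is what the LZCNTQ/NEGQ/ADDQ sequence above computes. A Go
// sketch of the same computation, assuming math/bits is imported:
//
//    func blockBitWidthsInt64(bitWidths *[numMiniBlocks]byte, block *[blockSize]int64) {
//        for i := range bitWidths {
//            max := uint64(0)
//            for _, v := range block[i*miniBlockSize : (i+1)*miniBlockSize] {
//                if uint64(v) > max {
//                    max = uint64(v)
//                }
//            }
//            bitWidths[i] = byte(bits.Len64(max))
//        }
//    }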

// encodeMiniBlockInt64Default is the generic implementation of the algorithm to
// pack 64 bit integers into values of a given bit width (<=64).
//
// This algorithm is much slower than the vectorized versions, but is useful
// as a reference implementation to run the tests against, and as a fallback when
// the code runs on a CPU which does not support the AVX2 instruction set.
//
// func encodeMiniBlockInt64Default(dst *byte, src *[miniBlockSize]int64, bitWidth uint)
TEXT ·encodeMiniBlockInt64Default(SB), NOSPLIT, $0-24
    MOVQ dst+0(FP), AX
    MOVQ src+8(FP), BX
    MOVQ bitWidth+16(FP), R10

    XORQ R11, R11 // zero
    XORQ DI, DI   // bitOffset
    XORQ SI, SI
loop:
    MOVQ DI, CX
    MOVQ DI, DX

    ANDQ $0b111111, CX // bitOffset % 64
    SHRQ $6, DX        // bitOffset / 64

    MOVQ (BX)(SI*8), R8
    MOVQ R8, R9
    SHLQ CX, R8
    NEGQ CX
    ADDQ $64, CX
    SHRQ CX, R9
    CMPQ CX, $64
    CMOVQEQ R11, R9 // needed because shifting by more than 63 is undefined

    ORQ R8, 0(AX)(DX*8)
    ORQ R9, 8(AX)(DX*8)

    ADDQ R10, DI
    INCQ SI
    CMPQ SI, $miniBlockSize
    JNE loop
    RET

// func encodeMiniBlockInt64x1bitAVX2(dst *byte, src *[miniBlockSize]int64)
TEXT ·encodeMiniBlockInt64x1bitAVX2(SB), NOSPLIT, $0-16
    MOVQ dst+0(FP), AX
    MOVQ src+8(FP), BX

    VMOVDQU 0(BX), Y0
    VMOVDQU 32(BX), Y1
    VMOVDQU 64(BX), Y2
    VMOVDQU 96(BX), Y3
    VMOVDQU 128(BX), Y4
    VMOVDQU 160(BX), Y5
    VMOVDQU 192(BX), Y6
    VMOVDQU 224(BX), Y7

    VPSLLQ $63, Y0, Y0
    VPSLLQ $63, Y1, Y1
    VPSLLQ $63, Y2, Y2
    VPSLLQ $63, Y3, Y3
    VPSLLQ $63, Y4, Y4
    VPSLLQ $63, Y5, Y5
    VPSLLQ $63, Y6, Y6
    VPSLLQ $63, Y7, Y7

    VMOVMSKPD Y0, R8
    VMOVMSKPD Y1, R9
    VMOVMSKPD Y2, R10
    VMOVMSKPD Y3, R11
    VMOVMSKPD Y4, R12
    VMOVMSKPD Y5, R13
    VMOVMSKPD Y6, R14
    VMOVMSKPD Y7, R15

    SHLL $4, R9
    SHLL $8, R10
    SHLL $12, R11
    SHLL $16, R12
    SHLL $20, R13
    SHLL $24, R14
    SHLL $28, R15

    ORL R9, R8
    ORL R11, R10
    ORL R13, R12
    ORL R15, R14
    ORL R10, R8
    ORL R14, R12
    ORL R12, R8

    MOVL R8, (AX)
    VZEROUPPER
    RET
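
// The bit offset arithmetic in encodeMiniBlockInt64Default above corresponds
// to the following Go loop (a simplified sketch with dst as a byte slice; like
// the assembly it assumes dst is zero initialized with at least one extra 8
// byte word of padding, and it requires encoding/binary):
//
//    func encodeMiniBlockInt64(dst []byte, src *[miniBlockSize]int64, bitWidth uint) {
//        bitOffset := uint(0)
//        for _, v := range src {
//            i := bitOffset / 64
//            j := bitOffset % 64
//            lo := binary.LittleEndian.Uint64(dst[8*i:])
//            hi := binary.LittleEndian.Uint64(dst[8*i+8:])
//            lo |= uint64(v) << j
//            hi |= uint64(v) >> (64 - j) // 0 when j == 0, since Go defines shifts >= 64
//            binary.LittleEndian.PutUint64(dst[8*i:], lo)
//            binary.LittleEndian.PutUint64(dst[8*i+8:], hi)
//            bitOffset += bitWidth
//        }
//    }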

// func encodeMiniBlockInt64x2bitsAVX2(dst *byte, src *[miniBlockSize]int64)
TEXT ·encodeMiniBlockInt64x2bitsAVX2(SB), NOSPLIT, $0-16
    MOVQ dst+0(FP), AX
    MOVQ src+8(FP), BX

    VMOVDQU 0(BX), Y8
    VMOVDQU 32(BX), Y9
    VMOVDQU 64(BX), Y10
    VMOVDQU 96(BX), Y11
    VMOVDQU 128(BX), Y12
    VMOVDQU 160(BX), Y13
    VMOVDQU 192(BX), Y14
    VMOVDQU 224(BX), Y15

    VPSLLQ $63, Y8, Y0
    VPSLLQ $63, Y9, Y1
    VPSLLQ $63, Y10, Y2
    VPSLLQ $63, Y11, Y3
    VPSLLQ $63, Y12, Y4
    VPSLLQ $63, Y13, Y5
    VPSLLQ $63, Y14, Y6
    VPSLLQ $63, Y15, Y7

    VMOVMSKPD Y0, R8
    VMOVMSKPD Y1, R9
    VMOVMSKPD Y2, R10
    VMOVMSKPD Y3, R11
    VMOVMSKPD Y4, R12
    VMOVMSKPD Y5, R13
    VMOVMSKPD Y6, R14
    VMOVMSKPD Y7, R15

    SHLQ $4, R9
    SHLQ $8, R10
    SHLQ $12, R11
    SHLQ $16, R12
    SHLQ $20, R13
    SHLQ $24, R14
    SHLQ $28, R15

    ORQ R9, R8
    ORQ R11, R10
    ORQ R13, R12
    ORQ R15, R14
    ORQ R10, R8
    ORQ R14, R12
    ORQ R12, R8

    MOVQ $0x5555555555555555, CX // 0b010101...
    PDEPQ CX, R8, CX

    VPSLLQ $62, Y8, Y8
    VPSLLQ $62, Y9, Y9
    VPSLLQ $62, Y10, Y10
    VPSLLQ $62, Y11, Y11
    VPSLLQ $62, Y12, Y12
    VPSLLQ $62, Y13, Y13
    VPSLLQ $62, Y14, Y14
    VPSLLQ $62, Y15, Y15

    VMOVMSKPD Y8, R8
    VMOVMSKPD Y9, R9
    VMOVMSKPD Y10, R10
    VMOVMSKPD Y11, R11
    VMOVMSKPD Y12, R12
    VMOVMSKPD Y13, R13
    VMOVMSKPD Y14, R14
    VMOVMSKPD Y15, R15

    SHLQ $4, R9
    SHLQ $8, R10
    SHLQ $12, R11
    SHLQ $16, R12
    SHLQ $20, R13
    SHLQ $24, R14
    SHLQ $28, R15

    ORQ R9, R8
    ORQ R11, R10
    ORQ R13, R12
    ORQ R15, R14
    ORQ R10, R8
    ORQ R14, R12
    ORQ R12, R8

    MOVQ $0xAAAAAAAAAAAAAAAA, DX // 0b101010...
    PDEPQ DX, R8, DX
    ORQ DX, CX
    MOVQ CX, (AX)
    VZEROUPPER
    RET

// func encodeMiniBlockInt64x64bitsAVX2(dst *byte, src *[miniBlockSize]int64)
TEXT ·encodeMiniBlockInt64x64bitsAVX2(SB), NOSPLIT, $0-16
    MOVQ dst+0(FP), AX
    MOVQ src+8(FP), BX
    VMOVDQU 0(BX), Y0
    VMOVDQU 32(BX), Y1
    VMOVDQU 64(BX), Y2
    VMOVDQU 96(BX), Y3
    VMOVDQU 128(BX), Y4
    VMOVDQU 160(BX), Y5
    VMOVDQU 192(BX), Y6
    VMOVDQU 224(BX), Y7
    VMOVDQU Y0, 0(AX)
    VMOVDQU Y1, 32(AX)
    VMOVDQU Y2, 64(AX)
    VMOVDQU Y3, 96(AX)
    VMOVDQU Y4, 128(AX)
    VMOVDQU Y5, 160(AX)
    VMOVDQU Y6, 192(AX)
    VMOVDQU Y7, 224(AX)
    VZEROUPPER
    RET

// func decodeBlockInt64Default(dst []int64, minDelta, lastValue int64) int64
TEXT ·decodeBlockInt64Default(SB), NOSPLIT, $0-48
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), BX
    MOVQ minDelta+24(FP), CX
    MOVQ lastValue+32(FP), DX
    XORQ SI, SI
    JMP test
loop:
    MOVQ (AX)(SI*8), DI
    ADDQ CX, DI
    ADDQ DI, DX
    MOVQ DX, (AX)(SI*8)
    INCQ SI
test:
    CMPQ SI, BX
    JNE loop
done:
    MOVQ DX, ret+40(FP)
    RET
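
// Taken together, the routines in this file implement the per-block steps of
// the Parquet DELTA_BINARY_PACKED encoding. A rough sketch of how the block
// encoding side composes, illustrative only (the actual driver lives in the Go
// code of this package, see the mention of encodeInt32AVX2 earlier in this
// file):
//
//    lastValue = blockDeltaInt64AVX2(block, lastValue) // replace values with deltas
//    minDelta := blockMinInt64AVX2(block)              // smallest delta of the block
//    blockSubInt64AVX2(block, minDelta)                // make all deltas non-negative
//    blockBitWidthsInt64AVX2(&bitWidths, block)        // bits needed per mini block
//    // ...then minDelta and bitWidths go into the block header and each mini
//    // block is bit-packed with one of the encodeMiniBlockInt64* routines.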