github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/internal/bitpack/unpack_int64_amd64.s

//go:build !purego

#include "funcdata.h"
#include "textflag.h"

// func unpackInt64Default(dst []int64, src []uint32, bitWidth uint)
TEXT ·unpackInt64Default(SB), NOSPLIT, $0-56
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), DX
    MOVQ src_base+24(FP), BX
    MOVQ bitWidth+48(FP), CX

    MOVQ $1, R8 // bitMask = (1 << bitWidth) - 1
    SHLQ CX, R8, R8
    DECQ R8
    MOVQ CX, R9 // bitWidth

    XORQ DI, DI // bitOffset
    XORQ SI, SI // index
    XORQ R10, R10
    XORQ R11, R11
    XORQ R14, R14
    JMP test
loop:
    MOVQ DI, R10
    MOVQ DI, CX
    SHRQ $5, R10      // i = bitOffset / 32
    ANDQ $0b11111, CX // j = bitOffset % 32

    MOVLQZX (BX)(R10*4), R11
    MOVQ R8, R12  // d = bitMask
    SHLQ CX, R12  // d = d << j
    ANDQ R12, R11 // d = src[i] & d
    SHRQ CX, R11  // d = d >> j

    MOVQ CX, R13
    ADDQ R9, R13
    CMPQ R13, $32
    JBE next // j+bitWidth <= 32 ?
    MOVQ CX, R15 // j

    MOVLQZX 4(BX)(R10*4), R14
    MOVQ $32, CX
    SUBQ R15, CX  // k = 32 - j
    MOVQ R8, R12  // c = bitMask
    SHRQ CX, R12  // c = c >> k
    ANDQ R12, R14 // c = src[i+1] & c
    SHLQ CX, R14  // c = c << k
    ORQ R14, R11  // d = d | c

    CMPQ R13, $64
    JBE next

    MOVLQZX 8(BX)(R10*4), R14
    MOVQ $64, CX
    SUBQ R15, CX  // k = 64 - j
    MOVQ R8, R12  // c = bitMask
    SHRQ CX, R12  // c = c >> k
    ANDQ R12, R14 // c = src[i+2] & c
    SHLQ CX, R14  // c = c << k
    ORQ R14, R11  // d = d | c
next:
    MOVQ R11, (AX)(SI*8) // dst[n] = d
    ADDQ R9, DI          // bitOffset += bitWidth
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
    RET

// This bit unpacking function was inspired by the 32 bit version, but adapted
// to account for the fact that eight 64 bit values span across two YMM
// registers, and across lanes of YMM registers.
//
// Because of the two lanes of YMM registers, we cannot use the VPSHUFB
// instruction to dispatch bytes of the input to the registers. Instead we use
// the VPERMD instruction, which has higher latency but supports dispatching
// bytes across register lanes. Measurable throughput gains remain despite the
// algorithm spending a few more CPU cycles per loop.
//
// The initialization phase of this algorithm generates masks for permutations
// and shifts used to decode the bit-packed values.
//
// The permutation masks are written to Y7 and Y8, and contain the results of
// this formula:
//
//      temp[i] = (bitWidth * i) / 32
//      mask[i] = temp[i] | ((temp[i] + 1) << 32)
//
// Since VPERMQ only supports reading the permutation combination from an
// immediate value, we use VPERMD and generate permutations for pairs of two
// consecutive 32 bit words, which is why the upper part of each 64 bit word
// is set to (temp[i]+1)<<32.
//
// The masks for right shifts are written to Y5 and Y6, and computed with this
// formula:
//
//      shift[i] = (bitWidth * i) - (32 * ((bitWidth * i) / 32))
//
// The amount to shift by is the bit offset of each value within the 32 bit
// word that its first bits are read from, i.e. (bitWidth * i) modulo 32.
//
// Technically the masks could be precomputed and declared in global tables;
// however, declaring masks for every bit width is tedious and makes code
// maintenance more costly for no measurable benefit on production workloads.
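//
// As a rough, illustrative sketch only (not code from this package; the
// function name makeMasks is hypothetical), the mask generation described
// above could be written in Go as:
//
//      func makeMasks(bitWidth uint64) (perm, shift [8]uint64) {
//              for i := uint64(0); i < 8; i++ {
//                      temp := (bitWidth * i) / 32         // index of the 32 bit word holding the first bits of value i
//                      perm[i] = temp | ((temp + 1) << 32) // pair of consecutive words selected by VPERMD
//                      shift[i] = (bitWidth * i) - 32*temp // bit offset of value i within that first word
//              }
//              return perm, shift
//      }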
//
// func unpackInt64x1to32bitsAVX2(dst []int64, src []byte, bitWidth uint)
TEXT ·unpackInt64x1to32bitsAVX2(SB), NOSPLIT, $56-56
    NO_LOCAL_POINTERS
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), DX
    MOVQ src_base+24(FP), BX
    MOVQ bitWidth+48(FP), CX

    CMPQ DX, $8
    JB tail

    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI
    XORQ SI, SI

    MOVQ $1, R8
    SHLQ CX, R8
    DECQ R8
    MOVQ R8, X0
    VPBROADCASTQ X0, Y0 // bitMask = (1 << bitWidth) - 1

    VPCMPEQQ Y1, Y1, Y1
    VPSRLQ $63, Y1, Y1 // [1,1,1,1]

    MOVQ CX, X2
    VPBROADCASTQ X2, Y2 // [bitWidth]

    VMOVDQU range0n7<>+0(SB), Y3  // [0,1,2,3]
    VMOVDQU range0n7<>+32(SB), Y4 // [4,5,6,7]

    VPMULLD Y2, Y3, Y5 // [bitWidth] * [0,1,2,3]
    VPMULLD Y2, Y4, Y6 // [bitWidth] * [4,5,6,7]

    VPSRLQ $5, Y5, Y7 // ([bitWidth] * [0,1,2,3]) / 32
    VPSRLQ $5, Y6, Y8 // ([bitWidth] * [4,5,6,7]) / 32

    VPSLLQ $5, Y7, Y9  // (([bitWidth] * [0,1,2,3]) / 32) * 32
    VPSLLQ $5, Y8, Y10 // (([bitWidth] * [4,5,6,7]) / 32) * 32

    VPADDQ Y1, Y7, Y11
    VPADDQ Y1, Y8, Y12
    VPSLLQ $32, Y11, Y11
    VPSLLQ $32, Y12, Y12
    VPOR Y11, Y7, Y7 // permutations[i] = [i | ((i + 1) << 32)]
    VPOR Y12, Y8, Y8 // permutations[i] = [i | ((i + 1) << 32)]

    VPSUBQ Y9, Y5, Y5 // shifts
    VPSUBQ Y10, Y6, Y6
loop:
    VMOVDQU (BX), Y1

    VPERMD Y1, Y7, Y2
    VPERMD Y1, Y8, Y3

    VPSRLVQ Y5, Y2, Y2
    VPSRLVQ Y6, Y3, Y3

    VPAND Y0, Y2, Y2
    VPAND Y0, Y3, Y3

    VMOVDQU Y2, (AX)(SI*8)
    VMOVDQU Y3, 32(AX)(SI*8)

    ADDQ CX, BX
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loop
    VZEROUPPER

    CMPQ SI, DX
    JE done
    LEAQ (AX)(SI*8), AX
    SUBQ SI, DX
tail:
    MOVQ AX, dst_base-56(SP)
    MOVQ DX, dst_len-48(SP)
    MOVQ BX, src_base-32(SP)
    MOVQ CX, bitWidth-8(SP)
    CALL ·unpackInt64Default(SB)
done:
    RET

GLOBL range0n7<>(SB), RODATA|NOPTR, $64
DATA range0n7<>+0(SB)/8,  $0
DATA range0n7<>+8(SB)/8,  $1
DATA range0n7<>+16(SB)/8, $2
DATA range0n7<>+24(SB)/8, $3
DATA range0n7<>+32(SB)/8, $4
DATA range0n7<>+40(SB)/8, $5
DATA range0n7<>+48(SB)/8, $6
DATA range0n7<>+56(SB)/8, $7
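
// As a point of reference, the scalar loop implemented by unpackInt64Default
// above corresponds roughly to the following Go sketch (illustrative only;
// the function name unpackInt64Scalar is hypothetical, and the caller is
// assumed to pad src so that reads of src[i+1] and src[i+2] stay in bounds):
//
//      func unpackInt64Scalar(dst []int64, src []uint32, bitWidth uint) {
//              bitMask := (uint64(1) << bitWidth) - 1 // wraps to all ones when bitWidth == 64
//              bitOffset := uint(0)
//              for n := range dst {
//                      i := bitOffset / 32
//                      j := bitOffset % 32
//                      d := (uint64(src[i]) & (bitMask << j)) >> j
//                      if j+bitWidth > 32 {
//                              k := 32 - j
//                              d |= (uint64(src[i+1]) & (bitMask >> k)) << k
//                              if j+bitWidth > 64 {
//                                      k := 64 - j
//                                      d |= (uint64(src[i+2]) & (bitMask >> k)) << k
//                              }
//                      }
//                      dst[n] = int64(d)
//                      bitOffset += bitWidth
//              }
//      }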