github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/internal/bitpack/unpack_int32_amd64.s

//go:build !purego

#include "funcdata.h"
#include "textflag.h"

// func unpackInt32Default(dst []int32, src []byte, bitWidth uint)
TEXT ·unpackInt32Default(SB), NOSPLIT, $0-56
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), DX
    MOVQ src_base+24(FP), BX
    MOVQ bitWidth+48(FP), CX

    MOVQ $1, R8 // bitMask = (1 << bitWidth) - 1
    SHLQ CX, R8
    DECQ R8
    MOVQ CX, R9 // bitWidth

    XORQ DI, DI // bitOffset
    XORQ SI, SI // index
    JMP test
loop:
    MOVQ DI, R10
    MOVQ DI, CX
    SHRQ $5, R10      // i = bitOffset / 32
    ANDQ $0b11111, CX // j = bitOffset % 32

    MOVL (BX)(R10*4), R11
    MOVL R8, R12  // d = bitMask
    SHLL CX, R12  // d = d << j
    ANDL R12, R11 // d = src[i] & d
    SHRL CX, R11  // d = d >> j

    MOVL CX, R13
    ADDL R9, R13
    CMPL R13, $32
    JBE next // j+bitWidth <= 32 ?

    MOVL 4(BX)(R10*4), R14
    MOVL CX, R12
    MOVL $32, CX
    SUBL R12, CX  // k = 32 - j
    MOVL R8, R12  // c = bitMask
    SHRL CX, R12  // c = c >> k
    ANDL R12, R14 // c = src[i+1] & c
    SHLL CX, R14  // c = c << k
    ORL R14, R11  // d = d | c
next:
    MOVL R11, (AX)(SI*4) // dst[n] = d
    ADDQ R9, DI          // bitOffset += bitWidth
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
    RET
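
// The pure-Go sketch below is an illustrative equivalent of unpackInt32Default
// above; the function name is hypothetical, and the real fallback lives in the
// purego build of this package. It follows the register comments of the
// assembly: d collects the low bits of a value, and c the high bits when the
// value straddles two 32-bit words.
//
//     func unpackInt32DefaultGo(dst []int32, src []byte, bitWidth uint) {
//         bitMask := uint32(uint64(1)<<bitWidth - 1)
//         bitOffset := uint(0)
//         for n := range dst {
//             i := bitOffset / 32 // index of the word holding the low bits
//             j := bitOffset % 32 // bit position within that word
//             w := uint32(src[i*4]) | uint32(src[i*4+1])<<8 |
//                 uint32(src[i*4+2])<<16 | uint32(src[i*4+3])<<24
//             d := (w & (bitMask << j)) >> j
//             if j+bitWidth > 32 { // the value spans two words
//                 k := 32 - j
//                 w = uint32(src[i*4+4]) | uint32(src[i*4+5])<<8 |
//                     uint32(src[i*4+6])<<16 | uint32(src[i*4+7])<<24
//                 d |= (w & (bitMask >> k)) << k
//             }
//             dst[n] = int32(d)
//             bitOffset += bitWidth
//         }
//     }
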
// -----------------------------------------------------------------------------
// The unpack* functions below are adaptations of the algorithms described in
// "Decoding billions of integers per second through vectorization" by
// D. Lemire & L. Boytsov. The following changes were made:
//
// - The paper describes two methods for decoding integers, called "horizontal"
//   and "vertical". The "horizontal" version is the one that best applies to
//   the bit packing done in the Parquet delta encoding; however, it also
//   differs in some ways, since many compression techniques discussed in the
//   paper are not part of the Parquet format.
//
// - The paper focuses on implementations based on SSE instructions and
//   describes how to use PMULLD to compensate for the lack of a variable bit
//   shift for packed integers. Our version of the bit unpacking algorithms
//   uses AVX2 and can perform variable bit shifts using VPSRLVD, which yields
//   better throughput since the instruction latency is a single CPU cycle,
//   vs 10 for VPMULLD.
//
// - The reference implementation at https://github.com/lemire/FastPFor/ uses
//   specializations for each bit size, resulting in 32 unique functions.
//   Our version here is more generic: we provide 3 variations of the
//   algorithm, for bit widths 1 to 16, 17 to 26, and 27 to 31 (unpacking
//   32 bit values is a simple copy). In that regard, our implementation is
//   somewhat an improvement over the reference, since it uses less code and
//   less memory to hold the shuffle masks and shift tables.
//
// Technically, each specialization of our functions could be expressed by the
// algorithm used for unpacking values of 27 to 31 bits. However, multiple steps
// of the main loop can be removed for lower bit widths, providing up to ~35%
// better throughput for smaller sizes. Since we expect delta encoding to often
// pack values to smaller bit widths, the specializations are worth the extra
// complexity.
//
// For more details, see: https://arxiv.org/pdf/1209.2137v5.pdf
// -----------------------------------------------------------------------------

// unpackInt32x1to16bitsAVX2 is the implementation of the bit unpacking
// algorithm for inputs of bit width 1 to 16.
//
// In this version of the algorithm, we can perform a single memory load in
// each loop iteration since we know that 8 values will fit in a single XMM
// register.
//
// func unpackInt32x1to16bitsAVX2(dst []int32, src []byte, bitWidth uint)
TEXT ·unpackInt32x1to16bitsAVX2(SB), NOSPLIT, $56-56
    NO_LOCAL_POINTERS
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), DX
    MOVQ src_base+24(FP), BX
    MOVQ bitWidth+48(FP), CX

    CMPQ DX, $8
    JB tail

    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI
    XORQ SI, SI

    MOVQ $1, R8
    SHLQ CX, R8
    DECQ R8
    MOVQ R8, X0
    VPBROADCASTD X0, X0 // bitMask = (1 << bitWidth) - 1

    MOVQ CX, R9
    DECQ R9
    SHLQ $5, R9 // 32 * (bitWidth - 1)

    MOVQ CX, R10
    DECQ R10
    SHLQ $5, R10
    ANDQ $0xFF, R10 // (32 * (bitWidth - 1)) % 256

    LEAQ ·shuffleInt32x1to16bits(SB), R11
    VMOVDQA (R11)(R9*1), X1
    VMOVDQA 16(R11)(R9*1), X2

    LEAQ ·shiftRightInt32(SB), R12
    VMOVDQA (R12)(R10*1), X3
    VMOVDQA 16(R12)(R10*1), X4
loop:
    VMOVDQU (BX), X7

    VPSHUFB X1, X7, X5
    VPSHUFB X2, X7, X6

    VPSRLVD X3, X5, X5
    VPSRLVD X4, X6, X6

    VPAND X0, X5, X5
    VPAND X0, X6, X6

    VMOVDQU X5, (AX)(SI*4)
    VMOVDQU X6, 16(AX)(SI*4)

    ADDQ CX, BX
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loop
    VZEROUPPER

    CMPQ SI, DX
    JE done
    LEAQ (AX)(SI*4), AX
    SUBQ SI, DX
tail:
    MOVQ AX, dst_base-56(SP)
    MOVQ DX, dst_len-48(SP)
    MOVQ BX, src_base-32(SP)
    MOVQ CX, bitWidth-8(SP)
    CALL ·unpackInt32Default(SB)
done:
    RET
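
// The VPSHUFB/VPSRLVD/VPAND sequence above is byte-granular: each 32-bit
// output lane receives the source bytes covering its value, then a per-lane
// right shift of at most 7 bits plus a mask extract it. Each iteration
// consumes exactly bitWidth bytes of src (8 values x bitWidth bits), which is
// what ADDQ CX, BX advances by. A rough pure-Go model of one iteration
// (hypothetical name, with a bounds check added for clarity):
//
//     func unpackInt32x1to16Model(dst []int32, src []byte, bitWidth uint) {
//         mask := uint32(uint64(1)<<bitWidth - 1) // VPBROADCASTD bitMask
//         for lane := uint(0); lane < 8; lane++ {
//             bit := lane * bitWidth
//             b := bit / 8 // first byte gathered by VPSHUFB
//             s := bit % 8 // per-lane shift applied by VPSRLVD
//             var w uint32
//             for k := uint(0); k < 4 && b+k < uint(len(src)); k++ {
//                 w |= uint32(src[b+k]) << (8 * k)
//             }
//             dst[lane] = int32((w >> s) & mask) // VPAND with bitMask
//         }
//     }
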
// unpackInt32x17to26bitsAVX2 is the implementation of the bit unpacking
// algorithm for inputs of bit width 17 to 26.
//
// In this version of the algorithm, we need to load 32 bytes at each loop
// iteration because the 8 bit-packed values will span across two XMM
// registers.
//
// func unpackInt32x17to26bitsAVX2(dst []int32, src []byte, bitWidth uint)
TEXT ·unpackInt32x17to26bitsAVX2(SB), NOSPLIT, $56-56
    NO_LOCAL_POINTERS
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), DX
    MOVQ src_base+24(FP), BX
    MOVQ bitWidth+48(FP), CX

    CMPQ DX, $8
    JB tail

    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI
    XORQ SI, SI

    MOVQ $1, R8
    SHLQ CX, R8
    DECQ R8
    MOVQ R8, X0
    VPBROADCASTD X0, X0

    MOVQ CX, R9
    SUBQ $17, R9
    IMULQ $48, R9 // 48 * (bitWidth - 17)

    MOVQ CX, R10
    DECQ R10
    SHLQ $5, R10
    ANDQ $0xFF, R10 // (32 * (bitWidth - 1)) % 256

    LEAQ ·shuffleInt32x17to26bits(SB), R11
    VMOVDQA (R11)(R9*1), X1
    VMOVDQA 16(R11)(R9*1), X2
    VMOVDQA 32(R11)(R9*1), X3

    LEAQ ·shiftRightInt32(SB), R12
    VMOVDQA (R12)(R10*1), X4
    VMOVDQA 16(R12)(R10*1), X5
loop:
    VMOVDQU (BX), X6
    VMOVDQU 16(BX), X7

    VPSHUFB X1, X6, X8
    VPSHUFB X2, X6, X9
    VPSHUFB X3, X7, X10
    VPOR X10, X9, X9

    VPSRLVD X4, X8, X8
    VPSRLVD X5, X9, X9

    VPAND X0, X8, X8
    VPAND X0, X9, X9

    VMOVDQU X8, (AX)(SI*4)
    VMOVDQU X9, 16(AX)(SI*4)

    ADDQ CX, BX
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loop
    VZEROUPPER

    CMPQ SI, DX
    JE done
    LEAQ (AX)(SI*4), AX
    SUBQ SI, DX
tail:
    MOVQ AX, dst_base-56(SP)
    MOVQ DX, dst_len-48(SP)
    MOVQ BX, src_base-32(SP)
    MOVQ CX, bitWidth-8(SP)
    CALL ·unpackInt32Default(SB)
done:
    RET
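
// The split points between the three kernels follow from this byte-granular
// scheme: lane k starts at bit k*bitWidth, so its residual shift is
// (k*bitWidth) % 8, at most 7, and a 32-bit lane must hold bitWidth + shift
// bits. For widths up to 26 that sum stays within 32 bits; from 27 on it can
// reach 31+7 = 38 bits (5 bytes), which forces the two-step LEFT/RIGHT shift
// scheme of the next function. Widths up to 16 additionally fit all 8 values
// in a single 16-byte load. A small program checking the arithmetic:
//
//     package main
//
//     import "fmt"
//
//     func main() {
//         for w := uint(1); w <= 31; w++ {
//             maxShift := uint(0)
//             for lane := uint(0); lane < 8; lane++ {
//                 if s := lane * w % 8; s > maxShift {
//                     maxShift = s
//                 }
//             }
//             fmt.Printf("width=%2d bytes/iter=%2d width+maxShift=%2d\n",
//                 w, w, w+maxShift)
//         }
//     }
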
// unpackInt32x27to31bitsAVX2 is the implementation of the bit unpacking
// algorithm for inputs of bit width 27 to 31.
//
// In this version of the algorithm, the bit-packed values may span up to
// 5 bytes. The simpler approach used for smaller bit widths, where a single
// shuffle + shift unpacks the values, does not work anymore.
//
// Values are unpacked in two steps: the first extracts the lower bits, which
// are shifted RIGHT to align on the beginning of 32 bit words; the second
// extracts the upper bits, which are shifted LEFT to be moved to the end of
// the 32 bit words.
//
// The amount of LEFT shift is always "8 minus the amount of RIGHT shift".
//
// func unpackInt32x27to31bitsAVX2(dst []int32, src []byte, bitWidth uint)
TEXT ·unpackInt32x27to31bitsAVX2(SB), NOSPLIT, $56-56
    NO_LOCAL_POINTERS
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), DX
    MOVQ src_base+24(FP), BX
    MOVQ bitWidth+48(FP), CX

    CMPQ DX, $8
    JB tail

    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI
    XORQ SI, SI

    MOVQ $1, R8
    SHLQ CX, R8
    DECQ R8
    MOVQ R8, X0
    VPBROADCASTD X0, X0

    MOVQ CX, R9
    SUBQ $27, R9
    IMULQ $80, R9 // 80 * (bitWidth - 27)

    MOVQ CX, R10
    DECQ R10
    SHLQ $5, R10
    ANDQ $0xFF, R10 // (32 * (bitWidth - 1)) % 256

    LEAQ ·shuffleInt32x27to31bits(SB), R11
    VMOVDQA (R11)(R9*1), X1
    VMOVDQA 16(R11)(R9*1), X2
    VMOVDQA 32(R11)(R9*1), X3
    VMOVDQA 48(R11)(R9*1), X4
    VMOVDQA 64(R11)(R9*1), X5

    LEAQ ·shiftRightInt32(SB), R12
    LEAQ ·shiftLeftInt32(SB), R13
    VMOVDQA (R12)(R10*1), X6
    VMOVDQA (R13)(R10*1), X7
    VMOVDQA 16(R12)(R10*1), X8
    VMOVDQA 16(R13)(R10*1), X9
loop:
    VMOVDQU (BX), X10
    VMOVDQU 16(BX), X11

    VPSHUFB X1, X10, X12
    VPSHUFB X2, X10, X13
    VPSHUFB X3, X10, X14
    VPSHUFB X4, X11, X15
    VPSHUFB X5, X11, X11

    VPSRLVD X6, X12, X12
    VPSLLVD X7, X13, X13
    VPSRLVD X8, X14, X14
    VPSRLVD X8, X15, X15
    VPSLLVD X9, X11, X11

    VPOR X13, X12, X12
    VPOR X15, X14, X14
    VPOR X11, X14, X14

    VPAND X0, X12, X12
    VPAND X0, X14, X14

    VMOVDQU X12, (AX)(SI*4)
    VMOVDQU X14, 16(AX)(SI*4)

    ADDQ CX, BX
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loop
    VZEROUPPER

    CMPQ SI, DX
    JE done
    LEAQ (AX)(SI*4), AX
    SUBQ SI, DX
tail:
    MOVQ AX, dst_base-56(SP)
    MOVQ DX, dst_len-48(SP)
    MOVQ BX, src_base-32(SP)
    MOVQ CX, bitWidth-8(SP)
    CALL ·unpackInt32Default(SB)
done:
    RET
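
// For reference, a hypothetical dispatcher over these kernels could look like
// the sketch below; the package's actual selection logic and its AVX2 feature
// detection (e.g. cpu.X86.HasAVX2 from golang.org/x/sys/cpu) live in the
// accompanying Go files, and bitWidth is assumed to be in [1,32]. As noted in
// the comment near the top of this file, unpacking 32 bit values is a simple
// copy.
//
//     func unpackInt32(dst []int32, src []byte, bitWidth uint) {
//         switch {
//         case bitWidth <= 16:
//             unpackInt32x1to16bitsAVX2(dst, src, bitWidth)
//         case bitWidth <= 26:
//             unpackInt32x17to26bitsAVX2(dst, src, bitWidth)
//         case bitWidth <= 31:
//             unpackInt32x27to31bitsAVX2(dst, src, bitWidth)
//         default: // bitWidth == 32: values are stored verbatim
//             for i := range dst {
//                 dst[i] = int32(uint32(src[4*i]) | uint32(src[4*i+1])<<8 |
//                     uint32(src[4*i+2])<<16 | uint32(src[4*i+3])<<24)
//             }
//         }
//     }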