//go:build !purego

#include "textflag.h"

// This file contains optimizations of the BYTE_STREAM_SPLIT encoding using AVX2
// and AVX512 (when available).
//
// The AVX2 and AVX512 instruction sets come with instructions to load memory
// from, or store memory to, sparse locations, called VPGATHER and VPSCATTER.
// VPGATHER has been available since AVX2, while VPSCATTER was introduced with
// AVX512. Gathering bytes from sparse memory locations is useful during the
// decoding process since we are recomposing 32 or 64 bit floating point values
// from 4 or 8 bytes scattered across the input byte array.
//
// To either deconstruct or reconstruct floating point values, we need to
// reorder the bytes of each value. If we have 4 32 bit floats, we can permute
// their bytes so that the first group contains all the first bytes, the second
// group contains all the second bytes, etc... The byte permutation is performed
// by the VPSHUFB instruction for 32 bit floats, and by the VPERMB instruction
// (which requires the AVX512_VBMI extension) for 64 bit floats.
//
// We use different instructions because VPSHUFB operates on two independent
// lanes of 16 bytes when used on YMM registers. 4 32 bit floats take 16 bytes,
// so a YMM register can hold two lanes of 4 32 bit floats and a single VPSHUFB
// invocation can permute both sets of values. For 64 bit floats we need to
// permute 8 values, which take 64 bytes and therefore must be held in a ZMM
// register; permuting bytes across the entire register is only possible with
// VPERMB.
//
// Technically we could use ZMM registers when working on 32 bit floats to
// process 16 values per iteration. However, measurements indicated that the
// latency of VPGATHERDD/VPSCATTERDD on ZMM registers did not improve the
// throughput of the algorithms, while working on more values increased the
// code complexity. Using YMM registers offered the best balance between
// performance and maintainability.
//
// At a high level the vectorized algorithms are the following:
//
// encoding
// --------
// * Load a vector of data from the input buffer
// * Permute bytes, grouping bytes by index
// * Scatter bytes of the register to the output buffer
//
// decoding
// --------
// * Gather sparse bytes from the input buffer
// * Permute bytes, reconstructing the original values
// * Store the vector in the output buffer
//
// When AVX instructions are not available, the functions fall back to scalar
// implementations of the algorithms. These yield much lower throughput, but
// still perform 20-30% better than the code generated by the Go compiler.
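// As a reference for the scalar fallbacks below, here is a minimal Go sketch
// of the BYTE_STREAM_SPLIT transform for 32 bit floats (the function names
// are illustrative only, not part of the package API):
//
//	func encodeFloatGo(dst, src []byte) {
//		n := len(src) / 4 // number of 32 bit values
//		for i := 0; i < n; i++ {
//			for j := 0; j < 4; j++ {
//				dst[j*n+i] = src[i*4+j] // byte j of value i goes to stream j
//			}
//		}
//	}
//
//	func decodeFloatGo(dst, src []byte) {
//		n := len(dst) / 4 // number of 32 bit values
//		for i := 0; i < n; i++ {
//			for j := 0; j < 4; j++ {
//				dst[i*4+j] = src[j*n+i] // the i-th byte of stream j is byte j of value i
//			}
//		}
//	}
//
// The 64 bit variants are identical with 4 replaced by 8.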

// func encodeFloat(dst, src []byte)
TEXT ·encodeFloat(SB), NOSPLIT, $0-48
	MOVQ src_base+24(FP), AX
	MOVQ src_len+32(FP), BX
	MOVQ dst_base+0(FP), DX

	MOVQ AX, CX
	ADDQ BX, CX // end
	SHRQ $2, BX // len

	CMPQ BX, $0
	JE done

	CMPB ·encodeFloatHasAVX512(SB), $0
	JE loop1x4

	CMPQ BX, $8
	JB loop1x4

	// DI = end of the input rounded down to a multiple of 32 bytes (8 floats)
	MOVQ CX, DI
	SUBQ AX, DI
	SHRQ $5, DI
	SHLQ $5, DI
	ADDQ AX, DI

	// Y2 = scatter offsets {0, n, 2n, 3n, 4, n+4, 2n+4, 3n+4}, n = value count
	VMOVDQU32 shuffle8x4<>(SB), Y0
	VPBROADCASTD BX, Y2
	VPMULLD scale8x4<>(SB), Y2, Y2
	VPADDD offset8x4<>(SB), Y2, Y2
loop8x4:
	// set K1 to all ones; VPSCATTERDD consumes (clears) the mask
	KXORQ K1, K1, K1
	KNOTQ K1, K1

	VMOVDQU32 (AX), Y1
	VPSHUFB Y0, Y1, Y1
	VPSCATTERDD Y1, K1, (DX)(Y2*1)

	ADDQ $32, AX
	ADDQ $8, DX
	CMPQ AX, DI
	JNE loop8x4
	VZEROUPPER

	CMPQ AX, CX
	JE done
loop1x4:
	MOVL (AX), SI
	MOVQ DX, DI

	MOVB SI, (DI)
	SHRL $8, SI
	ADDQ BX, DI

	MOVB SI, (DI)
	SHRL $8, SI
	ADDQ BX, DI

	MOVB SI, (DI)
	SHRL $8, SI
	ADDQ BX, DI

	MOVB SI, (DI)

	ADDQ $4, AX
	INCQ DX
	CMPQ AX, CX
	JB loop1x4
done:
	RET

// func encodeDouble(dst, src []byte)
TEXT ·encodeDouble(SB), NOSPLIT, $0-48
	MOVQ src_base+24(FP), AX
	MOVQ src_len+32(FP), BX
	MOVQ dst_base+0(FP), DX

	MOVQ AX, CX
	ADDQ BX, CX // end
	SHRQ $3, BX // len

	CMPQ BX, $0
	JE done

	CMPB ·encodeDoubleHasAVX512(SB), $0
	JE loop1x8

	CMPQ BX, $8
	JB loop1x8

	// DI = end of the input rounded down to a multiple of 64 bytes (8 doubles)
	MOVQ CX, DI
	SUBQ AX, DI
	SHRQ $6, DI
	SHLQ $6, DI
	ADDQ AX, DI

	VMOVDQU64 shuffle8x8<>(SB), Z0
	VPBROADCASTQ BX, Z2
	VPMULLQ scale8x8<>(SB), Z2, Z2 // Z2 = scatter offsets {0, n, 2n, ..., 7n}
loop8x8:
	// set K1 to all ones; VPSCATTERQQ consumes (clears) the mask
	KXORQ K1, K1, K1
	KNOTQ K1, K1

	VMOVDQU64 (AX), Z1
	VPERMB Z1, Z0, Z1
	VPSCATTERQQ Z1, K1, (DX)(Z2*1)

	ADDQ $64, AX
	ADDQ $8, DX
	CMPQ AX, DI
	JNE loop8x8
	VZEROUPPER

	CMPQ AX, CX
	JE done
loop1x8:
	MOVQ (AX), SI
	MOVQ DX, DI

	MOVB SI, (DI)
	SHRQ $8, SI
	ADDQ BX, DI

	MOVB SI, (DI)
	SHRQ $8, SI
	ADDQ BX, DI

	MOVB SI, (DI)
	SHRQ $8, SI
	ADDQ BX, DI

	MOVB SI, (DI)
	SHRQ $8, SI
	ADDQ BX, DI

	MOVB SI, (DI)
	SHRQ $8, SI
	ADDQ BX, DI

	MOVB SI, (DI)
	SHRQ $8, SI
	ADDQ BX, DI

	MOVB SI, (DI)
	SHRQ $8, SI
	ADDQ BX, DI

	MOVB SI, (DI)

	ADDQ $8, AX
	INCQ DX
	CMPQ AX, CX
	JB loop1x8
done:
	RET

// func decodeFloat(dst, src []byte)
TEXT ·decodeFloat(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), BX
	MOVQ src_base+24(FP), DX

	MOVQ AX, CX
	ADDQ BX, CX // end
	SHRQ $2, BX // len

	CMPQ BX, $0
	JE done

	CMPB ·decodeFloatHasAVX2(SB), $0
	JE loop1x4

	CMPQ BX, $8
	JB loop1x4

	// DI = end of the output rounded down to a multiple of 32 bytes (8 floats)
	MOVQ CX, DI
	SUBQ AX, DI
	SHRQ $5, DI
	SHLQ $5, DI
	ADDQ AX, DI

	// Y2 = gather offsets (as in encodeFloat); Y3 holds an all-ones mask kept
	// as a template because VPGATHERDD consumes the working mask in Y4
	MOVQ $0xFFFFFFFF, SI
	MOVQ BX, X5
	MOVQ SI, X6
	VMOVDQU shuffle8x4<>(SB), Y0
	VPBROADCASTD X5, Y2
	VPBROADCASTD X6, Y3
	VPMULLD scale8x4<>(SB), Y2, Y2
	VPADDD offset8x4<>(SB), Y2, Y2
	VMOVDQU Y3, Y4
loop8x4:
	VPGATHERDD Y4, (DX)(Y2*1), Y1
	VPSHUFB Y0, Y1, Y1
	VMOVDQU Y1, (AX)
	VMOVDQU Y3, Y4 // reload the gather mask for the next iteration

	ADDQ $32, AX
	ADDQ $8, DX
	CMPQ AX, DI
	JNE loop8x4
	VZEROUPPER

	CMPQ AX, CX
	JE done
loop1x4:
	MOVQ DX, DI
	MOVBLZX (DI), R8
	ADDQ BX, DI
	MOVBLZX (DI), R9
	ADDQ BX, DI
	MOVBLZX (DI), R10
	ADDQ BX, DI
	MOVBLZX (DI), R11

	SHLL $8, R9
	SHLL $16, R10
	SHLL $24, R11

	ORL R9, R8
	ORL R10, R8
	ORL R11, R8

	MOVL R8, (AX)

	ADDQ $4, AX
	INCQ DX
	CMPQ AX, CX
	JB loop1x4
done:
	RET

// func decodeDouble(dst, src []byte)
TEXT ·decodeDouble(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), BX
	MOVQ src_base+24(FP), DX

	MOVQ AX, CX
	ADDQ BX, CX // end
	SHRQ $3, BX // len

	CMPQ BX, $0
	JE done

	CMPB ·decodeDoubleHasAVX512(SB), $0
	JE loop1x8

	CMPQ BX, $8
	JB loop1x8

	// DI = end of the output rounded down to a multiple of 64 bytes (8 doubles)
	MOVQ CX, DI
	SUBQ AX, DI
	SHRQ $6, DI
	SHLQ $6, DI
	ADDQ AX, DI

	VMOVDQU64 shuffle8x8<>(SB), Z0
	VPBROADCASTQ BX, Z2
	VPMULLQ scale8x8<>(SB), Z2, Z2 // Z2 = gather offsets {0, n, 2n, ..., 7n}
loop8x8:
	// set K1 to all ones; VPGATHERQQ consumes (clears) the mask
	KXORQ K1, K1, K1
	KNOTQ K1, K1

	VPGATHERQQ (DX)(Z2*1), K1, Z1
	VPERMB Z1, Z0, Z1
	VMOVDQU64 Z1, (AX)

	ADDQ $64, AX
	ADDQ $8, DX
	CMPQ AX, DI
	JNE loop8x8
	VZEROUPPER

	CMPQ AX, CX
	JE done
loop1x8:
	MOVQ DX, DI
	XORQ R12, R12

	MOVBQZX (DI), R8
	ADDQ BX, DI
	MOVBQZX (DI), R9
	ADDQ BX, DI
	MOVBQZX (DI), R10
	ADDQ BX, DI
	MOVBQZX (DI), R11
	ADDQ BX, DI

	SHLQ $8, R9
	SHLQ $16, R10
	SHLQ $24, R11

	ORQ R8, R12
	ORQ R9, R12
	ORQ R10, R12
	ORQ R11, R12

	MOVBQZX (DI), R8
	ADDQ BX, DI
	MOVBQZX (DI), R9
	ADDQ BX, DI
	MOVBQZX (DI), R10
	ADDQ BX, DI
	MOVBQZX (DI), R11

	SHLQ $32, R8
	SHLQ $40, R9
	SHLQ $48, R10
	SHLQ $56, R11

	ORQ R8, R12
	ORQ R9, R12
	ORQ R10, R12
	ORQ R11, R12

	MOVQ R12, (AX)

	ADDQ $8, AX
	INCQ DX
	CMPQ AX, CX
	JB loop1x8
done:
	RET

// scale8x4: per-dword multipliers applied to the value count to compute the
// byte offset of each stream (repeated for the two 16 byte lanes)
GLOBL scale8x4<>(SB), RODATA|NOPTR, $32
DATA scale8x4<>+0(SB)/4, $0
DATA scale8x4<>+4(SB)/4, $1
DATA scale8x4<>+8(SB)/4, $2
DATA scale8x4<>+12(SB)/4, $3
DATA scale8x4<>+16(SB)/4, $0
DATA scale8x4<>+20(SB)/4, $1
DATA scale8x4<>+24(SB)/4, $2
DATA scale8x4<>+28(SB)/4, $3

// offset8x4: the upper lane handles values 4..7, 4 bytes further into each stream
GLOBL offset8x4<>(SB), RODATA|NOPTR, $32
DATA offset8x4<>+0(SB)/4, $0
DATA offset8x4<>+4(SB)/4, $0
DATA offset8x4<>+8(SB)/4, $0
DATA offset8x4<>+12(SB)/4, $0
DATA offset8x4<>+16(SB)/4, $4
DATA offset8x4<>+20(SB)/4, $4
DATA offset8x4<>+24(SB)/4, $4
DATA offset8x4<>+28(SB)/4, $4

// shuffle8x4: VPSHUFB control performing a 4x4 byte transpose in each 16 byte lane
GLOBL shuffle8x4<>(SB), RODATA|NOPTR, $32
DATA shuffle8x4<>+0(SB)/4, $0x0C080400
DATA shuffle8x4<>+4(SB)/4, $0x0D090501
DATA shuffle8x4<>+8(SB)/4, $0x0E0A0602
DATA shuffle8x4<>+12(SB)/4, $0x0F0B0703
DATA shuffle8x4<>+16(SB)/4, $0x0C080400
DATA shuffle8x4<>+20(SB)/4, $0x0D090501
DATA shuffle8x4<>+24(SB)/4, $0x0E0A0602
DATA shuffle8x4<>+28(SB)/4, $0x0F0B0703

// scale8x8: per-qword multipliers applied to the value count for the 8 streams
GLOBL scale8x8<>(SB), RODATA|NOPTR, $64
DATA scale8x8<>+0(SB)/8, $0
DATA scale8x8<>+8(SB)/8, $1
DATA scale8x8<>+16(SB)/8, $2
DATA scale8x8<>+24(SB)/8, $3
DATA scale8x8<>+32(SB)/8, $4
DATA scale8x8<>+40(SB)/8, $5
DATA scale8x8<>+48(SB)/8, $6
DATA scale8x8<>+56(SB)/8, $7

// shuffle8x8: VPERMB control performing an 8x8 byte transpose
GLOBL shuffle8x8<>(SB), RODATA|NOPTR, $64
DATA shuffle8x8<>+0(SB)/8, $0x3830282018100800
DATA shuffle8x8<>+8(SB)/8, $0x3931292119110901
DATA shuffle8x8<>+16(SB)/8, $0x3A322A221A120A02
DATA shuffle8x8<>+24(SB)/8, $0x3B332B231B130B03
DATA shuffle8x8<>+32(SB)/8, $0x3C342C241C140C04
DATA shuffle8x8<>+40(SB)/8, $0x3D352D251D150D05
DATA shuffle8x8<>+48(SB)/8, $0x3E362E261E160E06
DATA shuffle8x8<>+56(SB)/8, $0x3F372F271F170F07
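
// The shuffle tables above are permutation controls, not data: entry i holds
// the index of the source byte that lands in output byte i. For shuffle8x8,
// entry i is (i%8)*8 + i/8; an illustrative Go snippet (not part of the build)
// that would generate the table is:
//
//	var shuffle8x8 [64]byte
//	for i := range shuffle8x8 {
//		shuffle8x8[i] = byte((i%8)*8 + i/8) // output byte i takes input byte (i%8)*8 + i/8
//	}
//
// Because transposing a square byte matrix is its own inverse, the same table
// serves both encodeDouble and decodeDouble (and likewise shuffle8x4, a 4x4
// transpose within each 16 byte lane, serves both 32 bit paths).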