// Extracted from github.com/parquet-go/parquet-go@v0.20.0/value_amd64.s

//go:build !purego

#include "textflag.h"

// Size in bytes of one parquet.Value.
#define sizeOfValue 24

// memsetValuesAVX2 is the AVX2 implementation of memsetValues: it stores a
// copy of model (the second argument) into every element of values (the first
// argument).
//
// Four 24-byte values take up exactly as many bytes as three 32-byte YMM
// registers (4 x 24 = 3 x 32 = 96), so the slice is filled four values at a
// time with three vector stores per iteration. The remaining values (at most
// three) are written one at a time with three 8-byte stores each.
//
// The trailing uint64 argument only pads the argument frame so that the
// 32-byte vector load starting at model stays inside the arguments; its bytes
// land in the fourth lane of that load and no permutation below selects it.
//
// func memsetValuesAVX2(values []Value, model Value, _ uint64)
TEXT ·memsetValuesAVX2(SB), NOSPLIT, $0-56 // 48 bytes of arguments + 8 bytes of padding
	MOVQ values_base+0(FP), AX // AX = address of values[0]
	MOVQ values_len+8(FP), CX  // CX = len(values)

	// BX = total number of bytes to write, DX = byte offset of the next write.
	MOVQ CX, BX
	IMULQ $sizeOfValue, BX
	XORQ DX, DX

	// The three 8-byte words of model, used by the one-value-at-a-time loop.
	MOVQ model_ptr+24(FP), R9
	MOVQ model_u64+32(FP), R10
	MOVQ model+40(FP), R11 // go vet complains about this line but it's OK

	// Fewer than four values: nothing for the vector loop to do.
	CMPQ CX, $4
	JB check

	// R8 = number of bytes covered by whole groups of four values.
	MOVQ CX, R8
	ANDQ $~3, R8 // round the count down to a multiple of 4
	IMULQ $sizeOfValue, R8

	// Load model (words w0 w1 w2) plus the padding word, then lay out the
	// 96-byte pattern w0 w1 w2 w0 | w1 w2 w0 w1 | w2 w0 w1 w2 across Y0-Y2.
	// Y0 is permuted last because it is the source of the other two.
	VMOVDQU model+24(FP), Y0
	VPERMQ $0b01001001, Y0, Y1 // Y1 = w1 w2 w0 w1
	VPERMQ $0b10010010, Y0, Y2 // Y2 = w2 w0 w1 w2
	VPERMQ $0b00100100, Y0, Y0 // Y0 = w0 w1 w2 w0
quads:
	VMOVDQU Y0, 0(AX)(DX*1)
	VMOVDQU Y1, 32(AX)(DX*1)
	VMOVDQU Y2, 64(AX)(DX*1)
	ADDQ $4*sizeOfValue, DX
	CMPQ DX, R8
	JNE quads
	VZEROUPPER
	JMP check
single:
	MOVQ R9, 0(AX)(DX*1)
	MOVQ R10, 8(AX)(DX*1)
	MOVQ R11, 16(AX)(DX*1)
	ADDQ $sizeOfValue, DX
check:
	// Done once the write offset reaches the total byte count.
	CMPQ DX, BX
	JNE single
	RET