//go:build !purego

// Origin: github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/encoding/delta/length_byte_array_amd64.s

#include "funcdata.h"
#include "textflag.h"

// validateLengthValuesAVX2 sums the 32-bit values of lengths and reports
// whether all of them are non-negative.
//
// Inputs of 16 values or more are first processed 16 values at a time with
// AVX2: the values are accumulated into eight 32-bit lanes and OR-ed together
// so that a single sign-bit test detects any negative value. The remaining
// values (fewer than 16) are handled by a scalar loop.
//
// Results:
//   totalLength  sum of the values; only meaningful when ok is true.
//   ok           true when no value had its sign bit set.
//
// NOTE(review): the vector lanes and the scalar tail accumulate with 32-bit
// additions and the reduced vector sum is masked to 31 bits, so very large
// totals wrap silently; callers presumably bound the input — confirm.
//
// Register usage:
//   AX  base pointer of lengths
//   CX  number of values
//   BX  totalLength
//   DX  ok
//   SI  index of the next value
//   DI  end index of the vectorized part, then scratch in the scalar loop
//   R8  sign-bit accumulator
//
// func validateLengthValuesAVX2(lengths []int32) (totalLength int, ok bool)
TEXT ·validateLengthValuesAVX2(SB), NOSPLIT, $0-33
    MOVQ lengths_base+0(FP), AX
    MOVQ lengths_len+8(FP), CX

    XORQ BX, BX // totalLength
    XORQ DX, DX // ok (stays false unless validation succeeds)
    XORQ SI, SI
    XORQ DI, DI
    XORQ R8, R8

    CMPQ CX, $16
    JB test

    // DI = len(lengths) rounded down to a multiple of 16.
    MOVQ CX, DI
    SHRQ $4, DI
    SHLQ $4, DI

    // VEX-encoded XMM writes also clear the upper halves of Y0 and Y1.
    VPXOR X0, X0, X0 // totalLengths
    VPXOR X1, X1, X1 // negative test
loopAVX2:
    VMOVDQU (AX)(SI*4), Y2
    VMOVDQU 32(AX)(SI*4), Y3
    VPADDD Y2, Y0, Y0
    VPADDD Y3, Y0, Y0
    VPOR Y2, Y1, Y1
    VPOR Y3, Y1, Y1
    ADDQ $16, SI
    CMPQ SI, DI
    JNE loopAVX2

    // If any of the 32 bit words has its most significant bit set to 1,
    // then at least one of the values was negative, which must be reported as
    // an error.
    VMOVMSKPS Y1, R8
    CMPQ R8, $0
    JNE negativeAVX2

    // Horizontal sum of the four 32-bit lanes of each 128-bit half...
    VPSRLDQ $4, Y0, Y1
    VPSRLDQ $8, Y0, Y2
    VPSRLDQ $12, Y0, Y3

    VPADDD Y1, Y0, Y0
    VPADDD Y3, Y2, Y2
    VPADDD Y2, Y0, Y0

    // ...then add the two halves together; the total ends up in lane 0.
    VPERM2I128 $1, Y0, Y0, Y1
    VPADDD Y1, Y0, Y0
    VZEROUPPER
    // MOVQ copies lanes 0 and 1; the mask keeps the low 31 bits of lane 0.
    MOVQ X0, BX
    ANDQ $0x7FFFFFFF, BX

    JMP test
loop:
    MOVL (AX)(SI*4), DI
    ADDL DI, BX
    ORL DI, R8
    INCQ SI
test:
    CMPQ SI, CX
    JNE loop
    // R8 is negative iff one of the values seen by the scalar loop was.
    CMPL R8, $0
    JL done
    MOVB $1, DX
done:
    MOVQ BX, totalLength+24(FP)
    MOVB DX, ok+32(FP)
    RET
negativeAVX2:
    // The vector loop found a negative value. Clear the upper halves of the
    // YMM registers before returning (as the success path does) so the caller
    // does not pay AVX/SSE transition penalties, then report (0, false).
    VZEROUPPER
    JMP done

// This function is an optimization of the decodeLengthByteArray using AVX2
// instructions to implement an opportunistic copy strategy which improves
// throughput compared to using runtime.memmove (via Go's copy).
//
// Parquet columns of type BYTE_ARRAY will often hold short strings, rarely
// exceeding a couple hundred bytes in size. Making a function call to
// runtime.memmove for each value results in spending most of the CPU time
// on branching rather than actually copying bytes to the output buffer.
//
// This function works by always assuming it can copy 16 bytes of data between
// the input and outputs, even in the event where a value is shorter than this.
//
// The pointers to the current positions for input and output pointers are
// always adjusted by the right number of bytes so that the next writes
// overwrite any extra bytes that were written in the previous iteration of the
// copy loop.
//
// The throughput of this function is not as good as runtime.memmove for large
// buffers, but it ends up being close to an order of magnitude higher for the
// common case of working with short strings.
//
// For each entry of lengths, the function writes the 4-byte length to dst
// followed by that many bytes taken from src, and returns the total number of
// bytes produced in dst (the sum of 4+length over all entries).
//
// NOTE(review): the unconditional 16-byte move and the 32-byte chunks can read
// past the end of a value in src and write past it in dst; callers presumably
// provide enough padding on both buffers — confirm against the Go wrapper.
//
// Register usage:
//   AX  output position, kept 4 bytes ahead of the slot of the length prefix
//   BX  input position
//   CX  length of the current value
//   DX  pointer to the current entry of lengths
//   DI  pointer to the end of lengths
//   SI  offset within the current value in the long-value copy loop
//
// func decodeLengthByteArrayAVX2(dst, src []byte, lengths []int32) int
TEXT ·decodeLengthByteArrayAVX2(SB), NOSPLIT, $0-80
    MOVQ dst_base+0(FP), AX
    MOVQ src_base+24(FP), BX
    MOVQ lengths_base+48(FP), DX
    MOVQ lengths_len+56(FP), DI

    LEAQ (DX)(DI*4), DI
    LEAQ 4(AX), AX
    XORQ CX, CX
    JMP test
loop:
    // Load the length and store it as the prefix just before the value.
    MOVL (DX), CX
    MOVL CX, -4(AX)
    // First pass moves 16 bytes, this makes it a very fast path for short
    // strings.
    VMOVDQU (BX), X0
    VMOVDQU X0, (AX)
    CMPQ CX, $16
    JA copy
next:
    // Advance the output past the value and the next length prefix, the
    // input past the value, and move on to the next length.
    LEAQ 4(AX)(CX*1), AX
    LEAQ 0(BX)(CX*1), BX
    LEAQ 4(DX), DX
test:
    CMPQ DX, DI
    JNE loop
    // AX runs 4 bytes ahead of the end of the output; compensate to obtain
    // the number of bytes written.
    MOVQ dst_base+0(FP), BX
    SUBQ BX, AX
    SUBQ $4, AX
    MOVQ AX, ret+72(FP)
    VZEROUPPER
    RET
copy:
    // Values longer than 16 bytes enter this loop and move 32 byte chunks
    // which helps improve throughput on larger chunks.
    MOVQ $16, SI
copyLoop32:
    VMOVDQU (BX)(SI*1), Y0
    VMOVDQU Y0, (AX)(SI*1)
    ADDQ $32, SI
    CMPQ SI, CX
    JAE next
    JMP copyLoop32