//go:build !purego

#include "textflag.h"

// func encodeByteArrayLengths(lengths []int32, offsets []uint32)
//
// Converts consecutive offsets into lengths:
//
//	lengths[i] = offsets[i+1] - offsets[i]   for i in [0, len(lengths))
//
// Only len(lengths) is consulted; the length of offsets is never read. The
// routine loads offsets[len(lengths)], so offsets presumably has to hold at
// least len(lengths)+1 elements (not checked here; confirm against callers).
//
// Register usage:
//	AX = base of lengths      BX = base of offsets
//	CX = len(lengths)         SI = current index i
//	DX = len(lengths) rounded down to a multiple of 4
TEXT ·encodeByteArrayLengths(SB), NOSPLIT, $0-48
	MOVQ lengths_base+0(FP), AX
	MOVQ lengths_len+8(FP), CX
	MOVQ offsets_base+24(FP), BX
	XORQ SI, SI

	// Fewer than 4 elements: skip the vector loop and let the scalar loop
	// handle everything.
	CMPQ CX, $4
	JB test

	// DX = CX with the two low bits cleared, i.e. the number of elements
	// processed 4 at a time.
	MOVQ CX, DX
	SHRQ $2, DX
	SHLQ $2, DX

loopSSE2:
	// X0 = offsets[i..i+3], X1 = offsets[i+1..i+4]; X1 -= X0 yields four
	// lengths at once.
	MOVOU 0(BX)(SI*4), X0
	MOVOU 4(BX)(SI*4), X1
	PSUBL X0, X1
	MOVOU X1, (AX)(SI*4)
	ADDQ $4, SI
	CMPQ SI, DX
	JNE loopSSE2
	JMP test
loop:
	// Scalar tail: one length per iteration.
	MOVL 0(BX)(SI*4), R8
	MOVL 4(BX)(SI*4), R9
	SUBL R8, R9
	MOVL R9, (AX)(SI*4)
	INCQ SI
test:
	CMPQ SI, CX
	JNE loop
	RET

// func decodeByteArrayLengths(offsets []uint32, lengths []int32) (lastOffset uint32, invalidLength int32)
//
// Computes the running (prefix) sum of lengths into offsets:
//
//	offsets[0]   = 0
//	offsets[i+1] = offsets[i] + lengths[i]   (32-bit addition)
//
// Only len(lengths) is consulted; the routine stores to offsets[len(lengths)],
// so offsets presumably has to hold at least len(lengths)+1 elements (not
// checked here; confirm against callers).
//
// Results:
//	lastOffset    - the final running sum, also stored at offsets[len(lengths)].
//	invalidLength - 0 when no negative length was seen. When a negative length
//	                falls in the part handled by the vector loop the value is
//	                -1; any negative length seen by the scalar tail loop
//	                overwrites it with that length's own value.
//
// Register usage:
//	AX = base of offsets      BX = base of lengths
//	CX = len(lengths)         SI = current index i
//	DX = running sum          DI = invalidLength
//	R8 = len(lengths) rounded down to a multiple of 4 (vector loop bound),
//	     later reused as scratch
TEXT ·decodeByteArrayLengths(SB), NOSPLIT, $0-56
	MOVQ offsets_base+0(FP), AX
	MOVQ lengths_base+24(FP), BX
	MOVQ lengths_len+32(FP), CX

	XORQ DX, DX // lastOffset
	XORQ DI, DI // invalidLength
	XORQ SI, SI

	// Fewer than 4 elements: skip the vector loop entirely.
	CMPQ CX, $4
	JL test

	// R8 = CX with the two low bits cleared.
	MOVQ CX, R8
	SHRQ $2, R8
	SHLQ $2, R8

	// offsets[0] = 0; X0 holds the running sum broadcast to all four lanes,
	// X3 accumulates the OR of every length processed by the vector loop.
	MOVL $0, (AX)
	PXOR X0, X0
	PXOR X3, X3
	// This loop computes the prefix sum of the lengths array in order to
	// generate values of the offsets array.
	//
	// We stick to SSE2 to keep the code simple (the Go compiler appears to
	// assume that SSE2 must be supported on AMD64) which already yields most
	// of the performance that we could get on this subroutine if we were using
	// AVX2.
	//
	// The X3 register also accumulates a mask of all length values, which is
	// checked after the loop to determine whether any of the lengths were
	// negative.
	//
	// The following article contains a description of the prefix sum algorithm
	// used in this function: https://en.algorithmica.org/hpc/algorithms/prefix/
loopSSE2:
	MOVOU (BX)(SI*4), X1
	POR X1, X3

	// With X1 = [a, b, c, d]: add a copy shifted up by one lane, giving
	// [a, a+b, b+c, c+d].
	MOVOA X1, X2
	PSLLDQ $4, X2
	PADDD X2, X1

	// Add a copy shifted up by two lanes, giving the in-register prefix sum
	// [a, a+b, a+b+c, a+b+c+d].
	MOVOA X1, X2
	PSLLDQ $8, X2
	PADDD X2, X1

	// Add the running sum carried over from previous iterations and store the
	// four results at offsets[i+1..i+4].
	PADDD X1, X0
	MOVOU X0, 4(AX)(SI*4)

	// Broadcast the highest lane (the newest running sum) to all lanes so it
	// serves as the carry for the next iteration.
	PSHUFD $0b11111111, X0, X0

	ADDQ $4, SI
	CMPQ SI, R8
	JNE loopSSE2

	// If any of the most significant bits of double words in the X3 register
	// are set to 1, it indicates that one of the lengths was negative and
	// therefore the prefix sum is invalid.
	//
	// TODO: we report the invalid length as -1, effectively losing the original
	// value due to the aggregation within X3. This is something that we might
	// want to address in the future to provide better error reporting.
	MOVMSKPS X3, R8
	MOVL $-1, R9
	CMPL R8, $0
	CMOVLNE R9, DI

	// Carry the running sum into DX for the scalar tail; only its low 32 bits
	// are used from here on (MOVL/ADDL below).
	MOVQ X0, DX
	JMP test
loop:
	// Scalar tail: offsets[i] = running sum, then add lengths[i].
	MOVL (BX)(SI*4), R8
	MOVL DX, (AX)(SI*4)
	ADDL R8, DX
	// The conditional move uses the flags of this CMPL (not of the ADDL
	// above): a negative length is recorded in DI.
	CMPL R8, $0
	CMOVLLT R8, DI
	INCQ SI
test:
	CMPQ SI, CX
	JNE loop

	// Store the final running sum one past the last length and return it.
	MOVL DX, (AX)(SI*4)
	MOVL DX, lastOffset+48(FP)
	MOVL DI, invalidLength+52(FP)
	RET