// Source: github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/encoding/delta/byte_array_amd64.s

//go:build !purego

#include "funcdata.h"
#include "textflag.h"

// func validatePrefixAndSuffixLengthValuesAVX2(prefix, suffix []int32, maxLength int) (totalPrefixLength, totalSuffixLength int, ok bool)
//
// Checks a sequence of (prefix length, suffix length) pairs and sums them.
// ok is set to true only when all of the following hold:
//
//   - no prefix or suffix length is negative,
//   - every prefix length is at most the length (prefix + suffix) of the
//     value that precedes it, the first value being preceded by a value of
//     length zero,
//   - the total suffix length does not exceed maxLength.
//
// When ok is false the returned totals must not be relied upon: they are zero
// if the vectorized pass rejected the input and partial sums otherwise.
//
// The number of values is taken from len(suffix); prefix is read with the
// same indexes (presumably callers pass slices of equal length - confirm).
//
// Inputs are processed 8 values at a time while at least 8 remain, then one
// at a time. The totals produced by the vectorized pass are accumulated in
// 32-bit lanes and masked to 31 bits.
TEXT ·validatePrefixAndSuffixLengthValuesAVX2(SB), NOSPLIT, $0-73
    MOVQ prefix_base+0(FP), AX
    MOVQ suffix_base+24(FP), BX
    MOVQ suffix_len+32(FP), CX
    MOVQ maxLength+48(FP), DX

    XORQ SI, SI   // index of the next value to validate
    XORQ DI, DI   // lastValueLength
    XORQ R8, R8
    XORQ R9, R9
    XORQ R10, R10 // totalPrefixLength
    XORQ R11, R11 // totalSuffixLength
    XORQ R12, R12 // ok

    CMPQ CX, $8
    JB test

    // R13 = number of values rounded down to a multiple of 8.
    MOVQ CX, R13
    SHRQ $3, R13
    SHLQ $3, R13

    // VEX-encoded 128-bit operations clear the upper half of the matching
    // YMM register, so these zero the full 256 bits.
    VPXOR X0, X0, X0 // lastValueLengths
    VPXOR X1, X1, X1 // totalPrefixLengths
    VPXOR X2, X2, X2 // totalSuffixLengths
    VPXOR X3, X3, X3 // negative prefix length sentinels
    VPXOR X4, X4, X4 // negative suffix length sentinels
    VPXOR X5, X5, X5 // prefix length overflow sentinels
    // NOTE(review): ·rotateLeft32 is defined outside this file; the code below
    // assumes it holds the permutation indexes {7,0,1,2,3,4,5,6} so that lane i
    // receives lane i-1 and lane 0 receives lane 7 - confirm.
    VMOVDQU ·rotateLeft32(SB), Y6

loopAVX2:
    VMOVDQU (AX)(SI*4), Y7 // p
    VMOVDQU (BX)(SI*4), Y8 // n

    VPADDD Y7, Y1, Y1
    VPADDD Y8, Y2, Y2

    // OR-ing every value together leaves the sign bit set in a lane if any
    // value seen in that lane was negative.
    VPOR Y7, Y3, Y3
    VPOR Y8, Y4, Y4

    // Build the vector of "previous value length" for each lane: lanes 1..7
    // take the length of the preceding value of this batch, lane 0 takes the
    // length of the last value of the previous batch (zero on the first
    // batch). Each prefix must be compared against its predecessor, not
    // against the length of its own value.
    VPADDD Y7, Y8, Y9          // p + n
    VPERMD Y9, Y6, Y10         // current lengths shifted up by one lane
    VPERMD Y0, Y6, Y11         // lane 0 = last length of the previous batch
    VPBLENDD $1, Y11, Y10, Y10 // lane 0 from Y11, lanes 1..7 from Y10
    VPCMPGTD Y10, Y7, Y10      // p > previous value length ?
    VPOR Y10, Y5, Y5

    VMOVDQU Y9, Y0
    ADDQ $8, SI
    CMPQ SI, R13
    JNE loopAVX2

    // If any of the sentinel values has its most significant bit set then one
    // of the values was negative or one of the prefixes was greater than the
    // length of the previous value, return false.
    VPOR Y4, Y3, Y3
    VPOR Y5, Y3, Y3
    VMOVMSKPS Y3, R13
    CMPQ R13, $0
    JNE invalidAVX2

    // Carry the length of the last vectorized value over to the scalar loop
    // so the first remaining prefix is checked against its real predecessor.
    // The 32-bit extract zero-extends into DI, which yields the exact sum of
    // two non-negative int32 values.
    VEXTRACTI128 $1, Y0, X7
    VPEXTRD $3, X7, DI

    // We computed 8 sums in parallel for the prefix and suffix arrays, they
    // need to be accumulated into single values, which is what these reduction
    // steps do.
    VPSRLDQ $4, Y1, Y5
    VPSRLDQ $8, Y1, Y6
    VPSRLDQ $12, Y1, Y7
    VPADDD Y5, Y1, Y1
    VPADDD Y6, Y1, Y1
    VPADDD Y7, Y1, Y1
    VPERM2I128 $1, Y1, Y1, Y0
    VPADDD Y0, Y1, Y1
    // MOVQ copies 64 bits; the mask drops the second lane and the sign bit.
    MOVQ X1, R10
    ANDQ $0x7FFFFFFF, R10

    VPSRLDQ $4, Y2, Y5
    VPSRLDQ $8, Y2, Y6
    VPSRLDQ $12, Y2, Y7
    VPADDD Y5, Y2, Y2
    VPADDD Y6, Y2, Y2
    VPADDD Y7, Y2, Y2
    VPERM2I128 $1, Y2, Y2, Y0
    VPADDD Y0, Y2, Y2
    MOVQ X2, R11
    ANDQ $0x7FFFFFFF, R11

    // Clear the upper halves of the YMM registers before leaving the
    // vectorized section, as the decode routines of this file do.
    VZEROUPPER
    JMP test
loop:
    MOVLQSX (AX)(SI*4), R8
    MOVLQSX (BX)(SI*4), R9

    CMPQ R8, $0 // p < 0 ?
    JL done

    CMPQ R9, $0 // n < 0 ?
    JL done

    CMPQ R8, DI // p > lastValueLength ?
    JG done

    ADDQ R8, R10
    ADDQ R9, R11
    LEAQ (R8)(R9*1), DI // lastValueLength = p + n

    INCQ SI
test:
    CMPQ SI, CX
    JNE loop

    CMPQ R11, DX // totalSuffixLength > maxLength ?
    JG done

    MOVB $1, R12
done:
    MOVQ R10, totalPrefixLength+56(FP)
    MOVQ R11, totalSuffixLength+64(FP)
    MOVB R12, ok+72(FP)
    RET
invalidAVX2:
    VZEROUPPER
    JMP done

// func decodeByteArrayOffsets(offsets []uint32, prefix, suffix []int32)
//
// Writes the running sum of value lengths (prefix + suffix) to offsets:
// offsets[i] receives the sum of the lengths of values 0..i-1, and one final
// entry at index len(suffix) receives the total. len(suffix)+1 entries are
// written in total; the sums are computed with 32-bit arithmetic.
TEXT ·decodeByteArrayOffsets(SB), NOSPLIT, $0-72
    MOVQ offsets_base+0(FP), AX
    MOVQ prefix_base+24(FP), BX
    MOVQ suffix_base+48(FP), CX
    MOVQ suffix_len+56(FP), DX

    XORQ SI, SI   // index of the current value
    XORQ R10, R10 // running offset
    JMP test
loop:
    MOVL (BX)(SI*4), R8
    MOVL (CX)(SI*4), R9
    MOVL R10, (AX)(SI*4)
    ADDL R8, R10
    ADDL R9, R10
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
    MOVL R10, (AX)(SI*4) // trailing entry: total length of all values
    RET

// func decodeByteArrayAVX2(dst, src []byte, prefix, suffix []int32) int
//
// Rebuilds len(suffix) values back to back in dst. Each value is made of the
// first prefix[i] bytes of the value written just before it, followed by the
// next suffix[i] bytes of src. Returns the number of bytes written to dst.
//
// Bytes are moved in 32-byte blocks, and one block is always loaded and
// stored for both the prefix and the suffix regardless of their lengths, so
// reads and writes may extend up to 31 bytes past the data actually needed
// (presumably callers provide padded buffers - confirm). Lengths are loaded
// with zero extension and are not validated here.
TEXT ·decodeByteArrayAVX2(SB), NOSPLIT, $0-104
    MOVQ dst_base+0(FP), AX
    MOVQ src_base+24(FP), BX
    MOVQ prefix_base+48(FP), CX
    MOVQ suffix_base+72(FP), DX
    MOVQ suffix_len+80(FP), DI

    XORQ SI, SI
    XORQ R8, R8
    XORQ R9, R9
    MOVQ AX, R10 // last value

    JMP test
loop:
    MOVLQZX (CX)(SI*4), R8 // prefix length
    MOVLQZX (DX)(SI*4), R9 // suffix length
prefix:
    // First 32 bytes of the prefix; longer prefixes continue in copyPrefix.
    VMOVDQU (R10), Y0
    VMOVDQU Y0, (AX)
    CMPQ R8, $32
    JA copyPrefix
suffix:
    // First 32 bytes of the suffix, stored right after the prefix; this
    // overwrites whatever the prefix copy wrote past prefix length.
    VMOVDQU (BX), Y1
    VMOVDQU Y1, (AX)(R8*1)
    CMPQ R9, $32
    JA copySuffix
next:
    MOVQ AX, R10 // the value just written becomes the last value
    ADDQ R9, R8
    LEAQ (AX)(R8*1), AX
    LEAQ (BX)(R9*1), BX
    INCQ SI
test:
    CMPQ SI, DI
    JNE loop
    MOVQ dst_base+0(FP), BX
    SUBQ BX, AX
    MOVQ AX, ret+96(FP)
    VZEROUPPER
    RET
copyPrefix:
    MOVQ $32, R12 // the first block was already copied
copyPrefixLoop:
    VMOVDQU (R10)(R12*1), Y0
    VMOVDQU Y0, (AX)(R12*1)
    ADDQ $32, R12
    CMPQ R12, R8
    JB copyPrefixLoop
    JMP suffix
copySuffix:
    MOVQ $32, R12 // the first block was already copied
    LEAQ (AX)(R8*1), R13
copySuffixLoop:
    VMOVDQU (BX)(R12*1), Y1
    VMOVDQU Y1, (R13)(R12*1)
    ADDQ $32, R12
    CMPQ R12, R9
    JB copySuffixLoop
    JMP next

// func decodeByteArrayAVX2x128bits(dst, src []byte, prefix, suffix []int32) int
//
// Variant of decodeByteArrayAVX2 that moves exactly one 16-byte block for the
// prefix and one for the suffix of each value, with no loop for longer
// lengths (presumably only used when no value exceeds 16 bytes - confirm).
// The leading 16 bytes of the value written last are kept in X0 instead of
// being reloaded through a pointer. Returns the number of bytes written to
// dst.
TEXT ·decodeByteArrayAVX2x128bits(SB), NOSPLIT, $0-104
    MOVQ dst_base+0(FP), AX
    MOVQ src_base+24(FP), BX
    MOVQ prefix_base+48(FP), CX
    MOVQ suffix_base+72(FP), DX
    MOVQ suffix_len+80(FP), DI

    XORQ SI, SI
    XORQ R8, R8
    XORQ R9, R9
    VPXOR X0, X0, X0 // leading 16 bytes of the last value

    JMP test
loop:
    MOVLQZX (CX)(SI*4), R8 // prefix length
    MOVLQZX (DX)(SI*4), R9 // suffix length

    VMOVDQU (BX), X1
    VMOVDQU X0, (AX)       // prefix, taken from the last value
    VMOVDQU X1, (AX)(R8*1) // suffix, stored right after the prefix
    VMOVDQU (AX), X0       // reload the assembled value for the next round

    ADDQ R9, R8
    LEAQ (AX)(R8*1), AX
    LEAQ (BX)(R9*1), BX
    INCQ SI
test:
    CMPQ SI, DI
    JNE loop
    MOVQ dst_base+0(FP), BX
    SUBQ BX, AX
    MOVQ AX, ret+96(FP)
    VZEROUPPER
    RET