//go:build !purego

#include "funcdata.h"
#include "textflag.h"

// func validatePrefixAndSuffixLengthValuesAVX2(prefix, suffix []int32, maxLength int) (totalPrefixLength, totalSuffixLength int, ok bool)
//
// Validates the prefix and suffix length arrays of a delta byte array page and
// sums them. Exactly suffix_len elements of both arrays are visited; prefix_len
// is never read, so the caller must pass arrays of equal length.
//
// ok is set to true only when, for every index i:
//   - prefix[i] >= 0 and suffix[i] >= 0
//   - prefix[i] <= prefix[i-1] + suffix[i-1] (the length of the previous value,
//     taken as 0 for i == 0)
// and the sum of all suffix lengths does not exceed maxLength. When ok is
// false the two totals are not meaningful.
//
// Register usage:
//   AX  prefix base        BX  suffix base       CX  element count
//   DX  maxLength          SI  element index     DI  lastValueLength
//   R8  p (scalar)         R9  n (scalar)
//   R10 totalPrefixLength  R11 totalSuffixLength R12 ok
//   R13 element count rounded down to a multiple of 8
TEXT ·validatePrefixAndSuffixLengthValuesAVX2(SB), NOSPLIT, $0-73
    MOVQ prefix_base+0(FP), AX
    MOVQ suffix_base+24(FP), BX
    MOVQ suffix_len+32(FP), CX
    MOVQ maxLength+48(FP), DX

    XORQ SI, SI
    XORQ DI, DI // lastValueLength
    XORQ R8, R8
    XORQ R9, R9
    XORQ R10, R10 // totalPrefixLength
    XORQ R11, R11 // totalSuffixLength
    XORQ R12, R12 // ok

    // Fewer than 8 elements: only the scalar loop runs.
    CMPQ CX, $8
    JB test

    // R13 = CX rounded down to a multiple of 8, the vector loop bound.
    MOVQ CX, R13
    SHRQ $3, R13
    SHLQ $3, R13

    VPXOR X0, X0, X0 // lastValueLengths
    VPXOR X1, X1, X1 // totalPrefixLengths
    VPXOR X2, X2, X2 // totalSuffixLengths
    VPXOR X3, X3, X3 // negative prefix length sentinels
    VPXOR X4, X4, X4 // negative suffix length sentinels
    VPXOR X5, X5, X5 // prefix length overflow sentinels
    // NOTE(review): the permutations below assume ·rotateLeft32 holds the
    // indices [7,0,1,2,3,4,5,6], so that lane i of the result receives lane
    // i-1 of the source and lane 0 receives lane 7. The table is defined
    // outside of this file; confirm against its definition.
    VMOVDQU ·rotateLeft32(SB), Y6

loopAVX2:
    VMOVDQU (AX)(SI*4), Y7 // p
    VMOVDQU (BX)(SI*4), Y8 // n

    // Eight partial sums per array, reduced to scalars after the loop.
    VPADDD Y7, Y1, Y1
    VPADDD Y8, Y2, Y2

    // OR-ing every value into the sentinels keeps the sign bit of any
    // negative length that was seen.
    VPOR Y7, Y3, Y3
    VPOR Y8, Y4, Y4

    // Build the vector of "previous value length" for each lane: lanes 1..7
    // take p+n of the preceding lane of this block, lane 0 takes p+n of the
    // last lane of the previous block (zero on the first iteration). The
    // previous code blended the unrotated p+n into lanes 1..7, which compared
    // each prefix against its own value length and never failed.
    VPADDD Y7, Y8, Y9 // p + n
    VPERMD Y9, Y6, Y10 // current lengths, rotated by one lane
    VPERMD Y0, Y6, Y11 // previous block lengths, rotated by one lane
    VPBLENDD $1, Y11, Y10, Y10 // lane 0 from the previous block
    VPCMPGTD Y10, Y7, Y10 // p > lastValueLength ?
    VPOR Y10, Y5, Y5

    VMOVDQU Y9, Y0
    ADDQ $8, SI
    CMPQ SI, R13
    JNE loopAVX2

    // If any of the sentinel values has its most significant bit set then one
    // of the values was negative or one of the prefixes was greater than the
    // length of the previous value, return false.
    VPOR Y4, Y3, Y3
    VPOR Y5, Y3, Y3
    VMOVMSKPS Y3, R13
    CMPQ R13, $0
    JNE done

    // We computed 8 sums in parallel for the prefix and suffix arrays, they
    // need to be accumulated into single values, which is what these reduction
    // steps do.
    VPSRLDQ $4, Y1, Y5
    VPSRLDQ $8, Y1, Y6
    VPSRLDQ $12, Y1, Y7
    VPADDD Y5, Y1, Y1
    VPADDD Y6, Y1, Y1
    VPADDD Y7, Y1, Y1
    VPERM2I128 $1, Y1, Y1, Y0
    VPADDD Y0, Y1, Y1
    MOVQ X1, R10
    ANDQ $0x7FFFFFFF, R10 // keep the low 31 bits of the first 32-bit lane

    VPSRLDQ $4, Y2, Y5
    VPSRLDQ $8, Y2, Y6
    VPSRLDQ $12, Y2, Y7
    VPADDD Y5, Y2, Y2
    VPADDD Y6, Y2, Y2
    VPADDD Y7, Y2, Y2
    VPERM2I128 $1, Y2, Y2, Y0
    VPADDD Y0, Y2, Y2
    MOVQ X2, R11
    ANDQ $0x7FFFFFFF, R11 // keep the low 31 bits of the first 32-bit lane

    // Seed lastValueLength for the scalar tail with the length of the last
    // value handled by the vector loop (SI >= 8 here, and both values were
    // verified to be non-negative above). Leaving it at zero rejected every
    // tail element with a non-zero prefix.
    MOVLQSX -4(AX)(SI*4), DI
    MOVLQSX -4(BX)(SI*4), R8
    ADDQ R8, DI

    JMP test
loop:
    MOVLQSX (AX)(SI*4), R8
    MOVLQSX (BX)(SI*4), R9

    CMPQ R8, $0 // p < 0 ?
    JL done

    CMPQ R9, $0 // n < 0 ?
    JL done

    CMPQ R8, DI // p > lastValueLength ?
    JG done

    ADDQ R8, R10
    ADDQ R9, R11
    // lastValueLength = p + n. It must be replaced, not accumulated: a prefix
    // can only reference bytes of the immediately preceding value.
    LEAQ (R8)(R9*1), DI

    INCQ SI
test:
    CMPQ SI, CX
    JNE loop

    CMPQ R11, DX // totalSuffixLength > maxLength ?
    JG done

    MOVB $1, R12
done:
    MOVQ R10, totalPrefixLength+56(FP)
    MOVQ R11, totalSuffixLength+64(FP)
    MOVB R12, ok+72(FP)
    // Clear the upper halves of the YMM registers before returning, as the
    // other routines of this file do.
    VZEROUPPER
    RET

// func decodeByteArrayAVX2(dst, src []byte, prefix, suffix []int32) int
//
// Reconstructs suffix_len values into dst. Each value is written as a 32-bit
// length (prefix + suffix) followed by the first `prefix` bytes of the
// previously written value and then `suffix` bytes read from src. The return
// value is the number of bytes written to dst, length headers included.
//
// NOTE(review): no bounds are checked here and data is moved in whole 16 or
// 32 byte vectors, so loads and stores can touch bytes past the exact prefix
// and suffix lengths. Presumably callers validate the lengths first and
// over-allocate dst and src; confirm at the call site.
//
// Register usage:
//   AX  dst write cursor (start of the current value's bytes)
//   BX  src read cursor     CX  prefix base    DX  suffix base
//   DI  value count         SI  value index
//   R8  prefix length       R9  suffix length  R11 prefix + suffix
//   R10 start of the previous value in dst
//   R12 byte offset within the copy loops
//   R13 destination of the suffix bytes
TEXT ·decodeByteArrayAVX2(SB), NOSPLIT, $0-104
    MOVQ dst_base+0(FP), AX
    MOVQ src_base+24(FP), BX
    MOVQ prefix_base+48(FP), CX
    MOVQ suffix_base+72(FP), DX
    MOVQ suffix_len+80(FP), DI

    ADDQ $4, AX // skip the length header of the first value
    XORQ SI, SI
    XORQ R8, R8
    XORQ R9, R9
    MOVQ AX, R10 // last value

    JMP test
loop:
    MOVLQZX (CX)(SI*4), R8 // prefix length
    MOVLQZX (DX)(SI*4), R9 // suffix length
    MOVQ R8, R11
    ADDQ R9, R11
    MOVL R11, -4(AX) // length header of the current value
prefix:
    // Copy the first 16 bytes unconditionally; longer prefixes continue in
    // copyPrefix.
    VMOVDQU (R10), X0
    VMOVDQU X0, (AX)
    CMPQ R8, $16
    JA copyPrefix
suffix:
    // Same scheme for the suffix, written right after the prefix bytes.
    VMOVDQU (BX), X1
    VMOVDQU X1, (AX)(R8*1)
    CMPQ R9, $16
    JA copySuffix
next:
    MOVQ AX, R10
    LEAQ 4(AX)(R11*1), AX // advance past the value and the next header
    LEAQ 0(BX)(R9*1), BX
    INCQ SI
test:
    CMPQ SI, DI
    JNE loop
    MOVQ dst_base+0(FP), BX
    SUBQ BX, AX
    SUBQ $4, AX // the cursor sits 4 bytes past the end of the output
    MOVQ AX, ret+96(FP)
    VZEROUPPER
    RET
copyPrefix:
    MOVQ $16, R12 // the first 16 bytes were already copied
copyPrefixLoop:
    VMOVDQU (R10)(R12*1), Y0
    VMOVDQU Y0, (AX)(R12*1)
    ADDQ $32, R12
    CMPQ R12, R8
    JB copyPrefixLoop
    JMP suffix
copySuffix:
    MOVQ $16, R12 // the first 16 bytes were already copied
    LEAQ (AX)(R8*1), R13
copySuffixLoop:
    VMOVDQU (BX)(R12*1), Y1
    VMOVDQU Y1, (R13)(R12*1)
    ADDQ $32, R12
    CMPQ R12, R9
    JB copySuffixLoop
    JMP next

// func decodeFixedLenByteArrayAVX2(dst, src []byte, prefix, suffix []int32) int
//
// Same reconstruction as decodeByteArrayAVX2 but without length headers: each
// value is written to dst as the first `prefix` bytes of the previously
// written value followed by `suffix` bytes read from src. The return value is
// the number of bytes written to dst.
//
// NOTE(review): no bounds are checked here and data is moved in whole 32 byte
// vectors, so loads and stores can touch bytes past the exact prefix and
// suffix lengths. Presumably callers validate the lengths first and
// over-allocate dst and src; confirm at the call site.
//
// Register usage:
//   AX  dst write cursor    BX  src read cursor
//   CX  prefix base         DX  suffix base
//   DI  value count         SI  value index
//   R8  prefix length       R9  suffix length
//   R10 start of the previous value in dst
//   R12 byte offset within the copy loops
//   R13 destination of the suffix bytes
TEXT ·decodeFixedLenByteArrayAVX2(SB), NOSPLIT, $0-104
    MOVQ dst_base+0(FP), AX
    MOVQ src_base+24(FP), BX
    MOVQ prefix_base+48(FP), CX
    MOVQ suffix_base+72(FP), DX
    MOVQ suffix_len+80(FP), DI

    XORQ SI, SI
    XORQ R8, R8
    XORQ R9, R9
    MOVQ AX, R10 // last value

    JMP test
loop:
    MOVLQZX (CX)(SI*4), R8 // prefix length
    MOVLQZX (DX)(SI*4), R9 // suffix length
prefix:
    // Copy the first 32 bytes unconditionally; longer prefixes continue in
    // copyPrefix.
    VMOVDQU (R10), Y0
    VMOVDQU Y0, (AX)
    CMPQ R8, $32
    JA copyPrefix
suffix:
    // Same scheme for the suffix, written right after the prefix bytes.
    VMOVDQU (BX), Y1
    VMOVDQU Y1, (AX)(R8*1)
    CMPQ R9, $32
    JA copySuffix
next:
    MOVQ AX, R10
    ADDQ R9, R8 // R8 = length of the value just written
    LEAQ (AX)(R8*1), AX
    LEAQ (BX)(R9*1), BX
    INCQ SI
test:
    CMPQ SI, DI
    JNE loop
    MOVQ dst_base+0(FP), BX
    SUBQ BX, AX
    MOVQ AX, ret+96(FP)
    VZEROUPPER
    RET
copyPrefix:
    MOVQ $32, R12 // the first 32 bytes were already copied
copyPrefixLoop:
    VMOVDQU (R10)(R12*1), Y0
    VMOVDQU Y0, (AX)(R12*1)
    ADDQ $32, R12
    CMPQ R12, R8
    JB copyPrefixLoop
    JMP suffix
copySuffix:
    MOVQ $32, R12 // the first 32 bytes were already copied
    LEAQ (AX)(R8*1), R13
copySuffixLoop:
    VMOVDQU (BX)(R12*1), Y1
    VMOVDQU Y1, (R13)(R12*1)
    ADDQ $32, R12
    CMPQ R12, R9
    JB copySuffixLoop
    JMP next

// func decodeFixedLenByteArrayAVX2x128bits(dst, src []byte, prefix, suffix []int32) int
//
// Variant of decodeFixedLenByteArrayAVX2 that moves exactly 16 bytes per value
// and keeps the previously written value in X0 instead of re-reading it
// through a pointer. The return value is the number of bytes written to dst.
//
// NOTE(review): the routine never copies more than 16 bytes of prefix or
// suffix, so it presumably expects every value to be 16 bytes long, as the
// name suggests; confirm at the call site. As in the other decoders, no
// bounds are checked here.
//
// Register usage:
//   AX  dst write cursor    BX  src read cursor
//   CX  prefix base         DX  suffix base
//   DI  value count         SI  value index
//   R8  prefix length       R9  suffix length
//   X0  previously written value (zero before the first one)
TEXT ·decodeFixedLenByteArrayAVX2x128bits(SB), NOSPLIT, $0-104
    MOVQ dst_base+0(FP), AX
    MOVQ src_base+24(FP), BX
    MOVQ prefix_base+48(FP), CX
    MOVQ suffix_base+72(FP), DX
    MOVQ suffix_len+80(FP), DI

    XORQ SI, SI
    XORQ R8, R8
    XORQ R9, R9
    VPXOR X0, X0, X0

    JMP test
loop:
    MOVLQZX (CX)(SI*4), R8 // prefix length
    MOVLQZX (DX)(SI*4), R9 // suffix length

    // Write the previous value, overlay the suffix bytes after the prefix,
    // then reload the result so it becomes the previous value of the next
    // iteration.
    VMOVDQU (BX), X1
    VMOVDQU X0, (AX)
    VMOVDQU X1, (AX)(R8*1)
    VMOVDQU (AX), X0

    ADDQ R9, R8 // R8 = length of the value just written
    LEAQ (AX)(R8*1), AX
    LEAQ (BX)(R9*1), BX
    INCQ SI
test:
    CMPQ SI, DI
    JNE loop
    MOVQ dst_base+0(FP), BX
    SUBQ BX, AX
    MOVQ AX, ret+96(FP)
    VZEROUPPER
    RET