github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/encoding/delta/byte_array_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "funcdata.h"
     4  #include "textflag.h"
     5  
     // func validatePrefixAndSuffixLengthValuesAVX2(prefix, suffix []int32, maxLength int) (totalPrefixLength, totalSuffixLength int, ok bool)
     //
     // Checks that every prefix and suffix length is non-negative and that each
     // prefix length does not exceed the length (prefix + suffix) of the value
     // that precedes it, and computes the sum of each array. The number of
     // elements processed is suffix_len; prefix is read at the same indexes.
     //
     // ok is set to true only when all checks pass and totalSuffixLength is not
     // greater than maxLength. When ok is false the returned totals are partial
     // (zero if the failure was detected by the vector loop) and must not be
     // relied upon.
     //
     // Elements are processed 8 at a time with AVX2 while at least 8 remain, the
     // rest is handled by a scalar loop. The vector sums are accumulated in
     // 32-bit lanes and the reduced totals are masked to 31 bits.
     //
     // Register use:
     //   AX  = prefix base          BX  = suffix base
     //   CX  = element count        DX  = maxLength
     //   SI  = index                DI  = length of the previous value
     //   R8  = p (scalar)           R9  = n (scalar)
     //   R10 = totalPrefixLength    R11 = totalSuffixLength
     //   R12 = ok                   R13 = vector loop bound / sentinel mask
     TEXT ·validatePrefixAndSuffixLengthValuesAVX2(SB), NOSPLIT, $0-73
         MOVQ prefix_base+0(FP), AX
         MOVQ suffix_base+24(FP), BX
         MOVQ suffix_len+32(FP), CX
         MOVQ maxLength+48(FP), DX

         XORQ SI, SI
         XORQ DI, DI // lastValueLength
         XORQ R8, R8
         XORQ R9, R9
         XORQ R10, R10 // totalPrefixLength
         XORQ R11, R11 // totalSuffixLength
         XORQ R12, R12 // ok

         CMPQ CX, $8
         JB test

         // R13 = element count rounded down to a multiple of 8.
         MOVQ CX, R13
         SHRQ $3, R13
         SHLQ $3, R13

         VPXOR X0, X0, X0 // lastValueLengths (p + n of the previous block)
         VPXOR X1, X1, X1 // totalPrefixLengths
         VPXOR X2, X2, X2 // totalSuffixLengths
         VPXOR X3, X3, X3 // negative prefix length sentinels
         VPXOR X4, X4, X4 // negative suffix length sentinels
         VPXOR X5, X5, X5 // prefix length overflow sentinels
         // NOTE(review): ·rotateLeft32 is defined outside this file; the code below
         // assumes it holds the permutation indexes {7,0,1,2,3,4,5,6} so that lane i
         // of the permuted vector receives lane i-1 of the source — confirm.
         VMOVDQU ·rotateLeft32(SB), Y6

     loopAVX2:
         VMOVDQU (AX)(SI*4), Y7 // p
         VMOVDQU (BX)(SI*4), Y8 // n

         VPADDD Y7, Y1, Y1
         VPADDD Y8, Y2, Y2

         // OR-ing the raw values accumulates the sign bit of any negative length.
         VPOR Y7, Y3, Y3
         VPOR Y8, Y4, Y4

         // Build the vector of "previous value lengths" for this block: lane 0 is
         // the length of the last value of the previous block, lanes 1..7 are the
         // lengths of values 0..6 of the current block. Both inputs have to be
         // rotated before blending, otherwise lanes 1..7 would compare each prefix
         // against the length of its own value, which can never fail.
         VPADDD Y7, Y8, Y9 // p + n
         VPERMD Y0, Y6, Y10          // lane 0 = last length of the previous block
         VPERMD Y9, Y6, Y11          // lane i = current length i-1 (i >= 1)
         VPBLENDD $1, Y10, Y11, Y10  // lane 0 from Y10, lanes 1..7 from Y11
         VPCMPGTD Y10, Y7, Y10       // p > previous value length ?
         VPOR Y10, Y5, Y5

         VMOVDQU Y9, Y0
         ADDQ $8, SI
         CMPQ SI, R13
         JNE loopAVX2

         // If any of the sentinel values has its most significant bit set then one
         // of the values was negative or one of the prefixes was greater than the
         // length of the previous value, return false.
         VPOR Y4, Y3, Y3
         VPOR Y5, Y3, Y3
         VMOVMSKPS Y3, R13
         CMPQ R13, $0
         JNE done

         // Seed the scalar loop with the length of the last value handled by the
         // vector loop (lane 7 of Y0); without this the tail would be validated
         // against a previous length of zero. The 32-bit extract zero-extends, so
         // DI holds the exact sum even if p + n exceeded the int32 range.
         VEXTRACTI128 $1, Y0, X9
         VPEXTRD $3, X9, DI

         // We computed 8 sums in parallel for the prefix and suffix arrays, they
         // need to be accumulated into single values, which is what these reduction
         // steps do.
         VPSRLDQ $4, Y1, Y5
         VPSRLDQ $8, Y1, Y6
         VPSRLDQ $12, Y1, Y7
         VPADDD Y5, Y1, Y1
         VPADDD Y6, Y1, Y1
         VPADDD Y7, Y1, Y1
         VPERM2I128 $1, Y1, Y1, Y0
         VPADDD Y0, Y1, Y1
         MOVQ X1, R10
         ANDQ $0x7FFFFFFF, R10

         VPSRLDQ $4, Y2, Y5
         VPSRLDQ $8, Y2, Y6
         VPSRLDQ $12, Y2, Y7
         VPADDD Y5, Y2, Y2
         VPADDD Y6, Y2, Y2
         VPADDD Y7, Y2, Y2
         VPERM2I128 $1, Y2, Y2, Y0
         VPADDD Y0, Y2, Y2
         MOVQ X2, R11
         ANDQ $0x7FFFFFFF, R11

         JMP test
     loop:
         MOVLQSX (AX)(SI*4), R8
         MOVLQSX (BX)(SI*4), R9

         CMPQ R8, $0 // p < 0 ?
         JL done

         CMPQ R9, $0 // n < 0 ?
         JL done

         CMPQ R8, DI // p > lastValueLength ?
         JG done

         ADDQ R8, R10
         ADDQ R9, R11
         LEAQ (R8)(R9*1), DI // lastValueLength = p + n (not a running total)

         INCQ SI
     test:
         CMPQ SI, CX
         JNE loop

         CMPQ R11, DX // totalSuffixLength > maxLength ?
         JG done

         MOVB $1, R12
     done:
         MOVQ R10, totalPrefixLength+56(FP)
         MOVQ R11, totalSuffixLength+64(FP)
         MOVB R12, ok+72(FP)
         // Clear the upper halves of the YMM registers before returning, as the
         // other AVX2 routines of this file do.
         VZEROUPPER
         RET
   124  
   // func decodeByteArrayOffsets(offsets []uint32, prefix, suffix []int32)
   //
   // Writes the running sum of prefix[i] + suffix[i] into offsets: offsets[i]
   // receives the sum of all lengths before index i, and one extra entry holding
   // the grand total is written at index suffix_len. The caller must therefore
   // provide room for suffix_len + 1 entries in offsets. Sums are computed with
   // 32-bit arithmetic.
   //
   // Register use:
   //   DI = offsets base   R8 = prefix base   R9 = suffix base
   //   CX = element count  BX = index         AX = running offset
   //   R10, R11 = prefix and suffix lengths of the current element
   TEXT ·decodeByteArrayOffsets(SB), NOSPLIT, $0-72
       MOVQ offsets_base+0(FP), DI
       MOVQ prefix_base+24(FP), R8
       MOVQ suffix_base+48(FP), R9
       MOVQ suffix_len+56(FP), CX

       XORQ BX, BX
       XORQ AX, AX
       JMP check
   accumulate:
       // Both lengths are loaded before the offset is stored, in that order.
       MOVL (R8)(BX*4), R10
       MOVL (R9)(BX*4), R11
       MOVL AX, (DI)(BX*4)
       ADDL R11, R10
       ADDL R10, AX
       INCQ BX
   check:
       CMPQ BX, CX
       JNE accumulate
       // Trailing entry: total length of all values.
       MOVL AX, (DI)(BX*4)
       RET
   147  
   // func decodeByteArrayAVX2(dst, src []byte, prefix, suffix []int32) int
   //
   // Rebuilds suffix_len values back to back in dst. Each value is made of the
   // first prefix[i] bytes of the value written just before it, followed by the
   // next suffix[i] bytes taken from src. Returns the number of bytes produced
   // (final dst cursor minus dst base).
   //
   // Copies are done in whole 32-byte blocks regardless of the actual lengths,
   // so up to 32 bytes are read past the end of the previous value and of the
   // consumed part of src, and written past the end of the current value; the
   // excess is overwritten by the following copy. No bounds are checked here.
   // NOTE(review): presumably the caller pads dst and src accordingly — confirm.
   //
   // Register use:
   //   DI  = dst cursor          SI  = src cursor
   //   R8  = prefix base         R9  = suffix base
   //   CX  = element count       BX  = index
   //   R10 = prefix length       R11 = suffix length
   //   DX  = start of the previous value
   //   AX  = byte offset inside a long copy, R12 = suffix destination
   TEXT ·decodeByteArrayAVX2(SB), NOSPLIT, $0-104
       MOVQ dst_base+0(FP), DI
       MOVQ src_base+24(FP), SI
       MOVQ prefix_base+48(FP), R8
       MOVQ suffix_base+72(FP), R9
       MOVQ suffix_len+80(FP), CX

       XORQ BX, BX
       MOVQ DI, DX // the first value has no predecessor, point at dst itself

       JMP check
   value:
       MOVLQZX (R8)(BX*4), R10 // prefix length
       MOVLQZX (R9)(BX*4), R11 // suffix length

       // First 32 bytes of the prefix; longer prefixes finish in longPrefix.
       VMOVDQU (DX), Y0
       VMOVDQU Y0, (DI)
       CMPQ R10, $32
       JA longPrefix
   appendSuffix:
       // First 32 bytes of the suffix, placed right after the prefix.
       VMOVDQU (SI), Y1
       VMOVDQU Y1, (DI)(R10*1)
       CMPQ R11, $32
       JA longSuffix
   advance:
       MOVQ DI, DX   // the value just written becomes the previous value
       ADDQ R11, R10 // value length = prefix + suffix
       ADDQ R10, DI
       ADDQ R11, SI
       INCQ BX
   check:
       CMPQ BX, CX
       JNE value
       SUBQ dst_base+0(FP), DI
       MOVQ DI, ret+96(FP)
       VZEROUPPER
       RET
   longPrefix:
       MOVQ $32, AX
   longPrefixChunk:
       VMOVDQU (DX)(AX*1), Y0
       VMOVDQU Y0, (DI)(AX*1)
       ADDQ $32, AX
       CMPQ AX, R10
       JB longPrefixChunk
       JMP appendSuffix
   longSuffix:
       MOVQ $32, AX
       LEAQ (DI)(R10*1), R12
   longSuffixChunk:
       VMOVDQU (SI)(AX*1), Y1
       VMOVDQU Y1, (R12)(AX*1)
       ADDQ $32, AX
       CMPQ AX, R11
       JB longSuffixChunk
       JMP advance
   208  
   // func decodeByteArrayAVX2x128bits(dst, src []byte, prefix, suffix []int32) int
   //
   // Variant of the byte array decoder that moves exactly one 16-byte block for
   // the prefix and one for the suffix of every value, and keeps the first 16
   // bytes of the previous value in X0 instead of re-reading them through a
   // pointer. Returns the number of bytes produced (final dst cursor minus dst
   // base).
   //
   // Only 16 bytes are ever copied per part, and each copy reads and writes a
   // full 16-byte block whatever the actual length. No bounds are checked here.
   // NOTE(review): presumably the caller only selects this routine when no
   // value exceeds 16 bytes and pads dst and src accordingly — confirm.
   //
   // Register use:
   //   DI  = dst cursor          SI  = src cursor
   //   R8  = prefix base         R9  = suffix base
   //   CX  = element count       BX  = index
   //   R10 = prefix length       R11 = suffix length
   //   X0  = first 16 bytes of the previous value, X1 = suffix bytes
   TEXT ·decodeByteArrayAVX2x128bits(SB), NOSPLIT, $0-104
       MOVQ dst_base+0(FP), DI
       MOVQ src_base+24(FP), SI
       MOVQ prefix_base+48(FP), R8
       MOVQ suffix_base+72(FP), R9
       MOVQ suffix_len+80(FP), CX

       XORQ BX, BX
       VPXOR X0, X0, X0 // no previous value yet

       JMP check
   value:
       MOVLQZX (R8)(BX*4), R10 // prefix length
       MOVLQZX (R9)(BX*4), R11 // suffix length

       VMOVDQU (SI), X1         // suffix bytes
       VMOVDQU X0, (DI)         // previous value, of which the prefix is kept
       VMOVDQU X1, (DI)(R10*1)  // suffix overwrites everything past the prefix
       VMOVDQU (DI), X0         // reload the assembled value for the next round

       ADDQ R11, R10 // value length = prefix + suffix
       ADDQ R10, DI
       ADDQ R11, SI
       INCQ BX
   check:
       CMPQ BX, CX
       JNE value
       SUBQ dst_base+0(FP), DI
       MOVQ DI, ret+96(FP)
       VZEROUPPER
       RET