github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/encoding/delta/byte_array_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "funcdata.h"
     4  #include "textflag.h"
     5  
     6  // func validatePrefixAndSuffixLengthValuesAVX2(prefix, suffix []int32, maxLength int) (totalPrefixLength, totalSuffixLength int, ok bool)
     7  TEXT ·validatePrefixAndSuffixLengthValuesAVX2(SB), NOSPLIT, $0-73
     8      MOVQ prefix_base+0(FP), AX
     9      MOVQ suffix_base+24(FP), BX
    10      MOVQ suffix_len+32(FP), CX
    11      MOVQ maxLength+48(FP), DX
    12  
    13      XORQ SI, SI
    14      XORQ DI, DI // lastValueLength
    15      XORQ R8, R8
    16      XORQ R9, R9
    17      XORQ R10, R10 // totalPrefixLength
    18      XORQ R11, R11 // totalSuffixLength
    19      XORQ R12, R12 // ok
    20  
    21      CMPQ CX, $8
    22      JB test
    23  
    24      MOVQ CX, R13
    25      SHRQ $3, R13
    26      SHLQ $3, R13
    27  
    28      VPXOR X0, X0, X0 // lastValueLengths
    29      VPXOR X1, X1, X1 // totalPrefixLengths
    30      VPXOR X2, X2, X2 // totalSuffixLengths
    31      VPXOR X3, X3, X3 // negative prefix length sentinels
    32      VPXOR X4, X4, X4 // negative suffix length sentinels
    33      VPXOR X5, X5, X5 // prefix length overflow sentinels
    34      VMOVDQU ·rotateLeft32(SB), Y6
    35  
    36  loopAVX2:
    37      VMOVDQU (AX)(SI*4), Y7 // p
    38      VMOVDQU (BX)(SI*4), Y8 // n
    39  
    40      VPADDD Y7, Y1, Y1
    41      VPADDD Y8, Y2, Y2
    42  
    43      VPOR Y7, Y3, Y3
    44      VPOR Y8, Y4, Y4
    45  
    46      VPADDD Y7, Y8, Y9 // p + n
    47      VPERMD Y0, Y6, Y10
    48      VPBLENDD $1, Y10, Y9, Y10
    49      VPCMPGTD Y10, Y7, Y10
    50      VPOR Y10, Y5, Y5
    51  
    52      VMOVDQU Y9, Y0
    53      ADDQ $8, SI
    54      CMPQ SI, R13
    55      JNE loopAVX2
    56  
    57      // If any of the sentinel values has its most significant bit set then one
    58      // of the values was negative or one of the prefixes was greater than the
    59      // length of the previous value, return false.
    60      VPOR Y4, Y3, Y3
    61      VPOR Y5, Y3, Y3
    62      VMOVMSKPS Y3, R13
    63      CMPQ R13, $0
    64      JNE done
    65  
    66      // We computed 8 sums in parallel for the prefix and suffix arrays, they
    67      // need to be accumulated into single values, which is what these reduction
    68      // steps do.
    69      VPSRLDQ $4, Y1, Y5
    70      VPSRLDQ $8, Y1, Y6
    71      VPSRLDQ $12, Y1, Y7
    72      VPADDD Y5, Y1, Y1
    73      VPADDD Y6, Y1, Y1
    74      VPADDD Y7, Y1, Y1
    75      VPERM2I128 $1, Y1, Y1, Y0
    76      VPADDD Y0, Y1, Y1
    77      MOVQ X1, R10
    78      ANDQ $0x7FFFFFFF, R10
    79  
    80      VPSRLDQ $4, Y2, Y5
    81      VPSRLDQ $8, Y2, Y6
    82      VPSRLDQ $12, Y2, Y7
    83      VPADDD Y5, Y2, Y2
    84      VPADDD Y6, Y2, Y2
    85      VPADDD Y7, Y2, Y2
    86      VPERM2I128 $1, Y2, Y2, Y0
    87      VPADDD Y0, Y2, Y2
    88      MOVQ X2, R11
    89      ANDQ $0x7FFFFFFF, R11
    90  
    91      JMP test
    92  loop:
    93      MOVLQSX (AX)(SI*4), R8
    94      MOVLQSX (BX)(SI*4), R9
    95  
    96      CMPQ R8, $0 // p < 0 ?
    97      JL done
    98  
    99      CMPQ R9, $0 // n < 0 ?
   100      JL done
   101  
   102      CMPQ R8, DI // p > lastValueLength ?
   103      JG done
   104  
   105      ADDQ R8, R10
   106      ADDQ R9, R11
   107      ADDQ R8, DI
   108      ADDQ R9, DI
   109  
   110      INCQ SI
   111  test:
   112      CMPQ SI, CX
   113      JNE loop
   114  
   115      CMPQ R11, DX // totalSuffixLength > maxLength ?
   116      JG done
   117  
   118      MOVB $1, R12
   119  done:
   120      MOVQ R10, totalPrefixLength+56(FP)
   121      MOVQ R11, totalSuffixLength+64(FP)
   122      MOVB R12, ok+72(FP)
   123      RET
   124  
   125  // func decodeByteArrayAVX2(dst, src []byte, prefix, suffix []int32) int
   126  TEXT ·decodeByteArrayAVX2(SB), NOSPLIT, $0-104
   127      MOVQ dst_base+0(FP), AX
   128      MOVQ src_base+24(FP), BX
   129      MOVQ prefix_base+48(FP), CX
   130      MOVQ suffix_base+72(FP), DX
   131      MOVQ suffix_len+80(FP), DI
   132  
   133      ADDQ $4, AX
   134      XORQ SI, SI
   135      XORQ R8, R8
   136      XORQ R9, R9
   137      MOVQ AX, R10 // last value
   138  
   139      JMP test
   140  loop:
   141      MOVLQZX (CX)(SI*4), R8 // prefix length
   142      MOVLQZX (DX)(SI*4), R9 // suffix length
   143      MOVQ R8, R11
   144      ADDQ R9, R11
   145      MOVL R11, -4(AX)
   146  prefix:
   147      VMOVDQU (R10), X0
   148      VMOVDQU X0, (AX)
   149      CMPQ R8, $16
   150      JA copyPrefix
   151  suffix:
   152      VMOVDQU (BX), X1
   153      VMOVDQU X1, (AX)(R8*1)
   154      CMPQ R9, $16
   155      JA copySuffix
   156  next:
   157      MOVQ AX, R10
   158      LEAQ 4(AX)(R11*1), AX
   159      LEAQ 0(BX)(R9*1), BX
   160      INCQ SI
   161  test:
   162      CMPQ SI, DI
   163      JNE loop
   164      MOVQ dst_base+0(FP), BX
   165      SUBQ BX, AX
   166      SUBQ $4, AX
   167      MOVQ AX, ret+96(FP)
   168      VZEROUPPER
   169      RET
   170  copyPrefix:
   171      MOVQ $16, R12
   172  copyPrefixLoop:
   173      VMOVDQU (R10)(R12*1), Y0
   174      VMOVDQU Y0, (AX)(R12*1)
   175      ADDQ $32, R12
   176      CMPQ R12, R8
   177      JB copyPrefixLoop
   178      JMP suffix
   179  copySuffix:
   180      MOVQ $16, R12
   181      LEAQ (AX)(R8*1), R13
   182  copySuffixLoop:
   183      VMOVDQU (BX)(R12*1), Y1
   184      VMOVDQU Y1, (R13)(R12*1)
   185      ADDQ $32, R12
   186      CMPQ R12, R9
   187      JB copySuffixLoop
   188      JMP next
   189  
   190  // func decodeFixedLenByteArrayAVX2(dst, src []byte, prefix, suffix []int32) int
   191  TEXT ·decodeFixedLenByteArrayAVX2(SB), NOSPLIT, $0-104
   192      MOVQ dst_base+0(FP), AX
   193      MOVQ src_base+24(FP), BX
   194      MOVQ prefix_base+48(FP), CX
   195      MOVQ suffix_base+72(FP), DX
   196      MOVQ suffix_len+80(FP), DI
   197  
   198      XORQ SI, SI
   199      XORQ R8, R8
   200      XORQ R9, R9
   201      MOVQ AX, R10 // last value
   202  
   203      JMP test
   204  loop:
   205      MOVLQZX (CX)(SI*4), R8 // prefix length
   206      MOVLQZX (DX)(SI*4), R9 // suffix length
   207  prefix:
   208      VMOVDQU (R10), Y0
   209      VMOVDQU Y0, (AX)
   210      CMPQ R8, $32
   211      JA copyPrefix
   212  suffix:
   213      VMOVDQU (BX), Y1
   214      VMOVDQU Y1, (AX)(R8*1)
   215      CMPQ R9, $32
   216      JA copySuffix
   217  next:
   218      MOVQ AX, R10
   219      ADDQ R9, R8
   220      LEAQ (AX)(R8*1), AX
   221      LEAQ (BX)(R9*1), BX
   222      INCQ SI
   223  test:
   224      CMPQ SI, DI
   225      JNE loop
   226      MOVQ dst_base+0(FP), BX
   227      SUBQ BX, AX
   228      MOVQ AX, ret+96(FP)
   229      VZEROUPPER
   230      RET
   231  copyPrefix:
   232      MOVQ $32, R12
   233  copyPrefixLoop:
   234      VMOVDQU (R10)(R12*1), Y0
   235      VMOVDQU Y0, (AX)(R12*1)
   236      ADDQ $32, R12
   237      CMPQ R12, R8
   238      JB copyPrefixLoop
   239      JMP suffix
   240  copySuffix:
   241      MOVQ $32, R12
   242      LEAQ (AX)(R8*1), R13
   243  copySuffixLoop:
   244      VMOVDQU (BX)(R12*1), Y1
   245      VMOVDQU Y1, (R13)(R12*1)
   246      ADDQ $32, R12
   247      CMPQ R12, R9
   248      JB copySuffixLoop
   249      JMP next
   250  
   251  // func decodeFixedLenByteArrayAVX2x128bits(dst, src []byte, prefix, suffix []int32) int
   252  TEXT ·decodeFixedLenByteArrayAVX2x128bits(SB), NOSPLIT, $0-104
   253      MOVQ dst_base+0(FP), AX
   254      MOVQ src_base+24(FP), BX
   255      MOVQ prefix_base+48(FP), CX
   256      MOVQ suffix_base+72(FP), DX
   257      MOVQ suffix_len+80(FP), DI
   258  
   259      XORQ SI, SI
   260      XORQ R8, R8
   261      XORQ R9, R9
   262      VPXOR X0, X0, X0
   263  
   264      JMP test
   265  loop:
   266      MOVLQZX (CX)(SI*4), R8 // prefix length
   267      MOVLQZX (DX)(SI*4), R9 // suffix length
   268  
   269      VMOVDQU (BX), X1
   270      VMOVDQU X0, (AX)
   271      VMOVDQU X1, (AX)(R8*1)
   272      VMOVDQU (AX), X0
   273  
   274      ADDQ R9, R8
   275      LEAQ (AX)(R8*1), AX
   276      LEAQ (BX)(R9*1), BX
   277      INCQ SI
   278  test:
   279      CMPQ SI, DI
   280      JNE loop
   281      MOVQ dst_base+0(FP), BX
   282      SUBQ BX, AX
   283      MOVQ AX, ret+96(FP)
   284      VZEROUPPER
   285      RET