github.com/parquet-go/parquet-go@v0.20.0/encoding/delta/length_byte_array_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  // func encodeByteArrayLengths(lengths []int32, offsets []uint32)
     6  //
     7  // Stores lengths[i] = offsets[i+1] - offsets[i] (32-bit, wrapping) for every
     8  // i < len(lengths). offsets_len is never loaded; only lengths_len bounds the work.
     9  TEXT ·encodeByteArrayLengths(SB), NOSPLIT, $0-48
    10      MOVQ offsets_base+24(FP), SI // SI = &offsets[0], the input
    11      MOVQ lengths_base+0(FP), DI  // DI = &lengths[0], the output
    12      MOVQ lengths_len+8(FP), CX   // CX = len(lengths)
    13      XORQ AX, AX                  // AX = i, index of the next length to produce
    14  
    15      CMPQ CX, $4
    16      JB remainder // fewer than 4 elements: skip the vector pass entirely
    17  
    18      MOVQ CX, DX
    19      ANDQ $-4, DX // DX = len(lengths) rounded down to a multiple of 4
    20  blockOf4:
    21      MOVOU 4(SI)(AX*4), X1 // X1 = offsets[i+1 .. i+4]
    22      MOVOU (SI)(AX*4), X0  // X0 = offsets[i .. i+3]
    23      PSUBL X0, X1          // X1 -= X0, four 32-bit lanes at once
    24      MOVOU X1, (DI)(AX*4)  // lengths[i .. i+3] = X1
    25      ADDQ $4, AX
    26      CMPQ AX, DX
    27      JNE blockOf4
    28      JMP remainder // leftover (len % 4) elements go through the scalar loop
    29  single:
    30      MOVL 4(SI)(AX*4), R9 // R9 = offsets[i+1]
    31      SUBL (SI)(AX*4), R9  // R9 -= offsets[i]
    32      MOVL R9, (DI)(AX*4)  // lengths[i] = R9
    33      ADDQ $1, AX
    34  remainder:
    35      CMPQ AX, CX // shared exit test: done once i == len(lengths)
    36      JNE single
    37      RET
    38  
    39  // func decodeByteArrayLengths(offsets []uint32, lengths []int32) (lastOffset uint32, invalidLength int32)
    40  //
    41  // Stores the prefix sum of lengths into offsets: offsets[0] = 0 and
    42  // offsets[i+1] = offsets[i] + lengths[i], so len(lengths)+1 values are written.
    43  // Sums are 32-bit and wrap silently. Only lengths_len is loaded, so offsets is
    44  // assumed to hold at least len(lengths)+1 elements (TODO confirm at the caller).
    45  // lastOffset is the final sum. invalidLength is 0 when no length is negative;
    46  // otherwise the last negative length seen by the scalar loop, or -1 if only the SSE2 loop saw one.
    47  TEXT ·decodeByteArrayLengths(SB), NOSPLIT, $0-56
    48      MOVQ offsets_base+0(FP), AX  // AX = &offsets[0]
    49      MOVQ lengths_base+24(FP), BX // BX = &lengths[0]
    50      MOVQ lengths_len+32(FP), CX  // CX = len(lengths)
    51  
    52      XORQ DX, DX // lastOffset
    53      XORQ DI, DI // invalidLength
    54      XORQ SI, SI // i: index of the next length to consume
    55  
    56      CMPQ CX, $4
    57      JB test // a slice length is unsigned; under 4 elements only the scalar loop runs
    58  
    59      MOVQ CX, R8
    60      SHRQ $2, R8
    61      SHLQ $2, R8 // R8 = len(lengths) rounded down to a multiple of 4
    62  
    63      MOVL $0, (AX) // offsets[0] = 0; the loop below stores from offsets[1] onwards
    64      PXOR X0, X0   // X0: running total, kept identical in all four lanes
    65      PXOR X3, X3   // X3: bitwise OR of every length vector loaded so far
    66      // This loop computes the prefix sum of the lengths array in order to
    67      // generate values of the offsets array; the algorithm is described at
    68      // https://en.algorithmica.org/hpc/algorithms/prefix/
    69      //
    70      // We stick to SSE2 to keep the code simple (the Go compiler appears to
    71      // assume that SSE2 must be supported on AMD64) which already yields most
    72      // of the performance that we could get on this subroutine if we were using
    73      // AVX2. The X3 register also accumulates a mask of all length values,
    74      // which is checked after the loop to determine whether any of the lengths
    75      // were negative.
    76  loopSSE2:
    77      MOVOU (BX)(SI*4), X1 // X1 = lengths[i .. i+3], call the lanes (a, b, c, d)
    78      POR X1, X3           // a sign bit set in any length stays set in X3
    79  
    80      MOVOA X1, X2
    81      PSLLDQ $4, X2 // X2 = (0, a, b, c)
    82      PADDD X2, X1  // X1 = (a, a+b, b+c, c+d)
    83  
    84      MOVOA X1, X2
    85      PSLLDQ $8, X2 // X2 = (0, 0, a, a+b)
    86      PADDD X2, X1  // X1 = (a, a+b, a+b+c, a+b+c+d)
    87  
    88      PADDD X1, X0               // add the total carried over from earlier blocks
    89      MOVOU X0, 4(AX)(SI*4)      // offsets[i+1 .. i+4] = X0
    90      PSHUFD $0b11111111, X0, X0 // broadcast lane 3 (the new total) to every lane
    91  
    92      ADDQ $4, SI
    93      CMPQ SI, R8
    94      JNE loopSSE2
    95  
    96      // If any of the most significant bits of double words in the X3 register
    97      // are set to 1, it indicates that one of the lengths was negative and
    98      // therefore the prefix sum is invalid.
    99      // TODO: we report the invalid length as -1, effectively losing the original
   100      // value due to the aggregation within X3. This is something that we might
   101      // want to address in the future to provide better error reporting.
   102      MOVMSKPS X3, R8 // R8 = the four lane sign bits of X3
   103      MOVL $-1, R9
   104      TESTL R8, R8
   105      CMOVLNE R9, DI  // invalidLength = -1 if any sign bit was set
   106      MOVQ X0, DX     // DX = running total; the extra upper 32 bits are ignored by the 32-bit ops below
   107      JMP test
   108  loop:
   109      MOVL (BX)(SI*4), R8 // R8 = lengths[i]
   110      MOVL DX, (AX)(SI*4) // offsets[i] = total so far
   111      ADDL R8, DX
   112      TESTL R8, R8
   113      CMOVLLT R8, DI      // remember a negative length (the last one wins)
   114      INCQ SI
   115  test:
   116      CMPQ SI, CX
   117      JNE loop
   118  
   119      MOVL DX, (AX)(SI*4) // offsets[len(lengths)] = final total
   120      MOVL DX, lastOffset+48(FP)
   121      MOVL DI, invalidLength+52(FP)
   122      RET