github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/encoding/delta/length_byte_array_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "funcdata.h"
     4  #include "textflag.h"
     5  
     6  // func validateLengthValuesAVX2(lengths []int32) (totalLength int, ok bool)
     7  TEXT ·validateLengthValuesAVX2(SB), NOSPLIT, $0-33
     8      MOVQ lengths_base+0(FP), AX
     9      MOVQ lengths_len+8(FP), CX
    10  
    11      XORQ BX, BX // totalLength
    12      XORQ DX, DX // err
    13      XORQ SI, SI
    14      XORQ DI, DI
    15      XORQ R8, R8
    16  
    17      CMPQ CX, $16
    18      JB test
    19  
    20      MOVQ CX, DI
    21      SHRQ $4, DI
    22      SHLQ $4, DI
    23  
    24      VPXOR X0, X0, X0 // totalLengths
    25      VPXOR X1, X1, X1 // negative test
    26  loopAVX2:
    27      VMOVDQU (AX)(SI*4), Y2
    28      VMOVDQU 32(AX)(SI*4), Y3
    29      VPADDD Y2, Y0, Y0
    30      VPADDD Y3, Y0, Y0
    31      VPOR Y2, Y1, Y1
    32      VPOR Y3, Y1, Y1
    33      ADDQ $16, SI
    34      CMPQ SI, DI
    35      JNE loopAVX2
    36  
    37      // If any of the 32 bit words has its most significant bit set to 1,
    38      // then at least one of the values was negative, which must be reported as
    39      // an error.
    40      VMOVMSKPS Y1, R8
    41      CMPQ R8, $0
    42      JNE done
    43  
    44      VPSRLDQ $4, Y0, Y1
    45      VPSRLDQ $8, Y0, Y2
    46      VPSRLDQ $12, Y0, Y3
    47  
    48      VPADDD Y1, Y0, Y0
    49      VPADDD Y3, Y2, Y2
    50      VPADDD Y2, Y0, Y0
    51  
    52      VPERM2I128 $1, Y0, Y0, Y1
    53      VPADDD Y1, Y0, Y0
    54      VZEROUPPER
    55      MOVQ X0, BX
    56      ANDQ $0x7FFFFFFF, BX
    57  
    58      JMP test
    59  loop:
    60      MOVL (AX)(SI*4), DI
    61      ADDL DI, BX
    62      ORL DI, R8
    63      INCQ SI
    64  test:
    65      CMPQ SI, CX
    66      JNE loop
    67      CMPL R8, $0
    68      JL done
    69      MOVB $1, DX
    70  done:
    71      MOVQ BX, totalLength+24(FP)
    72      MOVB DX, ok+32(FP)
    73      RET
    74  
    75  // This function is an optimization of the decodeLengthByteArray using AVX2
    76  // instructions to implement an opportunistic copy strategy which improves
    77  // throughput compared to using runtime.memmove (via Go's copy).
    78  //
    79  // Parquet columns of type BYTE_ARRAY will often hold short strings, rarely
    80  // exceeding a couple hundred bytes in size. Making a function call to
    81  // runtime.memmove for each value results in spending most of the CPU time
    82  // on branching rather than actually copying bytes to the output buffer.
    83  //
    84  // This function works by always assuming it can copy 16 bytes of data between
    85  // the input and outputs, even in the event where a value is shorter than this.
    86  //
    87  // The pointers to the current positions for input and output pointers are
    88  // always adjusted by the right number of bytes so that the next writes
    89  // overwrite any extra bytes that were written in the previous iteration of the
    90  // copy loop.
    91  //
    92  // The throughput of this function is not as good as runtime.memmove for large
    93  // buffers, but it ends up being close to an order of magnitude higher for the
    94  // common case of working with short strings.
    95  //
    96  // func decodeLengthByteArrayAVX2(dst, src []byte, lengths []int32) int
    97  TEXT ·decodeLengthByteArrayAVX2(SB), NOSPLIT, $0-80
    98      MOVQ dst_base+0(FP), AX
    99      MOVQ src_base+24(FP), BX
   100      MOVQ lengths_base+48(FP), DX
   101      MOVQ lengths_len+56(FP), DI
   102  
   103      LEAQ (DX)(DI*4), DI
   104      LEAQ 4(AX), AX
   105      XORQ CX, CX
   106      JMP test
   107  loop:
   108      MOVL (DX), CX
   109      MOVL CX, -4(AX)
   110      // First pass moves 16 bytes, this makes it a very fast path for short
   111      // strings.
   112      VMOVDQU (BX), X0
   113      VMOVDQU X0, (AX)
   114      CMPQ CX, $16
   115      JA copy
   116  next:
   117      LEAQ 4(AX)(CX*1), AX
   118      LEAQ 0(BX)(CX*1), BX
   119      LEAQ 4(DX), DX
   120  test:
   121      CMPQ DX, DI
   122      JNE loop
   123      MOVQ dst_base+0(FP), BX
   124      SUBQ BX, AX
   125      SUBQ $4, AX
   126      MOVQ AX, ret+72(FP)
   127      VZEROUPPER
   128      RET
   129  copy:
   130      // Values longer than 16 bytes enter this loop and move 32 byte chunks
   131      // which helps improve throughput on larger chunks.
   132      MOVQ $16, SI
   133  copyLoop32:
   134      VMOVDQU (BX)(SI*1), Y0
   135      VMOVDQU Y0, (AX)(SI*1)
   136      ADDQ $32, SI
   137      CMPQ SI, CX
   138      JAE next
   139      JMP copyLoop32