github.com/parquet-go/parquet-go@v0.20.0/value_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  #define sizeOfValue 24
     6  
     7  // This function is an optimized implementation of the memsetValues function
     8  // which assigns the parquet.Value passed as second argument to all elements of
     9  // the first slice argument.
    10  //
    11  // The optimizations relies on the fact that we can pack 4 parquet.Value values
    12  // into 3 YMM registers (24 x 4 = 32 x 3 = 96).
    13  //
    14  // func memsetValuesAVX2(values []Value, model Value, _ uint64)
    15  TEXT ·memsetValuesAVX2(SB), NOSPLIT, $0-56 // 48 + padding to load model in YMM
    16      MOVQ values_base+0(FP), AX
    17      MOVQ values_len+8(FP), BX
    18  
    19      MOVQ model_ptr+24(FP), R10
    20      MOVQ model_u64+32(FP), R11
    21      MOVQ model+40(FP), R12 // go vet complains about this line but it's OK
    22  
    23      XORQ SI, SI // byte index
    24      MOVQ BX, DI // byte count
    25      IMULQ $sizeOfValue, DI
    26  
    27      CMPQ BX, $4
    28      JB test
    29  
    30      MOVQ BX, R8
    31      SHRQ $2, R8
    32      SHLQ $2, R8
    33      IMULQ $sizeOfValue, R8
    34  
    35      VMOVDQU model+24(FP), Y0
    36      VMOVDQU Y0, Y1
    37      VMOVDQU Y0, Y2
    38  
    39      VPERMQ $0b00100100, Y0, Y0
    40      VPERMQ $0b01001001, Y1, Y1
    41      VPERMQ $0b10010010, Y2, Y2
    42  loop4:
    43      VMOVDQU Y0, 0(AX)(SI*1)
    44      VMOVDQU Y1, 32(AX)(SI*1)
    45      VMOVDQU Y2, 64(AX)(SI*1)
    46      ADDQ $4*sizeOfValue, SI
    47      CMPQ SI, R8
    48      JNE loop4
    49      VZEROUPPER
    50      JMP test
    51  loop:
    52      MOVQ R10, 0(AX)(SI*1)
    53      MOVQ R11, 8(AX)(SI*1)
    54      MOVQ R12, 16(AX)(SI*1)
    55      ADDQ $sizeOfValue, SI
    56  test:
    57      CMPQ SI, DI
    58      JNE loop
    59      RET