github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/sparse/gather_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     // func gatherBitsAVX2(dst []byte, src Uint8Array)
     //
     // Packs bit 0 of every source element into dst, eight elements per output
     // byte: bit i of dst[k] is bit 0 of source element 8*k+i. Exactly
     // src_array_len/8 bytes of dst are written; a remainder of fewer than eight
     // elements is not processed here.
     //
     // Frame layout (as named by the FP references below):
     //   dst_base+0       destination pointer
     //   src_array_ptr+24 address of the first source element
     //   src_array_len+32 number of source elements
     //   src_array_off+40 byte distance between consecutive source elements
     //
     // Register roles:
     //   AX = dst base, BX = source cursor, CX = number of output bytes,
     //   DX = element stride in bytes, SI = output index,
     //   Y0 = per-lane byte offsets {0,1,...,7} * stride,
     //   Y1 = gather mask (cleared by the gather), Y2 = all-ones used to reset Y1.
     //
     // Constraints that follow from the instructions used:
     //   - VPGATHERDD loads a full 32-bit word per lane although only bit 0 is
     //     kept, so the 3 bytes following each element must be readable.
     //   - Only the low 32 bits of the stride are broadcast, and the gather
     //     interprets the indices as signed 32-bit values, so 7*stride has to
     //     fit in an int32.
     TEXT ·gatherBitsAVX2(SB), NOSPLIT, $0-48
         MOVQ dst_base+0(FP), AX
         MOVQ src_array_ptr+24(FP), BX
         MOVQ src_array_len+32(FP), CX
         MOVQ src_array_off+40(FP), DX
         XORQ SI, SI
         SHRQ $3, CX

         // The loop below is bottom-tested; without this check an input shorter
         // than eight elements would still gather/store once and then never
         // reach the SI == CX exit condition.
         TESTQ CX, CX
         JZ done

         VPBROADCASTD src_array_off+40(FP), Y0
         VPMULLD range0n7<>(SB), Y0, Y0
         VPCMPEQD Y1, Y1, Y1
         VPCMPEQD Y2, Y2, Y2
     loop:
         VPGATHERDD Y1, (BX)(Y0*1), Y3
         VMOVDQU Y2, Y1          // the gather zeroed its mask; re-arm it
         VPSLLD $31, Y3, Y3      // move bit 0 of each lane into the sign bit
         VMOVMSKPS Y3, DI        // collect the eight sign bits into DI[7:0]

         MOVB DI, (AX)(SI*1)

         LEAQ (BX)(DX*8), BX     // advance the cursor by eight elements
         INCQ SI
         CMPQ SI, CX
         JNE loop
         VZEROUPPER
     done:
         RET
    32  
    // func gatherBitsDefault(dst []byte, src Uint8Array)
    //
    // Scalar routine that packs bit 0 of every source element into dst, eight
    // elements per output byte: bit i of dst[k] is bit 0 of source element
    // 8*k+i. Exactly src_array_len/8 bytes are written; a remainder of fewer
    // than eight elements is not processed here. Each element is read with a
    // single-byte load.
    //
    // Register roles:
    //   AX = dst base, BX = source cursor, CX = number of output bytes,
    //   DX = element stride in bytes, SI = output index,
    //   DI = BX + 2*stride (second cursor, so four elements are reachable with
    //        the (reg) and (reg)(DX*1) addressing forms),
    //   R8..R15 = the eight elements of the current group, in order.
    TEXT ·gatherBitsDefault(SB), NOSPLIT, $0-48
        MOVQ dst_base+0(FP), AX
        MOVQ src_array_ptr+24(FP), BX
        MOVQ src_array_len+32(FP), CX
        MOVQ src_array_off+40(FP), DX
        XORQ SI, SI
        SHRQ $3, CX

        // The loop is bottom-tested; skip it entirely when there is no complete
        // group of eight, otherwise it would read and store once and then never
        // reach the SI == CX exit condition.
        TESTQ CX, CX
        JZ done
    loop:
        // Elements 0..3 of the group.
        LEAQ (BX)(DX*2), DI
        MOVBQZX (BX), R8
        MOVBQZX (BX)(DX*1), R9
        MOVBQZX (DI), R10
        MOVBQZX (DI)(DX*1), R11
        // Elements 4..7 of the group.
        LEAQ (BX)(DX*4), BX
        LEAQ (DI)(DX*4), DI
        MOVBQZX (BX), R12
        MOVBQZX (BX)(DX*1), R13
        MOVBQZX (DI), R14
        MOVBQZX (DI)(DX*1), R15
        LEAQ (BX)(DX*4), BX     // BX now points at the next group

        // Keep only bit 0 of each element.
        ANDQ $1, R8
        ANDQ $1, R9
        ANDQ $1, R10
        ANDQ $1, R11
        ANDQ $1, R12
        ANDQ $1, R13
        ANDQ $1, R14
        ANDQ $1, R15

        // Move element i to bit position i.
        SHLQ $1, R9
        SHLQ $2, R10
        SHLQ $3, R11
        SHLQ $4, R12
        SHLQ $5, R13
        SHLQ $6, R14
        SHLQ $7, R15

        // Merge pairwise, then fold everything into R8.
        ORQ R9, R8
        ORQ R11, R10
        ORQ R13, R12
        ORQ R15, R14
        ORQ R10, R8
        ORQ R12, R8
        ORQ R14, R8

        MOVB R8, (AX)(SI*1)

        INCQ SI
        CMPQ SI, CX
        JNE loop
    done:
        RET
    86  
    // func gather32AVX2(dst []uint32, src Uint32Array)
    //
    // Copies strided 32-bit values into dst, eight per iteration:
    // dst[i] = 32-bit word at src_array_ptr + i*src_array_off.
    //
    // The element count is taken from dst_len only (src_array_len is not
    // read), so the source must provide at least that many elements. The count
    // is rounded down to a multiple of 8; any trailing dst elements are left
    // untouched, and nothing is read or written when fewer than 8 remain.
    //
    // Register roles:
    //   AX = dst base, BX = source cursor, CX = element count (multiple of 8),
    //   DX = element stride in bytes, SI = output index,
    //   Y0 = per-lane byte offsets {0,1,...,7} * stride,
    //   Y1 = gather mask (cleared by the gather), Y2 = all-ones used to reset Y1.
    //
    // Only the low 32 bits of the stride are broadcast and VPGATHERDD treats
    // the indices as signed 32-bit values, so 7*stride has to fit in an int32.
    TEXT ·gather32AVX2(SB), NOSPLIT, $0-48
        MOVQ dst_base+0(FP), AX
        MOVQ dst_len+8(FP), CX
        MOVQ src_array_ptr+24(FP), BX
        MOVQ src_array_off+40(FP), DX
        XORQ SI, SI

        // The loop exits only when SI == CX and SI advances in steps of 8, so
        // CX must be a non-zero multiple of 8 before entering it.
        ANDQ $~7, CX
        JZ done

        VPBROADCASTD src_array_off+40(FP), Y0
        VPMULLD range0n7<>(SB), Y0, Y0
        VPCMPEQD Y1, Y1, Y1
        VPCMPEQD Y2, Y2, Y2
    loop:
        VPGATHERDD Y1, (BX)(Y0*1), Y3
        VMOVDQU Y3, (AX)(SI*4)
        VMOVDQU Y2, Y1          // the gather zeroed its mask; re-arm it

        LEAQ (BX)(DX*8), BX     // advance the cursor by eight elements
        ADDQ $8, SI
        CMPQ SI, CX
        JNE loop
        VZEROUPPER
    done:
        RET
   110  
   // func gather64AVX2(dst []uint64, src Uint64Array)
   //
   // Copies strided 64-bit values into dst, four per iteration:
   // dst[i] = 64-bit word at src_array_ptr + i*src_array_off.
   //
   // The element count is taken from dst_len only (src_array_len is not
   // read), so the source must provide at least that many elements. The count
   // is rounded down to a multiple of 4; any trailing dst elements are left
   // untouched, and nothing is read or written when fewer than 4 remain.
   //
   // Register roles:
   //   AX = dst base, BX = source cursor, CX = element count (multiple of 4),
   //   DX = element stride in bytes, SI = output index,
   //   Y0 = per-lane 64-bit byte offsets {0,1,2,3} * stride,
   //   Y1 = gather mask (cleared by the gather), Y2 = all-ones used to reset Y1.
   TEXT ·gather64AVX2(SB), NOSPLIT, $0-48
       MOVQ dst_base+0(FP), AX
       MOVQ dst_len+8(FP), CX
       MOVQ src_array_ptr+24(FP), BX
       MOVQ src_array_off+40(FP), DX
       XORQ SI, SI

       // The loop exits only when SI == CX and SI advances in steps of 4, so
       // CX must be a non-zero multiple of 4 before entering it.
       ANDQ $~3, CX
       JZ done

       // Lane offsets are k*stride for k = 0..3. VPMULUDQ multiplies the low
       // 32 bits of each 64-bit lane and keeps the whole 64-bit product, so
       // 2*stride and 3*stride are not wrapped to 32 bits as a 32-bit lane
       // multiply would do. The stride itself must still fit in 32 bits.
       VPBROADCASTQ src_array_off+40(FP), Y0
       VPMULUDQ range0n3<>(SB), Y0, Y0
       VPCMPEQQ Y1, Y1, Y1
       VPCMPEQQ Y2, Y2, Y2
   loop:
       VPGATHERQQ Y1, (BX)(Y0*1), Y3
       VMOVDQU Y3, (AX)(SI*8)
       VMOVDQU Y2, Y1          // the gather zeroed its mask; re-arm it

       LEAQ (BX)(DX*4), BX     // advance the cursor by four elements
       ADDQ $4, SI
       CMPQ SI, CX
       JNE loop
       VZEROUPPER
   done:
       RET
   134  
   // func gather128(dst [][16]byte, src Uint128Array) int
   //
   // Copies n = min(dst_len, src_array_len) strided 16-byte values into dst and
   // returns n. Values are moved two at a time with unaligned 128-bit
   // loads/stores; when n is odd the last value is copied on its own.
   //
   // Register roles:
   //   DI = write cursor in dst, SI = read cursor in the source,
   //   DX = element stride in bytes, CX = n (also the return value),
   //   R8 = source length on entry, R9 = pairs still to copy.
   TEXT ·gather128(SB), NOSPLIT, $0-56
       MOVQ dst_base+0(FP), DI
       MOVQ dst_len+8(FP), CX
       MOVQ src_array_ptr+24(FP), SI
       MOVQ src_array_len+32(FP), R8
       MOVQ src_array_off+40(FP), DX

       // CX = min(dst_len, src_array_len)
       CMPQ R8, CX
       CMOVQLT R8, CX

       // R9 = n / 2; go straight to the odd-element check when there is no pair.
       MOVQ CX, R9
       SHRQ $1, R9
       TESTQ R9, R9
       JZ odd
   pairs:
       MOVOU (SI), X0
       MOVOU (SI)(DX*1), X1

       MOVOU X0, (DI)
       MOVOU X1, 16(DI)

       LEAQ (SI)(DX*2), SI
       ADDQ $32, DI
       DECQ R9
       JNZ pairs
   odd:
       // One value is left over exactly when n is odd.
       TESTQ $1, CX
       JZ done
       MOVOU (SI), X0
       MOVOU X0, (DI)
   done:
       MOVQ CX, ret+48(FP)
       RET
   178  
   // range0n3 holds the lane indices 0, 1, 2, 3 as four 64-bit words (32 bytes,
   // one YMM register). gather64AVX2 multiplies it by the element stride to
   // build the per-lane byte offsets passed to VPGATHERQQ.
   DATA range0n3<>+0x00(SB)/8, $0
   DATA range0n3<>+0x08(SB)/8, $1
   DATA range0n3<>+0x10(SB)/8, $2
   DATA range0n3<>+0x18(SB)/8, $3
   GLOBL range0n3<>(SB), RODATA|NOPTR, $32
   184  
   // range0n7 holds the lane indices 0 through 7 as eight 32-bit words
   // (32 bytes, one YMM register). gatherBitsAVX2 and gather32AVX2 multiply it
   // by the element stride to build the per-lane byte offsets passed to
   // VPGATHERDD.
   DATA range0n7<>+0x00(SB)/4, $0
   DATA range0n7<>+0x04(SB)/4, $1
   DATA range0n7<>+0x08(SB)/4, $2
   DATA range0n7<>+0x0c(SB)/4, $3
   DATA range0n7<>+0x10(SB)/4, $4
   DATA range0n7<>+0x14(SB)/4, $5
   DATA range0n7<>+0x18(SB)/4, $6
   DATA range0n7<>+0x1c(SB)/4, $7
   GLOBL range0n7<>(SB), RODATA|NOPTR, $32