github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/internal/bytealg/count_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  // func Count(data []byte, value byte) int
     6  TEXT ·Count(SB), NOSPLIT, $0-40
     7      MOVQ data_base+0(FP), AX
     8      MOVQ data_len+8(FP), CX
     9      MOVB value+24(FP), BX
    10      MOVQ CX, DX // len
    11      ADDQ AX, CX // end
    12      XORQ SI, SI // count
    13  
    14      CMPQ DX, $256
    15      JB test
    16  
    17      CMPB ·hasAVX2(SB), $0
    18      JE test
    19  
    20      XORQ R12, R12
    21      XORQ R13, R13
    22      XORQ R14, R14
    23      XORQ R15, R15
    24  
    25      CMPB ·hasAVX512Count(SB), $0
    26      JE initAVX2
    27  
    28      SHRQ $8, DX
    29      SHLQ $8, DX
    30      ADDQ AX, DX
    31      VPBROADCASTB BX, Z0
    32  loopAVX512:
    33      VMOVDQU64 (AX), Z1
    34      VMOVDQU64 64(AX), Z2
    35      VMOVDQU64 128(AX), Z3
    36      VMOVDQU64 192(AX), Z4
    37      VPCMPUB $0, Z0, Z1, K1
    38      VPCMPUB $0, Z0, Z2, K2
    39      VPCMPUB $0, Z0, Z3, K3
    40      VPCMPUB $0, Z0, Z4, K4
    41      KMOVQ K1, R8
    42      KMOVQ K2, R9
    43      KMOVQ K3, R10
    44      KMOVQ K4, R11
    45      POPCNTQ R8, R8
    46      POPCNTQ R9, R9
    47      POPCNTQ R10, R10
    48      POPCNTQ R11, R11
    49      ADDQ R8, R12
    50      ADDQ R9, R13
    51      ADDQ R10, R14
    52      ADDQ R11, R15
    53      ADDQ $256, AX
    54      CMPQ AX, DX
    55      JNE loopAVX512
    56      ADDQ R12, R13
    57      ADDQ R14, R15
    58      ADDQ R13, SI
    59      ADDQ R15, SI
    60      JMP doneAVX
    61  
    62  initAVX2:
    63      SHRQ $6, DX
    64      SHLQ $6, DX
    65      ADDQ AX, DX
    66      VPBROADCASTB value+24(FP), Y0
    67  loopAVX2:
    68      VMOVDQU (AX), Y1
    69      VMOVDQU 32(AX), Y2
    70      VPCMPEQB Y0, Y1, Y1
    71      VPCMPEQB Y0, Y2, Y2
    72      VPMOVMSKB Y1, R12
    73      VPMOVMSKB Y2, R13
    74      POPCNTL R12, R12
    75      POPCNTL R13, R13
    76      ADDQ R12, R14
    77      ADDQ R13, R15
    78      ADDQ $64, AX
    79      CMPQ AX, DX
    80      JNE loopAVX2
    81      ADDQ R14, SI
    82      ADDQ R15, SI
    83  
    84  doneAVX:
    85      VZEROUPPER
    86      JMP test
    87  
    88  loop:
    89      MOVQ SI, DI
    90      INCQ DI
    91      MOVB (AX), R8
    92      CMPB BX, R8
    93      CMOVQEQ DI, SI
    94      INCQ AX
    95  test:
    96      CMPQ AX, CX
    97      JNE loop
    98  done:
    99      MOVQ SI, ret+32(FP)
   100      RET