github.com/primecitizens/pcz/std@v0.2.1/core/bytealg/count_amd64.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  // 
     4  // Copyright 2018 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && amd64
     9  
    10  #include "textflag.h"
    11  
    12  TEXT ·CountSlice(SB),NOSPLIT,$0-40
    13  	CMPB ·hasPOPCNT(SB), $1
    14  	JEQ 2(PC)
    15  	JMP ·countGeneric(SB)
    16  	MOVQ b_base+0(FP), SI
    17  	MOVQ b_len+8(FP), BX
    18  	MOVB c+24(FP), AL
    19  	LEAQ ret+32(FP), R8
    20  	JMP countbody<>(SB)
    21  
    22  TEXT ·Count(SB),NOSPLIT,$0-32
    23  	CMPB ·hasPOPCNT(SB), $1
    24  	JEQ 2(PC)
    25  	JMP ·countGenericString(SB)
    26  	MOVQ s_base+0(FP), SI
    27  	MOVQ s_len+8(FP), BX
    28  	MOVB c+16(FP), AL
    29  	LEAQ ret+24(FP), R8
    30  	JMP countbody<>(SB)
    31  
    32  // input:
    33  //   SI: data
    34  //   BX: data len
    35  //   AL: byte sought
    36  //   R8: address to put result
    37  // This function requires the POPCNT instruction.
    38  TEXT countbody<>(SB),NOSPLIT,$0
    39  	// Shuffle X0 around so that each byte contains
    40  	// the character we're looking for.
    41  	MOVD AX, X0
    42  	PUNPCKLBW X0, X0
    43  	PUNPCKLBW X0, X0
    44  	PSHUFL $0, X0, X0
    45  
    46  	CMPQ BX, $16
    47  	JLT small
    48  
    49  	MOVQ $0, R12 // Accumulator
    50  
    51  	MOVQ SI, DI
    52  
    53  	CMPQ BX, $32
    54  	JA avx2
    55  sse:
    56  	LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
    57  	JMP sseloopentry
    58  
    59  sseloop:
    60  	// Move the next 16-byte chunk of the data into X1.
    61  	MOVOU (DI), X1
    62  	// Compare bytes in X0 to X1.
    63  	PCMPEQB X0, X1
    64  	// Take the top bit of each byte in X1 and put the result in DX.
    65  	PMOVMSKB X1, DX
    66  	// Count number of matching bytes
    67  	POPCNTL DX, DX
    68  	// Accumulate into R12
    69  	ADDQ DX, R12
    70  	// Advance to next block.
    71  	ADDQ $16, DI
    72  sseloopentry:
    73  	CMPQ DI, AX
    74  	JBE sseloop
    75  
    76  	// Get the number of bytes to consider in the last 16 bytes
    77  	ANDQ $15, BX
    78  	JZ end
    79  
    80  	// Create mask to ignore overlap between previous 16 byte block
    81  	// and the next.
    82  	MOVQ $16,CX
    83  	SUBQ BX, CX
    84  	MOVQ $0xFFFF, R10
    85  	SARQ CL, R10
    86  	SALQ CL, R10
    87  
    88  	// Process the last 16-byte chunk. This chunk may overlap with the
    89  	// chunks we've already searched so we need to mask part of it.
    90  	MOVOU (AX), X1
    91  	PCMPEQB X0, X1
    92  	PMOVMSKB X1, DX
    93  	// Apply mask
    94  	ANDQ R10, DX
    95  	POPCNTL DX, DX
    96  	ADDQ DX, R12
    97  end:
    98  	MOVQ R12, (R8)
    99  	RET
   100  
   101  // handle for lengths < 16
   102  small:
   103  	TESTQ BX, BX
   104  	JEQ endzero
   105  
   106  	// Check if we'll load across a page boundary.
   107  	LEAQ 16(SI), AX
   108  	TESTW $0xff0, AX
   109  	JEQ endofpage
   110  
   111  	// We must ignore high bytes as they aren't part of our slice.
   112  	// Create mask.
   113  	MOVB BX, CX
   114  	MOVQ $1, R10
   115  	SALQ CL, R10
   116  	SUBQ $1, R10
   117  
   118  	// Load data
   119  	MOVOU (SI), X1
   120  	// Compare target byte with each byte in data.
   121  	PCMPEQB X0, X1
   122  	// Move result bits to integer register.
   123  	PMOVMSKB X1, DX
   124  	// Apply mask
   125  	ANDQ R10, DX
   126  	POPCNTL DX, DX
   127  	// Directly return DX, we don't need to accumulate
   128  	// since we have <16 bytes.
   129  	MOVQ DX, (R8)
   130  	RET
   131  endzero:
   132  	MOVQ $0, (R8)
   133  	RET
   134  
   135  endofpage:
   136  	// We must ignore low bytes as they aren't part of our slice.
   137  	MOVQ $16,CX
   138  	SUBQ BX, CX
   139  	MOVQ $0xFFFF, R10
   140  	SARQ CL, R10
   141  	SALQ CL, R10
   142  
   143  	// Load data into the high end of X1.
   144  	MOVOU -16(SI)(BX*1), X1
   145  	// Compare target byte with each byte in data.
   146  	PCMPEQB X0, X1
   147  	// Move result bits to integer register.
   148  	PMOVMSKB X1, DX
   149  	// Apply mask
   150  	ANDQ R10, DX
   151  	// Directly return DX, we don't need to accumulate
   152  	// since we have <16 bytes.
   153  	POPCNTL DX, DX
   154  	MOVQ DX, (R8)
   155  	RET
   156  
   157  avx2:
   158  	CMPB   ·hasAVX2(SB), $1
   159  	JNE sse
   160  	MOVD AX, X0
   161  	LEAQ -32(SI)(BX*1), R11
   162  	VPBROADCASTB  X0, Y1
   163  avx2_loop:
   164  	VMOVDQU (DI), Y2
   165  	VPCMPEQB Y1, Y2, Y3
   166  	VPMOVMSKB Y3, DX
   167  	POPCNTL DX, DX
   168  	ADDQ DX, R12
   169  	ADDQ $32, DI
   170  	CMPQ DI, R11
   171  	JLE avx2_loop
   172  
   173  	// If last block is already processed,
   174  	// skip to the end.
   175  	CMPQ DI, R11
   176  	JEQ endavx
   177  
   178  	// Load address of the last 32 bytes.
   179  	// There is an overlap with the previous block.
   180  	MOVQ R11, DI
   181  	VMOVDQU (DI), Y2
   182  	VPCMPEQB Y1, Y2, Y3
   183  	VPMOVMSKB Y3, DX
   184  	// Exit AVX mode.
   185  	VZEROUPPER
   186  
   187  	// Create mask to ignore overlap between previous 32 byte block
   188  	// and the next.
   189  	ANDQ $31, BX
   190  	MOVQ $32,CX
   191  	SUBQ BX, CX
   192  	MOVQ $0xFFFFFFFF, R10
   193  	SARQ CL, R10
   194  	SALQ CL, R10
   195  	// Apply mask
   196  	ANDQ R10, DX
   197  	POPCNTL DX, DX
   198  	ADDQ DX, R12
   199  	MOVQ R12, (R8)
   200  	RET
   201  endavx:
   202  	// Exit AVX mode.
   203  	VZEROUPPER
   204  	MOVQ R12, (R8)
   205  	RET