github.com/twelsh-aw/go/src@v0.0.0-20230516233729-a56fe86a7c81/internal/bytealg/count_amd64.s

github.com/twelsh-aw/go/src@v0.0.0-20230516233729-a56fe86a7c81/internal/bytealg/count_amd64.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "asm_amd64.h"
     7  #include "textflag.h"
     8  
     9  TEXT ·Count(SB),NOSPLIT,$0-40	// arguments: b (base, len), c; result: ret
    10  #ifndef hasPOPCNT
    11  	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1	// is the HasPOPCNT flag byte 1?
    12  	JEQ	2(PC)	// yes: skip the next instruction
    13  	JMP	·countGeneric(SB)	// no: continue in ·countGeneric
    14  #endif
    15  	LEAQ	ret+32(FP), R8	// R8 = address of the result slot
    16  	MOVB	c+24(FP), AL	// AL = byte to count
    17  	MOVQ	b_len+8(FP), BX	// BX = length of b
    18  	MOVQ	b_base+0(FP), SI	// SI = address of the first byte of b
    19  	JMP	countbody<>(SB)	// countbody writes the count through R8 and returns
    20  
    21  TEXT ·CountString(SB),NOSPLIT,$0-32	// arguments: s (base, len), c; result: ret
    22  #ifndef hasPOPCNT
    23  	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1	// is the HasPOPCNT flag byte 1?
    24  	JEQ	2(PC)	// yes: skip the next instruction
    25  	JMP	·countGenericString(SB)	// no: continue in ·countGenericString
    26  #endif
    27  	LEAQ	ret+24(FP), R8	// R8 = address of the result slot
    28  	MOVB	c+16(FP), AL	// AL = byte to count
    29  	MOVQ	s_len+8(FP), BX	// BX = length of s
    30  	MOVQ	s_base+0(FP), SI	// SI = address of the first byte of s
    31  	JMP	countbody<>(SB)	// countbody writes the count through R8 and returns
    32  
    33  // countbody counts the bytes equal to AL among the BX bytes starting at SI
    34  // and stores the count as a 64-bit value at the address held in R8.
    35  //
    36  // input:
    37  //   SI: data
    38  //   BX: data len
    39  //   AL: byte sought
    40  //   R8: address to put result
    41  // May write AX, BX, CX, DX, DI, R10, R11, R12, X0, X1 and Y1-Y3.
    42  // This function requires the POPCNT instruction.
    43  TEXT countbody<>(SB),NOSPLIT,$0
    44  	// Shuffle X0 around so that each byte contains
    45  	// the character we're looking for.
    46  	MOVD AX, X0
    47  	PUNPCKLBW X0, X0
    48  	PUNPCKLBW X0, X0
    49  	PSHUFL $0, X0, X0
    50  
    51  	// Inputs shorter than 16 bytes cannot use a plain 16-byte load.
    52  	CMPQ BX, $16
    53  	JLT small
    54  
    55  	MOVQ $0, R12 // Accumulator
    56  
    57  	MOVQ SI, DI // DI = address of the next chunk to examine
    58  
    59  	// Lengths above 32 try the AVX2 loop, which jumps back to sse
    60  	// when the AVX2 feature flag is not set.
    61  	CMPQ BX, $32
    62  	JA avx2
    63  sse:
    64  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
    65  	JMP	sseloopentry
    66  
    67  sseloop:
    68  	// Move the next 16-byte chunk of the data into X1.
    69  	MOVOU	(DI), X1
    70  	// Compare bytes in X0 to X1.
    71  	PCMPEQB	X0, X1
    72  	// Take the top bit of each byte in X1 and put the result in DX.
    73  	PMOVMSKB X1, DX
    74  	// Count number of matching bytes
    75  	POPCNTL DX, DX
    76  	// Accumulate into R12
    77  	ADDQ DX, R12
    78  	// Advance to next block.
    79  	ADDQ	$16, DI
    80  sseloopentry:
    81  	CMPQ	DI, AX
    82  	JBE	sseloop
    83  
    84  	// Get the number of bytes to consider in the last 16 bytes
    85  	ANDQ $15, BX
    86  	JZ end
    87  
    88  	// Create mask to ignore overlap between previous 16 byte block
    89  	// and the next: only the top BX bits of the 16-bit match mask survive.
    90  	MOVQ $16,CX
    91  	SUBQ BX, CX
    92  	MOVQ $0xFFFF, R10
    93  	SARQ CL, R10
    94  	SALQ CL, R10
    95  
    96  	// Process the last 16-byte chunk. This chunk may overlap with the
    97  	// chunks we've already searched so we need to mask part of it.
    98  	MOVOU	(AX), X1
    99  	PCMPEQB	X0, X1
   100  	PMOVMSKB X1, DX
   101  	// Apply mask
   102  	ANDQ R10, DX
   103  	POPCNTL DX, DX
   104  	ADDQ DX, R12
   105  end:
   106  	MOVQ R12, (R8)
   107  	RET
   108  
   109  // handle for lengths < 16
   110  small:
   111  	TESTQ	BX, BX
   112  	JEQ	endzero
   113  
   114  	// Check if we'll load across a page boundary.
   115  	// Bits 4-11 of SI+16 are all zero only when SI lies in the last
   116  	// 16 bytes of a 4096-byte-aligned region.
   117  	LEAQ	16(SI), AX
   118  	TESTW	$0xff0, AX
   119  	JEQ	endofpage
   120  
   121  	// We must ignore high bytes as they aren't part of our slice.
   122  	// Create mask with the low BX bits set: (1 << BX) - 1.
   123  	MOVB BX, CX
   124  	MOVQ $1, R10
   125  	SALQ CL, R10
   126  	SUBQ $1, R10
   127  
   128  	// Load data
   129  	MOVOU	(SI), X1
   130  	// Compare target byte with each byte in data.
   131  	PCMPEQB	X0, X1
   132  	// Move result bits to integer register.
   133  	PMOVMSKB X1, DX
   134  	// Apply mask
   135  	ANDQ R10, DX
   136  	POPCNTL DX, DX
   137  	// Directly return DX, we don't need to accumulate
   138  	// since we have <16 bytes.
   139  	MOVQ	DX, (R8)
   140  	RET
   141  endzero:
   142  	MOVQ $0, (R8)
   143  	RET
   144  
   145  endofpage:
   146  	// We must ignore low bytes as they aren't part of our slice.
   147  	MOVQ $16,CX
   148  	SUBQ BX, CX
   149  	MOVQ $0xFFFF, R10
   150  	SARQ CL, R10
   151  	SALQ CL, R10
   152  
   153  	// Load data into the high end of X1.
   154  	MOVOU	-16(SI)(BX*1), X1
   155  	// Compare target byte with each byte in data.
   156  	PCMPEQB	X0, X1
   157  	// Move result bits to integer register.
   158  	PMOVMSKB X1, DX
   159  	// Apply mask
   160  	ANDQ R10, DX
   161  	// Directly return DX, we don't need to accumulate
   162  	// since we have <16 bytes.
   163  	POPCNTL DX, DX
   164  	MOVQ	DX, (R8)
   165  	RET
   166  
   167  avx2:
   168  #ifndef hasAVX2
   169  	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
   170  	JNE sse
   171  #endif
   172  	// Broadcast the sought byte into every byte of Y1.
   173  	MOVD AX, X0
   174  	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
   175  	VPBROADCASTB  X0, Y1
   176  avx2_loop:
   177  	VMOVDQU (DI), Y2
   178  	VPCMPEQB Y1, Y2, Y3
   179  	VPMOVMSKB Y3, DX
   180  	POPCNTL DX, DX
   181  	ADDQ DX, R12
   182  	ADDQ $32, DI
   183  	// DI and R11 are addresses: compare them unsigned, like sseloop.
   184  	CMPQ DI, R11
   185  	JBE avx2_loop
   186  
   187  	// If the length is a multiple of 32 the loop has counted every
   188  	// byte, so skip the overlapping tail block. DI is always past R11
   189  	// here, so the remainder in BX is what tells the two cases apart.
   190  	ANDQ $31, BX
   191  	JZ endavx
   192  
   193  	// Load address of the last 32 bytes.
   194  	// There is an overlap with the previous block.
   195  	MOVQ R11, DI
   196  	VMOVDQU (DI), Y2
   197  	VPCMPEQB Y1, Y2, Y3
   198  	VPMOVMSKB Y3, DX
   199  	// Exit AVX mode.
   200  	VZEROUPPER
   201  
   202  	// Create mask to ignore overlap between previous 32 byte block
   203  	// and the next. BX already holds the 1-31 bytes still uncounted.
   204  	MOVQ $32,CX
   205  	SUBQ BX, CX
   206  	MOVQ $0xFFFFFFFF, R10
   207  	SARQ CL, R10
   208  	SALQ CL, R10
   209  	// Apply mask
   210  	ANDQ R10, DX
   211  	POPCNTL DX, DX
   212  	ADDQ DX, R12
   213  	MOVQ R12, (R8)
   214  	RET
   215  endavx:
   216  	// Exit AVX mode.
   217  	VZEROUPPER
   218  	MOVQ R12, (R8)
   219  	RET