github.com/sandwichdev/go-internals@v0.0.0-20210605002614-12311ac6b2c5/bytealg/count_amd64.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  TEXT ·Count(SB),NOSPLIT,$0-40
     9  	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
    10  	JEQ	2(PC)
    11  	JMP	·countGeneric(SB)
    12  	MOVQ	b_base+0(FP), SI
    13  	MOVQ	b_len+8(FP), BX
    14  	MOVB	c+24(FP), AL
    15  	LEAQ	ret+32(FP), R8
    16  	JMP	countbody<>(SB)
    17  
    18  TEXT ·CountString(SB),NOSPLIT,$0-32
    19  	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
    20  	JEQ	2(PC)
    21  	JMP	·countGenericString(SB)
    22  	MOVQ	s_base+0(FP), SI
    23  	MOVQ	s_len+8(FP), BX
    24  	MOVB	c+16(FP), AL
    25  	LEAQ	ret+24(FP), R8
    26  	JMP	countbody<>(SB)
    27  
    28  // input:
    29  //   SI: data
    30  //   BX: data len
    31  //   AL: byte sought
    32  //   R8: address to put result
    33  // This function requires the POPCNT instruction.
    34  TEXT countbody<>(SB),NOSPLIT,$0
    35  	// Shuffle X0 around so that each byte contains
    36  	// the character we're looking for.
    37  	MOVD AX, X0
    38  	PUNPCKLBW X0, X0
    39  	PUNPCKLBW X0, X0
    40  	PSHUFL $0, X0, X0
    41  
    42  	CMPQ BX, $16
    43  	JLT small
    44  
    45  	MOVQ $0, R12 // Accumulator
    46  
    47  	MOVQ SI, DI
    48  
    49  	CMPQ BX, $32
    50  	JA avx2
    51  sse:
    52  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
    53  	JMP	sseloopentry
    54  
    55  sseloop:
    56  	// Move the next 16-byte chunk of the data into X1.
    57  	MOVOU	(DI), X1
    58  	// Compare bytes in X0 to X1.
    59  	PCMPEQB	X0, X1
    60  	// Take the top bit of each byte in X1 and put the result in DX.
    61  	PMOVMSKB X1, DX
    62  	// Count number of matching bytes
    63  	POPCNTL DX, DX
    64  	// Accumulate into R12
    65  	ADDQ DX, R12
    66  	// Advance to next block.
    67  	ADDQ	$16, DI
    68  sseloopentry:
    69  	CMPQ	DI, AX
    70  	JBE	sseloop
    71  
    72  	// Get the number of bytes to consider in the last 16 bytes
    73  	ANDQ $15, BX
    74  	JZ end
    75  
    76  	// Create mask to ignore overlap between previous 16 byte block
    77  	// and the next.
    78  	MOVQ $16,CX
    79  	SUBQ BX, CX
    80  	MOVQ $0xFFFF, R10
    81  	SARQ CL, R10
    82  	SALQ CL, R10
    83  
    84  	// Process the last 16-byte chunk. This chunk may overlap with the
    85  	// chunks we've already searched so we need to mask part of it.
    86  	MOVOU	(AX), X1
    87  	PCMPEQB	X0, X1
    88  	PMOVMSKB X1, DX
    89  	// Apply mask
    90  	ANDQ R10, DX
    91  	POPCNTL DX, DX
    92  	ADDQ DX, R12
    93  end:
    94  	MOVQ R12, (R8)
    95  	RET
    96  
    97  // handle for lengths < 16
    98  small:
    99  	TESTQ	BX, BX
   100  	JEQ	endzero
   101  
   102  	// Check if we'll load across a page boundary.
   103  	LEAQ	16(SI), AX
   104  	TESTW	$0xff0, AX
   105  	JEQ	endofpage
   106  
   107  	// We must ignore high bytes as they aren't part of our slice.
   108  	// Create mask.
   109  	MOVB BX, CX
   110  	MOVQ $1, R10
   111  	SALQ CL, R10
   112  	SUBQ $1, R10
   113  
   114  	// Load data
   115  	MOVOU	(SI), X1
   116  	// Compare target byte with each byte in data.
   117  	PCMPEQB	X0, X1
   118  	// Move result bits to integer register.
   119  	PMOVMSKB X1, DX
   120  	// Apply mask
   121  	ANDQ R10, DX
   122  	POPCNTL DX, DX
   123  	// Directly return DX, we don't need to accumulate
   124  	// since we have <16 bytes.
   125  	MOVQ	DX, (R8)
   126  	RET
   127  endzero:
   128  	MOVQ $0, (R8)
   129  	RET
   130  
   131  endofpage:
   132  	// We must ignore low bytes as they aren't part of our slice.
   133  	MOVQ $16,CX
   134  	SUBQ BX, CX
   135  	MOVQ $0xFFFF, R10
   136  	SARQ CL, R10
   137  	SALQ CL, R10
   138  
   139  	// Load data into the high end of X1.
   140  	MOVOU	-16(SI)(BX*1), X1
   141  	// Compare target byte with each byte in data.
   142  	PCMPEQB	X0, X1
   143  	// Move result bits to integer register.
   144  	PMOVMSKB X1, DX
   145  	// Apply mask
   146  	ANDQ R10, DX
   147  	// Directly return DX, we don't need to accumulate
   148  	// since we have <16 bytes.
   149  	POPCNTL DX, DX
   150  	MOVQ	DX, (R8)
   151  	RET
   152  
   153  avx2:
   154  	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
   155  	JNE sse
   156  	MOVD AX, X0
   157  	LEAQ -32(SI)(BX*1), R11
   158  	VPBROADCASTB  X0, Y1
   159  avx2_loop:
   160  	VMOVDQU (DI), Y2
   161  	VPCMPEQB Y1, Y2, Y3
   162  	VPMOVMSKB Y3, DX
   163  	POPCNTL DX, DX
   164  	ADDQ DX, R12
   165  	ADDQ $32, DI
   166  	CMPQ DI, R11
   167  	JLE avx2_loop
   168  
   169  	// If last block is already processed,
   170  	// skip to the end.
   171  	CMPQ DI, R11
   172  	JEQ endavx
   173  
   174  	// Load address of the last 32 bytes.
   175  	// There is an overlap with the previous block.
   176  	MOVQ R11, DI
   177  	VMOVDQU (DI), Y2
   178  	VPCMPEQB Y1, Y2, Y3
   179  	VPMOVMSKB Y3, DX
   180  	// Exit AVX mode.
   181  	VZEROUPPER
   182  
   183  	// Create mask to ignore overlap between previous 32 byte block
   184  	// and the next.
   185  	ANDQ $31, BX
   186  	MOVQ $32,CX
   187  	SUBQ BX, CX
   188  	MOVQ $0xFFFFFFFF, R10
   189  	SARQ CL, R10
   190  	SALQ CL, R10
   191  	// Apply mask
   192  	ANDQ R10, DX
   193  	POPCNTL DX, DX
   194  	ADDQ DX, R12
   195  	MOVQ R12, (R8)
   196  	RET
   197  endavx:
   198  	// Exit AVX mode.
   199  	VZEROUPPER
   200  	MOVQ R12, (R8)
   201  	RET