github.com/bir3/gocompiler@v0.9.2202/src/internal/bytealg/indexbyte_amd64.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !plan9
     6  
     7  #include "go_asm.h"
     8  #include "textflag.h"
     9  
    10  TEXT	·IndexByte(SB), NOSPLIT, $0-40
        	// Byte-slice entry point. Load the arguments into the registers
        	// that indexbytebody<> expects, then jump (not call) into it, so
        	// the RET inside the body returns directly to our caller.
        	// The four loads below are independent of one another.
    11  	LEAQ ret+32(FP), R8	// R8 = address of the result slot
    12  	MOVB c+24(FP), AL	// AL = byte sought
    13  	MOVQ b_len+8(FP), BX	// BX = number of bytes to search
    14  	MOVQ b_base+0(FP), SI	// SI = start of the data
    15  	JMP  indexbytebody<>(SB)
    16  
    17  TEXT	·IndexByteString(SB), NOSPLIT, $0-32
        	// String entry point. Same register set-up as the byte-slice
        	// variant, but with c at offset 16 and ret at offset 24 of the
        	// argument frame. Control is handed over with a jump, so the RET
        	// inside indexbytebody<> returns directly to our caller.
    18  	LEAQ ret+24(FP), R8	// R8 = address of the result slot
    19  	MOVB c+16(FP), AL	// AL = byte sought
    20  	MOVQ s_len+8(FP), BX	// BX = number of bytes to search
    21  	MOVQ s_base+0(FP), SI	// SI = start of the data
    22  	JMP  indexbytebody<>(SB)
    23  
    24  // indexbytebody<> is the search routine that ·IndexByte and
        // ·IndexByteString jump into. It stores the index of the first byte
        // equal to AL within the BX bytes starting at SI, or -1 if there is
        // no such byte, into the 8-byte slot addressed by R8.
        //
        // input:
    25  //   SI: data
    26  //   BX: data len
    27  //   AL: byte sought
    28  //   R8: address to put result
        //
        // Registers written along the way: AX, CX, DX, DI, R11, X0, X1,
        // Y1-Y3 and the flags.
        //
        // Strategy by length:
        //   len == 0        -> -1 immediately (small)
        //   0 < len < 16    -> one 16-byte load, positioned so that it does
        //                      not run into the next page (small / endofpage)
        //   16 <= len <= 32 -> 16-byte chunks (sse)
        //   len > 32        -> 32-byte chunks when AVX2 is available (avx2),
        //                      otherwise 16-byte chunks (sse)
    29  TEXT	indexbytebody<>(SB), NOSPLIT, $0
    30  	// Shuffle X0 around so that each byte contains
    31  	// the character we're looking for.
    32  	MOVD AX, X0	// byte 0 of X0 = AL
    33  	PUNPCKLBW X0, X0	// bytes 0-1 of X0 = AL
    34  	PUNPCKLBW X0, X0	// bytes 0-3 of X0 = AL
    35  	PSHUFL $0, X0, X0	// replicate the low dword into all four dwords
    36  
    37  	CMPQ BX, $16
    38  	JLT small	// fewer than 16 bytes: a whole 16-byte chunk does not fit
    39  
    40  	MOVQ SI, DI	// DI = address of the chunk currently being searched
    41  
    42  	CMPQ BX, $32
    43  	JA avx2	// more than 32 bytes: try the 32-byte-per-step loop
    44  sse:
        	// 16-byte-chunk path. Reached with len >= 16 and DI == SI, either
        	// by falling through above or from the AVX2 availability check.
    45  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
    46  	JMP	sseloopentry
    47  
    48  	PCALIGN $16
    49  sseloop:
    50  	// Move the next 16-byte chunk of the data into X1.
    51  	MOVOU	(DI), X1
    52  	// Compare bytes in X0 to X1.
    53  	PCMPEQB	X0, X1
    54  	// Take the top bit of each byte in X1 and put the result in DX.
    55  	PMOVMSKB X1, DX
    56  	// Find first set bit, if any.
    57  	BSFL	DX, DX
    58  	JNZ	ssesuccess	// a bit was set: DX is the match offset in this chunk
    59  	// Advance to next block.
    60  	ADDQ	$16, DI
    61  sseloopentry:
    62  	CMPQ	DI, AX
    63  	JB	sseloop	// keep going while the chunk starts before the last one
    64  
    65  	// Search the last 16-byte chunk. This chunk may overlap with the
    66  	// chunks we've already searched, but that's ok.
    67  	MOVQ	AX, DI	// ssesuccess expects the chunk address in DI
    68  	MOVOU	(AX), X1
    69  	PCMPEQB	X0, X1
    70  	PMOVMSKB X1, DX
    71  	BSFL	DX, DX
    72  	JNZ	ssesuccess
    73  
        // No match (also the target for empty input and for the small paths).
    74  failure:
    75  	MOVQ $-1, (R8)
    76  	RET
    77  
    78  // We've found a chunk containing the byte.
    79  // The chunk was loaded from DI.
    80  // The index of the matching byte in the chunk is DX.
    81  // The start of the data is SI.
    82  ssesuccess:
    83  	SUBQ SI, DI	// Compute offset of chunk within data.
    84  	ADDQ DX, DI	// Add offset of byte within chunk.
    85  	MOVQ DI, (R8)
    86  	RET
    87  
    88  // handle for lengths < 16
    89  small:
    90  	TESTQ	BX, BX
    91  	JEQ	failure	// empty input: nothing can match
    92  
    93  	// Check if we'll load across a page boundary.
        	// Bits 4-11 of SI+16 are all zero exactly when SI lies in the last
        	// 16 bytes of a 4096-byte-aligned block. In that case a 16-byte
        	// load starting at SI could touch the following block, so the data
        	// is loaded ending at SI+len instead (endofpage).
    94  	LEAQ	16(SI), AX
    95  	TESTW	$0xff0, AX
    96  	JEQ	endofpage
    97  
        	// Safe to read 16 bytes from SI. Bytes past len are compared too,
        	// so a hit at an index >= len has to be rejected below.
    98  	MOVOU	(SI), X1 // Load data
    99  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
   100  	PMOVMSKB X1, DX	// Move result bits to integer register.
   101  	BSFL	DX, DX	// Find first set bit.
   102  	JZ	failure	// No set bit, failure.
   103  	CMPL	DX, BX
   104  	JAE	failure	// Match is past end of data.
   105  	MOVQ	DX, (R8)
   106  	RET
   107  
        // Lengths 1..15 with SI close to the end of a 4096-byte block: read the
        // 16 bytes that END at SI+len, so the load never extends past the data.
        // The data then sits in the top len bytes of X1, i.e. in mask bits
        // 16-len .. 15.
   108  endofpage:
   109  	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
   110  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
   111  	PMOVMSKB X1, DX	// Move result bits to integer register.
   112  	MOVL	BX, CX	// shift count = len
   113  	SHLL	CX, DX	// Move the data's mask bits up to bits 16 .. 15+len.
   114  	SHRL	$16, DX	// Shift desired bits down to bottom of register.
        	// The bits belonging to the bytes before SI ended up below bit 16
        	// and were discarded by the right shift, so no bounds check is
        	// needed here.
   115  	BSFL	DX, DX	// Find first set bit.
   116  	JZ	failure	// No set bit, failure.
   117  	MOVQ	DX, (R8)
   118  	RET
   119  
        // 32-byte-chunk path for len > 32. Entered with DI == SI.
        // If hasAVX2 is not defined when this file is assembled, the flag byte
        // compared below is checked at run time, and the 16-byte path is used
        // unless that byte equals 1.
   120  avx2:
   121  #ifndef hasAVX2
   122  	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
   123  	JNE sse
   124  #endif
   125  	MOVD AX, X0	// byte 0 of X0 = AL (AX still holds the byte sought)
   126  	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
   127  	VPBROADCASTB  X0, Y1	// every byte of Y1 = byte sought
   128  
   129  	PCALIGN $32
   130  avx2_loop:
   131  	VMOVDQU (DI), Y2	// load the next 32-byte chunk
   132  	VPCMPEQB Y1, Y2, Y3	// Y3 byte = 0xff where the chunk matches, else 0
   133  	VPTEST Y3, Y3	// ZF set only if Y3 is entirely zero
   134  	JNZ avx2success	// at least one byte matched in this chunk
   135  	ADDQ $32, DI
   136  	CMPQ DI, R11
        	// NOTE(review): this is a signed comparison of two addresses, while
        	// the 16-byte loop above uses the unsigned JB for the same test.
        	// The two agree as long as both addresses have the same top bit;
        	// confirm whether the difference is intentional.
   137  	JLT avx2_loop
        	// Search the last 32-byte chunk; like the 16-byte tail it may
        	// overlap chunks that were already searched.
   138  	MOVQ R11, DI	// avx2success expects the chunk address in DI
   139  	VMOVDQU (DI), Y2
   140  	VPCMPEQB Y1, Y2, Y3
   141  	VPTEST Y3, Y3
   142  	JNZ avx2success
   143  	VZEROUPPER	// clear the upper YMM halves before returning
   144  	MOVQ $-1, (R8)	// no match
   145  	RET
   146  
        // A 32-byte chunk loaded from DI contains the byte; Y3 holds the
        // per-byte comparison result and SI is the start of the data.
   147  avx2success:
   148  	VPMOVMSKB Y3, DX	// DX = 32-bit mask, one bit per byte of Y3
   149  	BSFL DX, DX	// DX = offset of the first match within the chunk
   150  	SUBQ SI, DI	// DI = offset of the chunk within the data
   151  	ADDQ DI, DX	// DX = index of the match within the data
   152  	MOVQ DX, (R8)
   153  	VZEROUPPER	// clear the upper YMM halves before returning
   154  	RET