github.com/JimmyHuang454/JLS-go@v0.0.0-20230831150107-90d536585ba0/internal/bytealg/indexbyte_amd64.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  TEXT	·IndexByte(SB), NOSPLIT, $0-40	// no local frame; 40 bytes of arguments + result
        	// Entry point for searching a byte sequence described by b_base/b_len.
        	// It only loads the arguments into the registers indexbytebody<> expects
        	// (SI, BX, AL, R8) and jumps there; the body writes the result and returns.
        	// The slot at offset 16 is not read (presumably the slice capacity).
     9  	MOVQ b_base+0(FP), SI	// SI = address of the first byte to search
    10  	MOVQ b_len+8(FP), BX	// BX = number of bytes to search
    11  	MOVB c+24(FP), AL	// AL = byte sought
    12  	LEAQ ret+32(FP), R8	// R8 = address of the result slot, filled in by the body
    13  	JMP  indexbytebody<>(SB)	// tail jump: the body executes the RET
    14  
    15  TEXT	·IndexByteString(SB), NOSPLIT, $0-32	// no local frame; 32 bytes of arguments + result
        	// Entry point for searching data described by s_base/s_len.
        	// Same register setup as IndexByte, but the sought byte sits at offset 16
        	// and the result slot at offset 24. The search itself is done by
        	// indexbytebody<>, which writes the result and returns.
    16  	MOVQ s_base+0(FP), SI	// SI = address of the first byte to search
    17  	MOVQ s_len+8(FP), BX	// BX = number of bytes to search
    18  	MOVB c+16(FP), AL	// AL = byte sought
    19  	LEAQ ret+24(FP), R8	// R8 = address of the result slot, filled in by the body
    20  	JMP  indexbytebody<>(SB)	// tail jump: the body executes the RET
    21  
    22  // input:
    23  //   SI: data
    24  //   BX: data len
    25  //   AL: byte sought
    26  //   R8: address to put result
        // output:
        //   (R8): index of the first byte equal to AL, or -1 if there is none
        // Registers written here: AX, CX, DX, DI, R11, X0, X1, Y1-Y3 and the flags.
        // The path is chosen by length: BX < 16 goes to "small", BX > 32 goes to
        // "avx2" (which falls back to "sse" when AVX2 is unavailable), and
        // everything else uses "sse".
    27  TEXT	indexbytebody<>(SB), NOSPLIT, $0
    28  	// Shuffle X0 around so that each byte contains
    29  	// the character we're looking for.
    30  	MOVD AX, X0	// low 32 bits of AX -> X0; only the low byte (AL) is meaningful
    31  	PUNPCKLBW X0, X0	// interleave X0 with itself: byte 0 now fills bytes 0-1
    32  	PUNPCKLBW X0, X0	// again: byte 0 now fills bytes 0-3
    33  	PSHUFL $0, X0, X0	// copy the low 32-bit lane to all four lanes
    34  
    35  	CMPQ BX, $16
    36  	JLT small	// fewer than 16 bytes: a full 16-byte chunk does not fit in the data
    37  
    38  	MOVQ SI, DI	// DI = address of the chunk currently being examined
    39  
    40  	CMPQ BX, $32
    41  	JA avx2	// more than 32 bytes: try the 32-bytes-per-step path
    42  sse:
    43  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
        	// AX no longer holds the sought byte from here on; X0 already has it.
    44  	JMP	sseloopentry
    45  
    46  sseloop:
    47  	// Move the next 16-byte chunk of the data into X1.
    48  	MOVOU	(DI), X1
    49  	// Compare bytes in X0 to X1.
    50  	PCMPEQB	X0, X1
    51  	// Take the top bit of each byte in X1 and put the result in DX.
    52  	PMOVMSKB X1, DX
    53  	// Find first set bit, if any.
        	// BSFL leaves ZF set when DX was zero, so JNZ is taken only on a match.
    54  	BSFL	DX, DX
    55  	JNZ	ssesuccess
    56  	// Advance to next block.
    57  	ADDQ	$16, DI
    58  sseloopentry:
        	// Keep looping while the chunk at DI starts before the final chunk (unsigned compare).
    59  	CMPQ	DI, AX
    60  	JB	sseloop
    61  
    62  	// Search the last 16-byte chunk. This chunk may overlap with the
    63  	// chunks we've already searched, but that's ok.
    64  	MOVQ	AX, DI	// ssesuccess computes the index from DI, so point it at this chunk
    65  	MOVOU	(AX), X1
    66  	PCMPEQB	X0, X1
    67  	PMOVMSKB X1, DX
    68  	BSFL	DX, DX
    69  	JNZ	ssesuccess
    70  
        // Shared "not found" exit for the sse and small paths.
    71  failure:
    72  	MOVQ $-1, (R8)
    73  	RET
    74  
    75  // We've found a chunk containing the byte.
    76  // The chunk was loaded from DI.
    77  // The index of the matching byte in the chunk is DX.
    78  // The start of the data is SI.
    79  ssesuccess:
    80  	SUBQ SI, DI	// Compute offset of chunk within data.
    81  	ADDQ DX, DI	// Add offset of byte within chunk.
    82  	MOVQ DI, (R8)
    83  	RET
    84  
    85  // handle for lengths < 16
    86  small:
    87  	TESTQ	BX, BX
    88  	JEQ	failure	// Empty input: nothing to search.
    89  
    90  	// Check if we'll load across a page boundary.
        	// Bits 4-11 of SI+16 are all zero exactly when SI lies in the last
        	// 16 bytes before a 4096-byte-aligned address; in that case a 16-byte
        	// load starting at SI is avoided and endofpage is used instead.
    91  	LEAQ	16(SI), AX
    92  	TESTW	$0xff0, AX
    93  	JEQ	endofpage
    94  
        	// Load 16 bytes starting at SI. This reads past the end of the data
        	// (BX < 16), so a match found beyond index BX-1 is rejected below.
    95  	MOVOU	(SI), X1 // Load data
    96  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
    97  	PMOVMSKB X1, DX	// Move result bits to integer register.
    98  	BSFL	DX, DX	// Find first set bit.
    99  	JZ	failure	// No set bit, failure.
   100  	CMPL	DX, BX
   101  	JAE	failure	// Match is past end of data.
   102  	MOVQ	DX, (R8)
   103  	RET
   104  
        // SI is within 16 bytes of a 4096-byte boundary: load the 16 bytes that
        // END at the end of the data instead, so the load never extends past it.
        // The data then occupies the top BX bytes of X1; the bytes below it come
        // from memory just before SI and must be discarded.
   105  endofpage:
   106  	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
   107  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
   108  	PMOVMSKB X1, DX	// Move result bits to integer register.
   109  	MOVL	BX, CX	// Shift count = data length.
   110  	SHLL	CX, DX	// Move the data's mask bits up to bits 16..16+BX-1; pre-data bits stay below bit 16.
   111  	SHRL	$16, DX	// Shift desired bits down to bottom of register.
   112  	BSFL	DX, DX	// Find first set bit.
   113  	JZ	failure	// No set bit, failure.
   114  	MOVQ	DX, (R8)
   115  	RET
   116  
        // Path for lengths > 32: examine 32 bytes per step using Y registers.
   117  avx2:
        	// Unless hasAVX2 is defined at assembly time, check the CPU feature flag
        	// at run time and fall back to the 16-byte loop when it is not set.
        	// X0 still holds the 16-byte broadcast at this point, as sse requires.
   118  #ifndef hasAVX2
   119  	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
   120  	JNE sse
   121  #endif
   122  	MOVD AX, X0	// AX is still unmodified on this path, so its low byte is the sought byte
   123  	LEAQ -32(SI)(BX*1), R11	// R11 = address of the last 32 bytes of the data
   124  	VPBROADCASTB  X0, Y1	// Y1 = sought byte replicated into all 32 bytes
   125  avx2_loop:
   126  	VMOVDQU (DI), Y2	// Load the next 32-byte chunk.
   127  	VPCMPEQB Y1, Y2, Y3	// Y3 = 0xff in every byte position that matches, 0 elsewhere.
   128  	VPTEST Y3, Y3	// ZF is set only when Y3 is all zero (no match in this chunk).
   129  	JNZ avx2success
   130  	ADDQ $32, DI	// Advance to the next chunk.
   131  	CMPQ DI, R11
        	// NOTE(review): JLT is a signed comparison of two addresses, whereas the
        	// sse loop uses unsigned JB. The two agree while both addresses have the
        	// same sign bit - confirm that assumption holds for this target.
   132  	JLT avx2_loop
        	// Search the last 32-byte chunk; it may overlap chunks already searched.
   133  	MOVQ R11, DI	// avx2success computes the index from DI.
   134  	VMOVDQU (DI), Y2
   135  	VPCMPEQB Y1, Y2, Y3
   136  	VPTEST Y3, Y3
   137  	JNZ avx2success
   138  	VZEROUPPER	// Clear the upper halves of the Y registers before returning.
   139  	MOVQ $-1, (R8)	// Not found.
   140  	RET
   141  
        // A 32-byte chunk loaded from DI contains the byte; Y3 holds the match mask.
   142  avx2success:
   143  	VPMOVMSKB Y3, DX	// One mask bit per byte of Y3.
   144  	BSFL DX, DX	// Index of the first matching byte within the chunk.
   145  	SUBQ SI, DI	// Offset of the chunk within the data.
   146  	ADDQ DI, DX	// Add chunk offset to the in-chunk index.
   147  	MOVQ DX, (R8)
   148  	VZEROUPPER	// Clear the upper halves of the Y registers before returning.
   149  	RET
   148  	VZEROUPPER
   149  	RET