github.com/hikaru7719/go@v0.0.0-20181025140707-c8b2ac68906a/src/internal/bytealg/indexbyte_amd64.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  TEXT	·IndexByte(SB), NOSPLIT, $0-40	// args: b (slice) at 0, c (byte) at 24; result at 32
     9  	LEAQ	ret+32(FP), R8		// R8 = address of the result slot
    10  	MOVB	c+24(FP), AL		// AL = byte sought
    11  	MOVQ	b_len+8(FP), BX		// BX = len(b)
    12  	MOVQ	b_base+0(FP), SI	// SI = start of b's data
    13  	JMP	indexbytebody<>(SB)	// shared body stores the result through R8 and returns
    14  
    15  TEXT	·IndexByteString(SB), NOSPLIT, $0-32	// args: s (string) at 0, c (byte) at 16; result at 24
    16  	LEAQ	ret+24(FP), R8		// R8 = address of the result slot
    17  	MOVB	c+16(FP), AL		// AL = byte sought
    18  	MOVQ	s_len+8(FP), BX		// BX = len(s)
    19  	MOVQ	s_base+0(FP), SI	// SI = start of s's data
    20  	JMP	indexbytebody<>(SB)	// shared body stores the result through R8 and returns
    21  
    22  	// Provide direct access to these functions from other packages.
    23  	// This is the equivalent of doing:
    24  	//     package bytes
    25  	//     func IndexByte(b []byte, c byte) int {
    26  	//         return bytealg.IndexByte(b, c)
    27  	//     }
    28  	// but involves no call overhead.
    29  	// TODO: remove this hack when midstack inlining is enabled?
    30  TEXT	bytes·IndexByte(SB), NOSPLIT, $0-40	// same frame layout as ·IndexByte
    31  	FUNCDATA $0, ·IndexByte·args_stackmap(SB)	// use ·IndexByte's argument stack map for this symbol
    32  	LEAQ	ret+32(FP), R8		// R8 = address of the result slot
    33  	MOVB	c+24(FP), AL		// AL = byte sought
    34  	MOVQ	b_len+8(FP), BX		// BX = len(b)
    35  	MOVQ	b_base+0(FP), SI	// SI = start of b's data
    36  	JMP	indexbytebody<>(SB)	// shared body stores the result through R8 and returns
    37  
    38  TEXT	strings·IndexByte(SB), NOSPLIT, $0-32	// same frame layout as ·IndexByteString
    39  	FUNCDATA $0, ·IndexByteString·args_stackmap(SB)	// use ·IndexByteString's argument stack map for this symbol
    40  	LEAQ	ret+24(FP), R8		// R8 = address of the result slot
    41  	MOVB	c+16(FP), AL		// AL = byte sought
    42  	MOVQ	s_len+8(FP), BX		// BX = len(s)
    43  	MOVQ	s_base+0(FP), SI	// SI = start of s's data
    44  	JMP	indexbytebody<>(SB)	// shared body stores the result through R8 and returns
    45  
    46  // Shared search body; the entry stubs in this file JMP here with these inputs:
    47  //   SI: data
    48  //   BX: data len
    49  //   AL: byte sought
    50  //   R8: address to put result (index of the first match, or -1 if none)
    51  TEXT	indexbytebody<>(SB), NOSPLIT, $0
    52  	// Shuffle X0 around so that each byte contains
    53  	// the character we're looking for.
    54  	MOVD AX, X0	// byte 0 of X0 = AL
    55  	PUNPCKLBW X0, X0	// bytes 0-1 of X0 = AL
    56  	PUNPCKLBW X0, X0	// bytes 0-3 of X0 = AL
    57  	PSHUFL $0, X0, X0	// copy the low 32 bits into all four 32-bit lanes
    58  
    59  	CMPQ BX, $16
    60  	JLT small	// fewer than 16 bytes: a full 16-byte chunk does not fit inside the data
    61  
    62  	MOVQ SI, DI	// DI = address of the chunk being searched
    63  
    64  	CMPQ BX, $32
    65  	JA avx2	// more than 32 bytes: try the 32-byte-per-step AVX2 loop
    66  sse:
    67  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
    68  	JMP	sseloopentry
    69  
    70  sseloop:
    71  	// Move the next 16-byte chunk of the data into X1.
    72  	MOVOU	(DI), X1
    73  	// Compare bytes in X0 to X1; equal bytes of X1 become 0xff, the rest 0x00.
    74  	PCMPEQB	X0, X1
    75  	// Take the top bit of each byte in X1 and put the result in DX.
    76  	PMOVMSKB X1, DX
    77  	// Find first set bit, if any. BSFL sets ZF when its source is zero.
    78  	BSFL	DX, DX
    79  	JNZ	ssesuccess	// ZF clear: DX holds the index of the match within the chunk
    80  	// Advance to next block.
    81  	ADDQ	$16, DI
    82  sseloopentry:
    83  	CMPQ	DI, AX
    84  	JB	sseloop	// unsigned: keep looping while DI is below the last chunk
    85  
    86  	// Search the last 16-byte chunk. This chunk may overlap with the
    87  	// chunks we've already searched, but that's ok.
    88  	MOVQ	AX, DI	// ssesuccess expects the chunk address in DI
    89  	MOVOU	(AX), X1
    90  	PCMPEQB	X0, X1
    91  	PMOVMSKB X1, DX
    92  	BSFL	DX, DX
    93  	JNZ	ssesuccess
    94  
    95  failure:
    96  	MOVQ $-1, (R8)	// not found: result = -1
    97  	RET
    98  
    99  // We've found a chunk containing the byte.
   100  // The chunk was loaded from DI.
   101  // The index of the matching byte in the chunk is DX.
   102  // The start of the data is SI.
   103  ssesuccess:
   104  	SUBQ SI, DI	// Compute offset of chunk within data.
   105  	ADDQ DX, DI	// Add offset of byte within chunk.
   106  	MOVQ DI, (R8)	// Store the index as the result.
   107  	RET
   108  
   109  // handle for lengths < 16
   110  small:
   111  	TESTQ	BX, BX
   112  	JEQ	failure	// empty input: nothing to search
   113  
   114  	// Check if we'll load across a page boundary.
   115  	LEAQ	16(SI), AX	// AX = SI+16
   116  	TESTW	$0xff0, AX	// bits 4-11 of SI+16 all zero <=> SI is within the last 16 bytes of a 4096-byte-aligned region
   117  	JEQ	endofpage
   118  
   119  	MOVOU	(SI), X1 // Load 16 bytes starting at the data; bytes past BX are not part of the data.
   120  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
   121  	PMOVMSKB X1, DX	// Move result bits to integer register.
   122  	BSFL	DX, DX	// Find first set bit.
   123  	JZ	failure	// No set bit, failure.
   124  	CMPL	DX, BX
   125  	JAE	failure	// Match is past end of data.
   126  	MOVQ	DX, (R8)	// Chunk starts at SI, so DX is already the index within the data.
   127  	RET
   128  
   129  endofpage:
   130  	MOVOU	-16(SI)(BX*1), X1	// Load the 16 bytes ending at SI+BX: data lands in the high end of X1.
   131  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
   132  	PMOVMSKB X1, DX	// Move result bits to integer register; data bits are the top BX bits of the low 16.
   133  	MOVL	BX, CX	// CX = data len, used as the shift count
   134  	SHLL	CX, DX	// Data bits now start at bit 16; bits for bytes before the data stay below bit 16.
   135  	SHRL	$16, DX	// Shift desired bits down to bottom of register, dropping the rest.
   136  	BSFL	DX, DX	// Find first set bit.
   137  	JZ	failure	// No set bit, failure.
   138  	MOVQ	DX, (R8)	// Bit i now corresponds to data byte i, so DX is the index.
   139  	RET
   140  
   141  avx2:
   142  	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
   143  	JNE sse	// flag byte is not 1: fall back to the 16-byte SSE loop
   144  	MOVD AX, X0	// byte 0 of X0 = AL (AX has not been modified yet on this path)
   145  	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
   146  	VPBROADCASTB  X0, Y1	// Y1 = sought byte replicated into all 32 bytes
   147  avx2_loop:
   148  	VMOVDQU (DI), Y2	// load the next 32-byte chunk
   149  	VPCMPEQB Y1, Y2, Y3	// Y3 bytes = 0xff where the chunk equals the sought byte
   150  	VPTEST Y3, Y3	// ZF set only when Y3 is all zero (no match)
   151  	JNZ avx2success
   152  	ADDQ $32, DI	// advance to the next chunk
   153  	CMPQ DI, R11
   154  	JLT avx2_loop	// NOTE(review): signed compare on addresses; the SSE loop uses unsigned JB for the same test
   155  	MOVQ R11, DI	// search the last 32 bytes; may overlap chunks already searched
   156  	VMOVDQU (DI), Y2
   157  	VPCMPEQB Y1, Y2, Y3
   158  	VPTEST Y3, Y3
   159  	JNZ avx2success
   160  	VZEROUPPER	// zero the upper halves of the Y registers before returning
   161  	MOVQ $-1, (R8)	// not found: result = -1
   162  	RET
   163  
   164  avx2success:
   165  	VPMOVMSKB Y3, DX	// DX = one bit per byte of the matching chunk
   166  	BSFL DX, DX	// DX = index of the first match within the chunk
   167  	SUBQ SI, DI	// DI = offset of the chunk within the data
   168  	ADDQ DI, DX	// DX = index of the match within the data
   169  	MOVQ DX, (R8)
   170  	VZEROUPPER	// zero the upper halves of the Y registers before returning
   171  	RET