github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/bytealg/indexbyte_amd64p32.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  TEXT ·IndexByte(SB),NOSPLIT,$0-20 // wrapper: load args into the registers indexbytebody<> expects, store its result
     9  	MOVB c+12(FP), AL          // AL = byte sought
    10  	MOVL b_len+4(FP), BX       // BX = data len
    11  	MOVL b_base+0(FP), SI      // SI = data
    12  	CALL indexbytebody<>(SB)   // leaves the result in AX
    13  	MOVL AX, ret+16(FP)        // hand the result back to the caller
    14  	RET
    15  
    16  TEXT ·IndexByteString(SB),NOSPLIT,$0-20 // wrapper: load args into the registers indexbytebody<> expects, store its result
    17  	MOVB c+8(FP), AL           // AL = byte sought
    18  	MOVL s_len+4(FP), BX       // BX = data len
    19  	MOVL s_base+0(FP), SI      // SI = data
    20  	CALL indexbytebody<>(SB)   // leaves the result in AX
    21  	MOVL AX, ret+16(FP)        // hand the result back to the caller
    22  	RET
    23  
    24  // input:
    25  //   SI: address of the first byte to search
    26  //   BX: number of bytes to search
    27  //   AL: byte sought (the remaining bits of AX are ignored)
    28  // output:
    29  //   AX: index of the first matching byte, or -1 if there is none
    30  TEXT indexbytebody<>(SB),NOSPLIT,$0 // overwrites CX, DX, DI, R11, X0, X1 and flags; SI and BX are left intact
    31  	MOVL SI, DI // DI is the scan cursor; SI keeps the start address for the final index math
    32  
    33  	CMPL BX, $16
    34  	JLT small // fewer than 16 bytes: skip the SSE path entirely
    35  
    36  	// round up to first 16-byte boundary
    37  	TESTL $15, SI
    38  	JZ aligned // already on a boundary, no head to scan
    39  	MOVL SI, CX
    40  	ANDL $~15, CX
    41  	ADDL $16, CX // CX = first 16-byte boundary above SI
    42  
    43  	// search the beginning
    44  	SUBL SI, CX // CX = number of bytes before that boundary (1..15)
    45  	REPN; SCASB // compare AL with (DI), advancing DI, until a match or CX reaches 0
    46  	JZ success // ZF set: the last byte compared was a match
    47  
    48  // DI is 16-byte aligned; get ready to search using SSE instructions
    49  aligned:
    50  	// round down to last 16-byte boundary
    51  	MOVL BX, R11
    52  	ADDL SI, R11
    53  	ANDL $~15, R11 // R11 = end of data (SI+BX) rounded down to 16 bytes
    54  
    55  	// shuffle X0 around so that each byte contains c
    56  	MOVD AX, X0
    57  	PUNPCKLBW X0, X0 // low byte now fills the low 2 bytes
    58  	PUNPCKLBW X0, X0 // low byte now fills the low 4 bytes
    59  	PSHUFL $0, X0, X0 // copy the low 4 bytes into all four 32-bit lanes
    60  	JMP condition // test the loop condition before the first chunk
    61  
    62  sse:
    63  	// move the next 16-byte chunk of the buffer into X1
    64  	MOVO (DI), X1
    65  	// compare bytes in X0 to X1
    66  	PCMPEQB X0, X1
    67  	// take the top bit of each byte in X1 and put the result in DX
    68  	PMOVMSKB X1, DX
    69  	TESTL DX, DX
    70  	JNZ ssesuccess // a set bit means some byte in this chunk matched
    71  	ADDL $16, DI
    72  
    73  condition:
    74  	CMPL DI, R11
    75  	JNE sse // full 16-byte chunks remain
    76  
    77  	// search the end
    78  	MOVL SI, CX
    79  	ADDL BX, CX
    80  	SUBL R11, CX // CX = bytes left after the last full chunk (0..15)
    81  	// if CX == 0, SUBL leaves the zero flag set and REPN; SCASB would not
    82  	// run at all, so JZ success would report a false match; bail out first
    83  	JZ failure
    84  	REPN; SCASB
    85  	JZ success
    86  
    87  failure:
    88  	MOVL $-1, AX
    89  	RET
    90  
    91  // handle lengths < 16
    92  small:
    93  	MOVL BX, CX
    94  	REPN; SCASB // with BX == 0 nothing is scanned and ZF is still clear from the CMPL BX, $16 above
    95  	JZ success
    96  	MOVL $-1, AX
    97  	RET
    98  
    99  // we've found the chunk containing the byte
   100  // now just figure out which specific byte it is
   101  ssesuccess:
   102  	// get the index of the least significant set bit
   103  	BSFW DX, DX // DX = offset of the first match inside the chunk
   104  	SUBL SI, DI // DI = offset of the chunk from the start of the data
   105  	ADDL DI, DX
   106  	MOVL DX, AX
   107  	RET
   108  
   109  success:
   110  	SUBL SI, DI
   111  	SUBL $1, DI // SCASB left DI one past the matching byte
   112  	MOVL DI, AX
   113  	RET