github.com/primecitizens/pcz/std@v0.2.1/core/bytealg/indexbyte_amd64.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  // 
     4  // Copyright 2018 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && amd64
     9  
    10  #include "textflag.h"
    11  
    12  TEXT ·IndexSliceByte(SB),NOSPLIT,$0-40 // entry stub: load the registers indexbytebody<> expects, then jump to it
    13  	LEAQ ret+32(FP), R8 // R8 = address of the result slot (indexbytebody<> stores through it)
    14  	MOVB c+24(FP), AL // AL = byte sought; the rest of AX is left as it was
    15  	MOVQ b_len+8(FP), BX // BX = number of bytes to search
    16  	MOVQ b_base+0(FP), SI // SI = address of the first data byte
    17  	JMP  indexbytebody<>(SB) // jump, not call: the RET in indexbytebody<> returns to our caller
    18  
    19  TEXT ·IndexByte(SB),NOSPLIT,$0-32 // entry stub for the (s_base, s_len) argument pair; the search itself is in indexbytebody<>
    20  	LEAQ ret+24(FP), R8 // R8 = address of the result slot (indexbytebody<> stores through it)
    21  	MOVB c+16(FP), AL // AL = byte sought; the rest of AX is left as it was
    22  	MOVQ s_len+8(FP), BX // BX = number of bytes to search
    23  	MOVQ s_base+0(FP), SI // SI = address of the first data byte
    24  	JMP  indexbytebody<>(SB) // jump, not call: the RET in indexbytebody<> returns to our caller
    25  
    26  // indexbytebody<> is the search loop shared by ·IndexSliceByte and ·IndexByte.
        // It looks for the first byte equal to AL among the BX bytes starting at SI
        // and stores its index through R8, or -1 if no byte matches.
        // Both entry points JMP here, so each RET below returns to their caller.
        //
        // input:
    27  //   SI: data
    28  //   BX: data len
    29  //   AL: byte sought
    30  //   R8: address to put result
        //
        // May write AX, CX, DX, DI, R11, X0, X1, Y1-Y3 and the flags.
        // SI, BX and R8 are only read.
    31  TEXT indexbytebody<>(SB), NOSPLIT, $0
    32  	// Shuffle X0 around so that each byte contains
    33  	// the character we're looking for.
    34  	MOVD AX, X0 // byte 0 of X0 = AL; only that byte survives the shuffles below
    35  	PUNPCKLBW X0, X0 // byte 0 duplicated into bytes 0-1
    36  	PUNPCKLBW X0, X0 // ... and into bytes 0-3
    37  	PSHUFL $0, X0, X0 // low 32 bits copied to all four 32-bit lanes
    38  
    39  	CMPQ BX, $16
    40  	JLT small // fewer than 16 bytes: a plain 16-byte load could read past the data
    41  
    42  	MOVQ SI, DI // DI = cursor: address of the chunk currently being examined
    43  
    44  	CMPQ BX, $32
    45  	JA avx2 // more than 32 bytes: try the 32-byte-wide loop
    46  sse: // reached for 16..32 bytes, or from avx2 when the hasAVX2 check fails
    47  	LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
    48  	JMP sseloopentry // test the loop condition before the first load
    49  
    50  sseloop:
    51  	// Move the next 16-byte chunk of the data into X1.
    52  	MOVOU (DI), X1
    53  	// Compare bytes in X0 to X1.
    54  	PCMPEQB X0, X1
    55  	// Take the top bit of each byte in X1 and put the result in DX.
    56  	PMOVMSKB X1, DX
    57  	// Find first set bit, if any.
    58  	BSFL DX, DX
    59  	JNZ ssesuccess // BSF leaves ZF clear when the mask was nonzero, i.e. some byte matched
    60  	// Advance to next block.
    61  	ADDQ $16, DI
    62  sseloopentry:
    63  	CMPQ DI, AX
    64  	JB sseloop // unsigned: keep going while the cursor is below the last-16-bytes address
    65  
    66  	// Search the last 16-byte chunk. This chunk may overlap with the
    67  	// chunks we've already searched, but that's ok.
    68  	MOVQ AX, DI // ssesuccess derives the index from DI, so record which chunk this is
    69  	MOVOU (AX), X1
    70  	PCMPEQB X0, X1
    71  	PMOVMSKB X1, DX
    72  	BSFL DX, DX
    73  	JNZ ssesuccess
    74  
    75  failure:
    76  	MOVQ $-1, (R8) // result = -1: byte not found
    77  	RET
    78  
    79  // We've found a chunk containing the byte.
    80  // The chunk was loaded from DI.
    81  // The index of the matching byte in the chunk is DX.
    82  // The start of the data is SI.
    83  ssesuccess:
    84  	SUBQ SI, DI // Compute offset of chunk within data.
    85  	ADDQ DX, DI // Add offset of byte within chunk.
    86  	MOVQ DI, (R8)
    87  	RET
    88  
    89  // handle for lengths < 16
    90  small:
    91  	TESTQ BX, BX
    92  	JEQ failure // zero length: nothing to search
    93  
    94  	// Check if we'll load across a page boundary.
    95  	LEAQ 16(SI), AX // AX = SI+16, the address just past a 16-byte load from SI
    96  	TESTW $0xff0, AX // bits 4-11 all zero <=> SI+16 falls in the first 16 bytes of a 4 KiB-aligned block
    97  	JEQ endofpage // so SI is in the last 16 bytes of the block before it: avoid the forward load
    98  
    99  	MOVOU (SI), X1 // Load data
   100  	PCMPEQB X0, X1 // Compare target byte with each byte in data.
   101  	PMOVMSKB X1, DX // Move result bits to integer register.
   102  	BSFL DX, DX // Find first set bit.
   103  	JZ failure // No set bit, failure.
   104  	CMPL DX, BX // the load covered 16 bytes but only the first BX belong to the data
   105  	JAE failure // Match is past end of data.
   106  	MOVQ DX, (R8) // the chunk starts at SI, so the chunk index is the result
   107  	RET
   108  
   109  endofpage:
        	// Load the 16 bytes that END at the last data byte instead of the 16 that
        	// start at SI. With a length of 1..15 this also reads up to 15 bytes in
        	// front of SI; their mask bits are discarded by the shifts below.
   110  	MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
   111  	PCMPEQB X0, X1 // Compare target byte with each byte in data.
   112  	PMOVMSKB X1, DX // Move result bits to integer register.
   113  	MOVL BX, CX // the variable shift count is taken from CX
   114  	SHLL CX, DX // data bytes occupy mask bits 16-BX..15; move them up to bits 16..15+BX
   115  	SHRL $16, DX // Shift desired bits down to bottom of register.
   116  	BSFL DX, DX // Find first set bit.
   117  	JZ failure // No set bit, failure.
   118  	MOVQ DX, (R8) // bit i now corresponds to data byte i, so DX is the result
   119  	RET
   120  
   121  avx2:
   122  	CMPB   ·hasAVX2(SB), $1 // hasAVX2 is defined outside this view; presumably a CPU-feature flag — TODO confirm
   123  	JNE sse // not equal to 1: use the 16-byte loop instead
   124  	MOVD AX, X0 // byte 0 of X0 = AL again (AX has not been modified on this path)
   125  	LEAQ -32(SI)(BX*1), R11 // R11 = address of last 32 bytes
   126  	VPBROADCASTB  X0, Y1 // Y1 = the sought byte in all 32 byte lanes
   127  avx2_loop:
   128  	VMOVDQU (DI), Y2 // load the next 32-byte chunk
   129  	VPCMPEQB Y1, Y2, Y3 // Y3 byte = 0xff where the chunk byte equals the sought byte, else 0
   130  	VPTEST Y3, Y3 // ZF set only if Y3 is all zero (no match in this chunk)
   131  	JNZ avx2success
   132  	ADDQ $32, DI // advance to the next chunk
   133  	CMPQ DI, R11
   134  	JLT avx2_loop // NOTE(review): signed compare of two addresses; the SSE loop above uses unsigned JB
   135  	MOVQ R11, DI // last 32-byte chunk; may overlap chunks already searched
   136  	VMOVDQU (DI), Y2
   137  	VPCMPEQB Y1, Y2, Y3
   138  	VPTEST Y3, Y3
   139  	JNZ avx2success
   140  	VZEROUPPER // leave AVX state clean before returning to code that may use legacy SSE
   141  	MOVQ $-1, (R8) // result = -1: byte not found
   142  	RET
   143  
   144  avx2success:
   145  	VPMOVMSKB Y3, DX // DX = 32-bit mask, bit i set when byte i of the chunk matched
   146  	BSFL DX, DX // DX = index of the first match in the chunk (mask is nonzero here)
   147  	SUBQ SI, DI // DI = offset of the chunk within the data
   148  	ADDQ DI, DX // DX = index of the match within the data
   149  	MOVQ DX, (R8)
   150  	VZEROUPPER // leave AVX state clean before returning to code that may use legacy SSE
   151  	RET