// Origin: github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/bytealg/indexbyte_amd64.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// ·IndexByte: entry stub for the byte-slice form.
// Reads b_base+0(FP), b_len+8(FP) and c+24(FP), passes the address of
// ret+32(FP) as the result slot, and tail-jumps to indexbytebody<>.
// NOTE(review): the Go declaration is outside this file; presumably
// func IndexByte(b []byte, c byte) int -- confirm against the package.
TEXT ·IndexByte(SB), NOSPLIT, $0-40
	MOVQ	b_base+0(FP), SI	// SI = start of data
	MOVQ	b_len+8(FP), BX		// BX = length in bytes
	MOVB	c+24(FP), AL		// AL = byte sought
	LEAQ	ret+32(FP), R8		// R8 = where the result is stored
	JMP	indexbytebody<>(SB)

// ·IndexByteString: entry stub for the string form.
// Reads s_base+0(FP), s_len+8(FP) and c+16(FP), passes the address of
// ret+24(FP) as the result slot, and tail-jumps to indexbytebody<>.
// NOTE(review): presumably func IndexByteString(s string, c byte) int --
// confirm against the Go declaration.
TEXT ·IndexByteString(SB), NOSPLIT, $0-32
	MOVQ	s_base+0(FP), SI	// SI = start of data
	MOVQ	s_len+8(FP), BX		// BX = length in bytes
	MOVB	c+16(FP), AL		// AL = byte sought
	LEAQ	ret+24(FP), R8		// R8 = where the result is stored
	JMP	indexbytebody<>(SB)

// indexbytebody<>: shared search routine.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// output:
//   (R8) = offset from SI of the first byte equal to AL, or -1 if none.
//
// Registers written in this routine: AX, CX, DX, DI, R11, X0, X1,
// Y1-Y3, flags.
//
// Paths:
//   len < 16   -> small (single 16-byte load, with a page-boundary guard)
//   len <= 32  -> sse   (16-byte chunks plus an overlapping final chunk)
//   len > 32   -> avx2  (32-byte chunks) when the CPU flag is set, else sse
TEXT indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	SI, DI			// DI = cursor over the data

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Find first set bit, if any. BSF sets ZF when the mask is zero,
	// so JNZ is taken exactly when a match exists.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI			// Compute offset of chunk within data.
	ADDQ	DX, DI			// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure			// Empty input: nothing to find.

	// Check if we'll load across a page boundary.
	// Bits 4-11 of SI+16 are all zero only when SI lies in the last
	// 16 bytes of a 4096-byte page, where a 16-byte load from SI could
	// run past the end of that page.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1		// Load data
	PCMPEQB	X0, X1			// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX		// Move result bits to integer register.
	BSFL	DX, DX			// Find first set bit.
	JZ	failure			// No set bit, failure.
	CMPL	DX, BX
	JAE	failure			// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	// Load the 16 bytes that END at SI+BX instead, so the load does not
	// extend past the data. The data then occupies the top BX bytes of X1.
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1			// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX		// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX			// Mask bit (16-BX) -> bit 16.
	SHRL	$16, DX			// Shift desired bits down to bottom of register.
	BSFL	DX, DX			// Find first set bit.
	JZ	failure			// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	// Fall back to the SSE path when the AVX2 feature byte is not 1.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	sse
	MOVD	AX, X0			// AL still holds the byte sought on this path.
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1		// Y1 = byte sought in every lane
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3			// ZF clear <=> some byte matched.
	JNZ	avx2success
	ADDQ	$32, DI
	CMPQ	DI, R11
	// DI and R11 are addresses, so the bound check is unsigned,
	// matching the JB used by sseloopentry.
	JB	avx2_loop
	// Search the last 32-byte chunk; it may overlap chunks already searched.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	VZEROUPPER			// Clear upper YMM state before returning.
	MOVQ	$-1, (R8)
	RET

// A 32-byte chunk loaded from DI contains the byte; Y3 holds the match mask.
avx2success:
	VPMOVMSKB	Y3, DX		// One mask bit per byte of the chunk.
	BSFL	DX, DX			// DX = index of first match within the chunk.
	SUBQ	SI, DI			// DI = offset of chunk within data.
	ADDQ	DI, DX			// DX = offset of match within data.
	MOVQ	DX, (R8)
	VZEROUPPER
	RET