// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Go (Plan 9 syntax) assembly for amd64.
//
// Two exported entry points load their arguments from the frame and
// tail-jump into a shared search routine, indexbytebody<>, which writes
// the result directly into the caller's return slot.

#include "go_asm.h"
#include "textflag.h"

// IndexByte searches a byte slice for a single byte.
//
// Frame layout used here (40 bytes of arguments/results, no locals):
//   b_base+0(FP)  pointer to the first byte of the slice
//   b_len+8(FP)   number of bytes in the slice
//   c+24(FP)      byte to search for
//   ret+32(FP)    result slot, filled in by indexbytebody<>
// NOTE(review): this layout matches a Go prototype of the form
// func IndexByte(b []byte, c byte) int; confirm against the Go declaration.
TEXT	·IndexByte(SB), NOSPLIT, $0-40
	MOVQ	b_base+0(FP), SI
	MOVQ	b_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	indexbytebody<>(SB)

// IndexByteString searches a string for a single byte.
//
// Frame layout used here (32 bytes of arguments/results, no locals):
//   s_base+0(FP)  pointer to the first byte of the string
//   s_len+8(FP)   number of bytes in the string
//   c+16(FP)      byte to search for
//   ret+24(FP)    result slot, filled in by indexbytebody<>
// NOTE(review): this layout matches a Go prototype of the form
// func IndexByteString(s string, c byte) int; confirm against the Go declaration.
TEXT	·IndexByteString(SB), NOSPLIT, $0-32
	MOVQ	s_base+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	indexbytebody<>(SB)

// indexbytebody is the shared search routine.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// output:
//   (R8) = offset from SI of the lowest-addressed matching byte,
//          or -1 if no byte in [SI, SI+BX) matches.
//
// Registers written: AX, CX, DX, DI, R11, X0, X1, and on the AVX2 path
// Y1, Y2, Y3.
//
// Strategy by length:
//   len == 0        -> -1
//   1 <= len < 16   -> one 16-byte load, masked down to the valid bytes
//   16 <= len <= 32 -> 16-byte SSE loop plus an overlapping final chunk
//   len > 32        -> 32-byte AVX2 loop when AVX2 is available,
//                      otherwise the SSE loop
TEXT	indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	// DI walks through the data; SI keeps pointing at the start so the
	// final offset can be computed as DI - SI.
	MOVQ	SI, DI

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Find first set bit, if any.
	// BSFL sets ZF when its source is zero, i.e. when nothing matched.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok: any match inside the
	// overlap would already have been reported by the loop above.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI	// Compute offset of chunk within data.
	ADDQ	DX, DI	// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	// Bits 4..11 of SI+16 are all zero exactly when SI lies in the last
	// 16 bytes of a 4096-byte-aligned region; in that case a 16-byte load
	// starting at SI could touch the following page.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1	// Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	// Load the 16 bytes that END at SI+BX instead, so the load never
	// reads past the end of the data. The valid bytes land in the top BX
	// lanes of X1; the lower lanes hold bytes that precede SI.
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	// Data byte j sits at mask bit (16-BX+j). Shifting left by BX and
	// then right by 16 moves it to bit j and discards the bits that
	// belong to bytes before SI.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	// Unless hasAVX2 is defined at assembly time, check the CPU feature
	// flag at run time and fall back to the SSE loop when AVX2 is absent.
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	sse
#endif
	// Re-seed X0 from AL and broadcast the byte to all 32 lanes of Y1.
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	// VPTEST sets ZF when Y3 is all zero, i.e. when no lane matched.
	VPTEST	Y3, Y3
	JNZ	avx2success
	ADDQ	$32, DI
	CMPQ	DI, R11
	// DI and R11 are addresses, so order them with an unsigned branch,
	// the same way the SSE loop does.
	JB	avx2_loop
	// Search the last 32-byte chunk; it may overlap chunks already seen.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	// Clear the upper halves of the YMM registers before returning to
	// code that may use legacy SSE instructions.
	VZEROUPPER
	MOVQ	$-1, (R8)
	RET

// A 32-byte chunk loaded from DI contains the byte; Y3 holds the
// per-lane comparison result.
avx2success:
	VPMOVMSKB	Y3, DX	// One mask bit per lane.
	BSFL	DX, DX	// Index of first matching lane.
	SUBQ	SI, DI	// Offset of chunk within data.
	ADDQ	DI, DX	// Add offset of byte within chunk.
	MOVQ	DX, (R8)
	VZEROUPPER
	RET