// File: github.com/bir3/gocompiler@v0.9.2202/src/internal/bytealg/indexbyte_amd64.s
//
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"

// IndexByte stores at ret+32(FP) the index of the first byte equal to c
// within the b_len bytes that start at b_base, or -1 if no byte matches.
//
// The Go-side declaration is not part of this file. The argument layout
// (base, len, one unread word, byte, result) presumably corresponds to
//	func IndexByte(b []byte, c byte) int
// -- confirm against the package's Go source.
TEXT	·IndexByte(SB), NOSPLIT, $0-40
	MOVQ	b_base+0(FP), SI	// SI = start of data
	MOVQ	b_len+8(FP), BX		// BX = number of bytes to search
	MOVB	c+24(FP), AL		// AL = byte sought
	LEAQ	ret+32(FP), R8		// R8 = address of the result slot
	// Tail jump: indexbytebody writes the result through R8 and
	// executes the RET on our behalf.
	JMP	indexbytebody<>(SB)

// IndexByteString stores at ret+24(FP) the index of the first byte equal
// to c within the s_len bytes that start at s_base, or -1 if no byte
// matches.
//
// The Go-side declaration is not part of this file. The argument layout
// (base, len, byte, result) presumably corresponds to
//	func IndexByteString(s string, c byte) int
// -- confirm against the package's Go source.
TEXT	·IndexByteString(SB), NOSPLIT, $0-32
	MOVQ	s_base+0(FP), SI	// SI = start of data
	MOVQ	s_len+8(FP), BX		// BX = number of bytes to search
	MOVB	c+16(FP), AL		// AL = byte sought
	LEAQ	ret+24(FP), R8		// R8 = address of the result slot
	// Tail jump: indexbytebody writes the result through R8 and
	// executes the RET on our behalf.
	JMP	indexbytebody<>(SB)

// indexbytebody is the search routine shared by the two entry points
// above.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// output:
//   (R8): index of the first byte in data equal to AL, or -1 if none
//
// Registers written: AX, CX, DX, DI, R11, X0, X1, Y1, Y2, Y3 and flags.
//
// Strategy, selected by length:
//   len < 16          -> "small": one 16-byte load, positioned so that it
//                        does not run past the data into the next page.
//   16 <= len <= 32   -> "sse": 16-byte chunks.
//   len > 32          -> "avx2": 32-byte chunks, falling back to "sse"
//                        when the AVX2 check fails.
// In both chunked paths the final chunk is anchored at the end of the
// data and may overlap the previous one.
TEXT	indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	SI, DI			// DI = address of the current chunk

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

	PCALIGN	$16
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Find first set bit, if any. BSFL sets ZF when DX is zero,
	// so JNZ is taken exactly when some byte matched.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

	// We've found a chunk containing the byte.
	// The chunk was loaded from DI.
	// The index of the matching byte in the chunk is DX.
	// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI			// Compute offset of chunk within data.
	ADDQ	DX, DI			// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

	// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure			// Empty input never matches.

	// Check if we'll load across a page boundary.
	// Bits 4..11 of SI+16 are all zero exactly when SI lies in the
	// last 16 bytes of a 4096-byte window, i.e. when a 16-byte load
	// starting at SI could touch the following page.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1		// Load data
	PCMPEQB	X0, X1			// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX		// Move result bits to integer register.
	BSFL	DX, DX			// Find first set bit.
	JZ	failure			// No set bit, failure.
	CMPL	DX, BX
	JAE	failure			// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	// Load the 16 bytes that END at the end of the data instead, so
	// the load never extends beyond the data on the high side. The
	// BX data bytes then occupy the top BX lanes of X1.
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1			// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX		// Move result bits to integer register.
	// Shift left by len, then right by 16: mask bits belonging to the
	// bytes in front of the data are discarded and the bit for data
	// byte i ends up at bit i.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX			// Shift desired bits down to bottom of register.
	BSFL	DX, DX			// Find first set bit.
	JZ	failure			// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	// Unless hasAVX2 is defined at assembly time, test the CPU
	// feature flag and use the SSE path when it is not set.
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	sse
#endif
	// AX has not been modified on this path, so AL is still the
	// byte sought; broadcast it to all 32 lanes of Y1.
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1

	PCALIGN	$32
avx2_loop:
	VMOVDQU	(DI), Y2		// Load the next 32-byte chunk.
	VPCMPEQB	Y1, Y2, Y3	// Y3 lane = 0xff where the byte matches.
	VPTEST	Y3, Y3			// ZF is set only if no lane matched.
	JNZ	avx2success
	ADDQ	$32, DI
	CMPQ	DI, R11
	// DI and R11 are addresses, so use an unsigned condition, the
	// same way the SSE loop does with JB.
	JB	avx2_loop

	// Search the last 32-byte chunk; it may overlap chunks already
	// searched.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	VZEROUPPER			// Clear upper YMM halves before returning.
	MOVQ	$-1, (R8)
	RET

	// A match was found in the chunk loaded from DI; Y3 holds the
	// per-byte comparison result.
avx2success:
	VPMOVMSKB	Y3, DX		// One mask bit per byte of the chunk.
	BSFL	DX, DX			// DX = index of first match within chunk.
	SUBQ	SI, DI			// DI = offset of chunk within data.
	ADDQ	DI, DX			// DX = index of match within data.
	MOVQ	DX, (R8)
	VZEROUPPER			// Clear upper YMM halves before returning.
	RET