// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// ·IndexByte is the entry point for a byte-slice argument.
// It loads the data pointer, length and sought byte from the frame,
// points R8 at the result slot, and tail-jumps to indexbytebody.
TEXT	·IndexByte(SB), NOSPLIT, $0-40
	MOVQ b_base+0(FP), SI
	MOVQ b_len+8(FP), BX
	MOVB c+24(FP), AL
	LEAQ ret+32(FP), R8
	JMP  indexbytebody<>(SB)

// ·IndexByteString is the entry point for a string argument.
// Same register setup as ·IndexByte, with the frame offsets of the
// string form (c at 16, result at 24).
TEXT	·IndexByteString(SB), NOSPLIT, $0-32
	MOVQ s_base+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL
	LEAQ ret+24(FP), R8
	JMP  indexbytebody<>(SB)

	// Provide direct access to these functions from other packages.
	// This is the equivalent of doing:
	//     package bytes
	//     func IndexByte(b []byte, c byte) int {
	//         return bytealg.IndexByte(b, c)
	//     }
	// but involves no call overhead.
	// TODO: remove this hack when midstack inlining is enabled?
TEXT	bytes·IndexByte(SB), NOSPLIT, $0-40
	// Same argument layout as ·IndexByte, so its args stack map is reused.
	FUNCDATA $0, ·IndexByte·args_stackmap(SB)
	MOVQ b_base+0(FP), SI
	MOVQ b_len+8(FP), BX
	MOVB c+24(FP), AL
	LEAQ ret+32(FP), R8
	JMP  indexbytebody<>(SB)

TEXT	strings·IndexByte(SB), NOSPLIT, $0-32
	// Same argument layout as ·IndexByteString, so its args stack map is reused.
	FUNCDATA $0, ·IndexByteString·args_stackmap(SB)
	MOVQ s_base+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL
	LEAQ ret+24(FP), R8
	JMP  indexbytebody<>(SB)

// indexbytebody is the shared search routine behind all four entry points.
// It stores the index of the first byte equal to AL, or -1 if there is
// none, at the address in R8.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// Dispatch on length:
//   len < 16       -> small (one 16-byte load, bounds-checked afterwards)
//   16 <= len <= 32 -> sse  (16-byte chunks plus an overlapping tail chunk)
//   len > 32       -> avx2 (32-byte chunks) when the CPU flag is set,
//                     otherwise sse
TEXT	indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	// Only the low byte of AX matters: the two unpacks replicate it into
	// the low four bytes and PSHUFL $0 copies that dword to all four lanes.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	// DI is the chunk cursor for both the SSE and AVX2 loops.
	MOVQ SI, DI

	CMPQ BX, $32
	JA avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	// BSFL sets ZF when its source is zero, i.e. when nothing matched.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ $-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ SI, DI	// Compute offset of chunk within data.
	ADDQ DX, DI	// Add offset of byte within chunk.
	MOVQ DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	// (SI+16)&0xff0 == 0 means SI lies in the last 16 bytes of a
	// 4096-byte-aligned block, so a 16-byte load starting at SI could
	// run past the end of that block.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1 // Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	// Load the 16 bytes that END at the end of the data instead, so the
	// load starts before SI rather than running past the data. The data
	// then occupies the top BX bytes of X1, i.e. mask bits 16-BX..15.
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	// Shift left by len, then right by 16: this drops the mask bits for
	// the bytes that precede SI and leaves bit i describing data byte i.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	// Use the 32-byte loop only when the AVX2 feature byte is set;
	// otherwise fall back to the SSE loop (X0 and DI are already set up).
	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE sse
	// AX has not been overwritten on this path, so AL is still the
	// sought byte; broadcast it to all 32 bytes of Y1.
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB  X0, Y1
avx2_loop:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	// VPTEST sets ZF when Y3 is all zero, i.e. when no byte matched.
	VPTEST Y3, Y3
	JNZ avx2success
	ADDQ $32, DI
	CMPQ DI, R11
	JLT avx2_loop
	// Search the last 32-byte chunk, which may overlap chunks already
	// searched.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	// Clear the upper YMM halves before returning.
	VZEROUPPER
	MOVQ $-1, (R8)
	RET

// A 32-byte chunk loaded from DI contains the byte; Y3 holds the
// per-byte comparison result.
avx2success:
	VPMOVMSKB Y3, DX	// One mask bit per byte of the chunk.
	BSFL DX, DX	// Index of first match within the chunk.
	SUBQ SI, DI	// Offset of chunk within data.
	ADDQ DI, DX	// Index of match within data.
	MOVQ DX, (R8)
	VZEROUPPER
	RET